From 5f103003d616dcb771432626150dca3f9a689c1d Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 16 May 2023 11:20:30 +0000 Subject: [PATCH 01/87] gpu: Update kernel building to the latest changes Use now the sev.conf rather then the snp.conf. Devices can be prestend in two different way in the container (1) as vfio devices /dev/vfio/ (2) the device is managed by whataever driver in the VM kernel claims it. Fixes: #6844 Signed-off-by: Zvonko Kaiser --- .../local-build/kata-deploy-binaries.sh | 2 +- tools/packaging/kernel/build-kernel.sh | 20 ++++++++----------- .../kernel/configs/fragments/gpu/nvidia.conf | 14 ------------- 3 files changed, 9 insertions(+), 27 deletions(-) delete mode 100644 tools/packaging/kernel/configs/fragments/gpu/nvidia.conf diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index a552aed12..c170cb759 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -293,7 +293,7 @@ install_kernel_nvidia_gpu_snp() { install_kernel_helper \ "assets.kernel.snp.version" \ "kernel-nvidia-gpu-snp" \ - "-x snp -g nvidia -u ${kernel_url} -H deb" + "-x sev -g nvidia -u ${kernel_url} -H deb" } #Install GPU and TDX experimental enabled kernel asset diff --git a/tools/packaging/kernel/build-kernel.sh b/tools/packaging/kernel/build-kernel.sh index c1f89cfeb..cb755b189 100755 --- a/tools/packaging/kernel/build-kernel.sh +++ b/tools/packaging/kernel/build-kernel.sh @@ -241,7 +241,7 @@ get_kernel_frag_path() { local redefined_string="redefined" local redundant_string="redundant" - # Later, if we need to add kernel version specific subdirs in order to + # Later, if we need to add kernel version specifqic subdirs in order to # handle specific cases, then add the path definition and search/list/cat # here. 
local all_configs="${common_configs} ${arch_configs}" @@ -251,25 +251,21 @@ get_kernel_frag_path() { if [[ "${gpu_vendor}" != "" ]];then info "Add kernel config for GPU due to '-g ${gpu_vendor}'" - local gpu_configs="$(ls ${gpu_path}/${gpu_vendor}.conf)" - all_configs="${all_configs} ${gpu_configs}" # If conf_guest is set we need to update the CONFIG_LOCALVERSION # to match the suffix created in install_kata # -nvidia-gpu-{snp|tdx}, the linux headers will be named the very # same if build with make deb-pkg for TDX or SNP. + local gpu_configs=$(mktemp).conf + local gpu_subst_configs="$(ls ${gpu_path}/${gpu_vendor}.${arch_target}.conf.in)" if [[ "${conf_guest}" != "" ]];then - local gpu_cc_configs=$(mktemp).conf - local gpu_subst_configs="$(ls ${gpu_path}/${gpu_vendor}.conf.in)" - export CONF_GUEST_SUFFIX="-${conf_guest}" - envsubst <${gpu_subst_configs} >${gpu_cc_configs} - unset CONF_GUEST_SUFFIX - - all_configs="${all_configs} ${gpu_cc_configs}" else - local gpu_configs="$(ls ${gpu_path}/${gpu_vendor}.conf)" - all_configs="${all_configs} ${gpu_configs}" + export CONF_GUEST_SUFFIX="" fi + envsubst <${gpu_subst_configs} >${gpu_configs} + unset CONF_GUEST_SUFFIX + + all_configs="${all_configs} ${gpu_configs}" fi if [ "${MEASURED_ROOTFS}" == "yes" ]; then diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf b/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf deleted file mode 100644 index 883c0f3af..000000000 --- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf +++ /dev/null @@ -1,14 +0,0 @@ -# Support mmconfig PCI config space access. -# It's used to enable the MMIO access method for PCIe devices. -CONFIG_PCI_MMCONFIG=y - -# Support for loading modules. -# It is used to support loading GPU drivers. -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y - -# CRYPTO_FIPS requires this config when loading modules is enabled. 
-CONFIG_MODULE_SIG=y - -# Linux kernel version suffix -CONFIG_LOCALVERSION="-nvidia-gpu" From 211b0ab268566028e9d66c288ad837011a76fded Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 24 May 2023 09:07:20 +0000 Subject: [PATCH 02/87] gpu: Update Kernel Config Newer drivers need more symbols so lets enable them Signed-off-by: Zvonko Kaiser --- tools/packaging/kernel/build-kernel.sh | 2 +- .../packaging/kernel/configs/fragments/gpu/nvidia.conf.in | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kernel/build-kernel.sh b/tools/packaging/kernel/build-kernel.sh index cb755b189..9f2dfd607 100755 --- a/tools/packaging/kernel/build-kernel.sh +++ b/tools/packaging/kernel/build-kernel.sh @@ -241,7 +241,7 @@ get_kernel_frag_path() { local redefined_string="redefined" local redundant_string="redundant" - # Later, if we need to add kernel version specifqic subdirs in order to + # Later, if we need to add kernel version specific subdirs in order to # handle specific cases, then add the path definition and search/list/cat # here. 
local all_configs="${common_configs} ${arch_configs}" diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in index 73cce6173..040d87a9d 100644 --- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in +++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in @@ -12,3 +12,10 @@ CONFIG_MODULE_SIG=y # Linux kernel version suffix CONFIG_LOCALVERSION="-nvidia-gpu${CONF_GUEST_SUFFIX}" + +# Newer NVIDIA drivers need additional symbols +CONFIG_X86_MCE=y +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y + From b7932be4b6f52ef4f81086b812a542a90089d2ac Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 24 May 2023 09:22:41 +0000 Subject: [PATCH 03/87] gpu: Add Arm64 Kernel Settings For different archs we need diferent settings use ${ARCH} to choose the right fragment Signed-off-by: Zvonko Kaiser --- .../local-build/kata-deploy-binaries.sh | 4 ++-- tools/packaging/kernel/build-kernel.sh | 2 +- .../fragments/gpu/nvidia.arm64.conf.in | 24 +++++++++++++++++++ .../{nvidia.conf.in => nvidia.x86_64.conf.in} | 1 - 4 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in rename tools/packaging/kernel/configs/fragments/gpu/{nvidia.conf.in => nvidia.x86_64.conf.in} (99%) diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index c170cb759..3db8c76d0 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -288,10 +288,10 @@ install_kernel_nvidia_gpu() { #Install GPU and SNP enabled kernel asset install_kernel_nvidia_gpu_snp() { - local kernel_url="$(get_from_kata_deps assets.kernel.snp.url)" + local kernel_url="$(get_from_kata_deps assets.kernel.sev.url)" install_kernel_helper \ - 
"assets.kernel.snp.version" \ + "assets.kernel.sev.version" \ "kernel-nvidia-gpu-snp" \ "-x sev -g nvidia -u ${kernel_url} -H deb" } diff --git a/tools/packaging/kernel/build-kernel.sh b/tools/packaging/kernel/build-kernel.sh index 9f2dfd607..a89040828 100755 --- a/tools/packaging/kernel/build-kernel.sh +++ b/tools/packaging/kernel/build-kernel.sh @@ -256,7 +256,7 @@ get_kernel_frag_path() { # -nvidia-gpu-{snp|tdx}, the linux headers will be named the very # same if build with make deb-pkg for TDX or SNP. local gpu_configs=$(mktemp).conf - local gpu_subst_configs="$(ls ${gpu_path}/${gpu_vendor}.${arch_target}.conf.in)" + local gpu_subst_configs="${gpu_path}/${gpu_vendor}.${arch_target}.conf.in" if [[ "${conf_guest}" != "" ]];then export CONF_GUEST_SUFFIX="-${conf_guest}" else diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in new file mode 100644 index 000000000..b260cbc12 --- /dev/null +++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in @@ -0,0 +1,24 @@ +# Support for loading modules. +# It is used to support loading GPU drivers. +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# CRYPTO_FIPS requires this config when loading modules is enabled. 
+CONFIG_MODULE_SIG=y + +# Linux kernel version suffix +CONFIG_LOCALVERSION="-nvidia-gpu${CONF_GUEST_SUFFIX}" + +# Newer NVIDIA drivers need additional symbols +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y + + +# VFIO/IOMMU setttings +CONFIG_MMU_NOTIFIER=y +CONFIG_IOASID=y +CONFIG_IOMMU_IO_PGTABLE=y +CONFIG_IOMMU_IO_PGTABLE_LPAE=y +CONFIG_IOMMU_SVA=y +CONFIG_ARM_SMMU_V3=y +CONFIG_ARM_SMMU_V3_SVA=y diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in similarity index 99% rename from tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in rename to tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in index 040d87a9d..f07c8aec8 100644 --- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in +++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in @@ -18,4 +18,3 @@ CONFIG_X86_MCE=y CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y CONFIG_MEMORY_FAILURE=y - From 9318e022af4b2d81c36435bd13b1f6b7dff01723 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 13 Jun 2023 09:37:17 +0000 Subject: [PATCH 04/87] gpu: Add CC relates configs For the GPU CC use case we need to set several crypto algorithms. The driver relies on them in the CC case. 
Signed-off-by: Zvonko Kaiser --- .../kernel/configs/fragments/gpu/nvidia.arm64.conf.in | 5 +++++ .../kernel/configs/fragments/gpu/nvidia.x86_64.conf.in | 5 +++++ tools/packaging/kernel/kata_config_version | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in index b260cbc12..8cb9cf511 100644 --- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in +++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in @@ -22,3 +22,8 @@ CONFIG_IOMMU_IO_PGTABLE_LPAE=y CONFIG_IOMMU_SVA=y CONFIG_ARM_SMMU_V3=y CONFIG_ARM_SMMU_V3_SVA=y + +# CC related configs +CONFIG_CRYPTO_ECC=y +CONFIG_CRYPTO_ECDH=y +CONFIG_CRYPTO_ECDSA=y diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in index f07c8aec8..6ef830aab 100644 --- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in +++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in @@ -18,3 +18,8 @@ CONFIG_X86_MCE=y CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y CONFIG_MEMORY_FAILURE=y + +# CC related configs +CONFIG_CRYPTO_ECC=y +CONFIG_CRYPTO_ECDH=y +CONFIG_CRYPTO_ECDSA=y diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version index 3b20426c0..e2a9fee00 100644 --- a/tools/packaging/kernel/kata_config_version +++ b/tools/packaging/kernel/kata_config_version @@ -1 +1 @@ -108 +109 From de39fb7d3897bbb318c98ddd97805e54fa2b2625 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 19 May 2022 01:34:24 -0700 Subject: [PATCH 05/87] runtime: Add support for GPUDirect and GPUDirect RDMA PCIe topology Fixes: #4491 Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 1 + src/runtime/pkg/device/config/config.go | 48 +- src/runtime/pkg/device/drivers/utils.go 
| 15 +- src/runtime/pkg/device/drivers/vfio.go | 28 +- src/runtime/pkg/device/drivers/vfio_test.go | 2 +- .../pkg/device/manager/manager_test.go | 6 +- src/runtime/pkg/govmm/qemu/qemu.go | 108 ++++ src/runtime/pkg/govmm/qemu/qmp.go | 2 + .../pkg/hypervisors/hypervisor_state.go | 4 +- .../pkg/katautils/config-settings.go.in | 2 + src/runtime/pkg/katautils/config.go | 16 +- src/runtime/pkg/katautils/config_test.go | 1 + src/runtime/pkg/oci/utils_test.go | 58 +-- src/runtime/virtcontainers/acrn.go | 2 +- src/runtime/virtcontainers/clh.go | 21 +- src/runtime/virtcontainers/clh_test.go | 10 +- src/runtime/virtcontainers/hypervisor.go | 17 +- src/runtime/virtcontainers/kata_agent.go | 40 +- src/runtime/virtcontainers/kata_agent_test.go | 24 +- src/runtime/virtcontainers/monitor.go | 11 +- src/runtime/virtcontainers/mount.go | 26 +- src/runtime/virtcontainers/nydusd_test.go | 8 +- src/runtime/virtcontainers/persist.go | 1 + .../virtcontainers/persist/api/config.go | 4 + src/runtime/virtcontainers/qemu.go | 466 ++++++++++++------ src/runtime/virtcontainers/qemu_amd64.go | 2 +- src/runtime/virtcontainers/qemu_arch_base.go | 41 +- .../virtcontainers/qemu_arch_base_test.go | 4 +- src/runtime/virtcontainers/sandbox.go | 36 +- src/runtime/virtcontainers/sandbox_test.go | 86 ++-- .../virtcontainers/veth_endpoint_test.go | 6 +- src/runtime/virtcontainers/virtiofsd_test.go | 14 +- .../osbuilder/image-builder/image_builder.sh | 2 +- .../packaging/scripts/configure-hypervisor.sh | 4 +- 34 files changed, 696 insertions(+), 420 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index f17480aba..45ddfbf3e 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -115,6 +115,7 @@ type HypervisorInfo struct { MemorySlots uint32 PCIeRootPort uint32 ColdPlugVFIO hv.PCIePort + PCIeSwitchPort uint32 HotplugVFIOOnRootBus bool Debug bool } diff --git 
a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index dee6291ed..f3a0e879a 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -114,8 +114,8 @@ const ( // SysDevPrefix is static string of /sys/dev var SysDevPrefix = "/sys/dev" -// SysIOMMUPath is static string of /sys/kernel/iommu_groups -var SysIOMMUPath = "/sys/kernel/iommu_groups" +// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups +var SysIOMMUGroupPath = "/sys/kernel/iommu_groups" // SysBusPciDevicesPath is static string of /sys/bus/pci/devices var SysBusPciDevicesPath = "/sys/bus/pci/devices" @@ -268,14 +268,8 @@ const ( VFIOAPDeviceMediatedType ) -type VFIODev interface { - GetID() *string - GetType() VFIODeviceType - GetSysfsDev() *string -} - // VFIOPCIDev represents a VFIO PCI device used for hotplugging -type VFIOPCIDev struct { +type VFIODev struct { // ID is used to identify this drive in the hypervisor options. ID string @@ -305,44 +299,12 @@ type VFIOPCIDev struct { // IsPCIe specifies device is PCIe or PCI IsPCIe bool -} - -func (d VFIOPCIDev) GetID() *string { - return &d.ID -} - -func (d VFIOPCIDev) GetType() VFIODeviceType { - return d.Type -} - -func (d VFIOPCIDev) GetSysfsDev() *string { - return &d.SysfsDev -} - -type VFIOAPDev struct { - // ID is used to identify this drive in the hypervisor options. 
- ID string - - // sysfsdev of VFIO mediated device - SysfsDev string // APDevices are the Adjunct Processor devices assigned to the mdev APDevices []string - // Type of VFIO device - Type VFIODeviceType -} - -func (d VFIOAPDev) GetID() *string { - return &d.ID -} - -func (d VFIOAPDev) GetType() VFIODeviceType { - return d.Type -} - -func (d VFIOAPDev) GetSysfsDev() *string { - return &d.SysfsDev + // Rank identifies a device in a IOMMU group + Rank int } // RNGDev represents a random number generator device diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index bfffa31a2..19ac99c30 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry { return api.DeviceLogger() } -// Identify PCIe device by reading the size of the PCI config space +// IsPCIeDevice Identify PCIe device by reading the size of the PCI config space // Plain PCI device have 256 bytes of config space where PCIe devices have 4K -func isPCIeDevice(bdf string) bool { +func IsPCIeDevice(bdf string) bool { if len(strings.Split(bdf, ":")) == 2 { bdf = PCIDomain + ":" + bdf } @@ -164,7 +164,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme vfioDevs := []*config.VFIODev{} vfioGroup := filepath.Base(device.HostPath) - iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices") + iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices") deviceFiles, err := os.ReadDir(iommuDevicesPath) if err != nil { @@ -174,7 +174,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme // Pass all devices in iommu group for i, deviceFile := range deviceFiles { //Get bdf of device eg 0000:00:1c.0 - deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath) + deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), 
iommuDevicesPath) if err != nil { return nil, err } @@ -196,15 +196,16 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - isPCIe := isPCIeDevice(deviceBDF) + isPCIe := IsPCIeDevice(deviceBDF) // Do not directly assign to `vfio` -- need to access field still - vfioPCI := config.VFIOPCIDev{ + vfioPCI := config.VFIODev{ ID: id, Type: vfioDeviceType, BDF: deviceBDF, SysfsDev: deviceSysfsDev, IsPCIe: isPCIe, Class: pciClass, + Rank: -1, } if isPCIe && !ignoreBusAssignment { vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) @@ -216,7 +217,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme if err != nil { return nil, err } - vfio = config.VFIOAPDev{ + vfio = config.VFIODev{ ID: id, SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 106220dcf..4231f0d30 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -33,6 +33,9 @@ const ( ) var ( + // AllPCIeDevs deduces the correct bus number. The BDF keeps track that + // we're not accounting for the very same device, if a user provides the + // devices multiple times. 
AllPCIeDevs = map[string]bool{} ) @@ -169,23 +172,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) { for _, dev := range ds.VFIODevs { var vfio config.VFIODev - vfioDeviceType := (*device.VfioDevs[0]).GetType() - switch vfioDeviceType { + switch dev.Type { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - bdf := "" - if pciDev, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDev.BDF - } - vfio = config.VFIOPCIDev{ - ID: *(*dev).GetID(), - Type: config.VFIODeviceType((*dev).GetType()), - BDF: bdf, - SysfsDev: *(*dev).GetSysfsDev(), + vfio = config.VFIODev{ + ID: dev.ID, + Type: config.VFIODeviceType(dev.Type), + BDF: dev.BDF, + SysfsDev: dev.SysfsDev, } case config.VFIOAPDeviceMediatedType: - vfio = config.VFIOAPDev{ - ID: *(*dev).GetID(), - SysfsDev: *(*dev).GetSysfsDev(), + vfio = config.VFIODev{ + ID: dev.ID, + SysfsDev: dev.SysfsDev, } default: deviceLogger().WithError( @@ -200,7 +198,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) { // It should implement GetAttachCount() and DeviceID() as api.Device implementation // here it shares function from *GenericDevice so we don't need duplicate codes -func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) { +func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) { sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName) vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr) if err != nil { diff --git a/src/runtime/pkg/device/drivers/vfio_test.go b/src/runtime/pkg/device/drivers/vfio_test.go index 6a1ab61eb..2ded0f850 100644 --- a/src/runtime/pkg/device/drivers/vfio_test.go +++ b/src/runtime/pkg/device/drivers/vfio_test.go @@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) { } for _, d := range data { - deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "") + 
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "") switch vfioDeviceType { case config.VFIOPCIDeviceNormalType: diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index 49e339f60..70c76b67d 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) { _, err = os.Create(deviceConfigFile) assert.Nil(t, err) - savedIOMMUPath := config.SysIOMMUPath - config.SysIOMMUPath = tmpDir + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir savedSysBusPciDevicesPath := config.SysBusPciDevicesPath config.SysBusPciDevicesPath = devicesDir defer func() { - config.SysIOMMUPath = savedIOMMUPath + config.SysIOMMUGroupPath = savedIOMMUPath config.SysBusPciDevicesPath = savedSysBusPciDevicesPath }() diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 06d7e5895..7b4b9037f 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -123,6 +123,14 @@ const ( // PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port. PCIeRootPort DeviceDriver = "pcie-root-port" + // PCIeSwitchUpstreamPort is a PCIe switch upstream port + // A upstream port connects to a PCIe Root Port + PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream" + + // PCIeSwitchDownstreamPort is a PCIe switch downstream port + // PCIe devices can be hot-plugged to the downstream port. + PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream" + // Loader is the Loader device driver. 
Loader DeviceDriver = "loader" @@ -1681,6 +1689,106 @@ func (b PCIeRootPortDevice) Valid() bool { return true } +// PCIeSwitchUpstreamPortDevice is the port connecting to the root port +type PCIeSwitchUpstreamPortDevice struct { + ID string // format: sup{n}, n>=0 + Bus string // default is rp0 +} + +// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice. +func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string { + var qemuParams []string + var deviceParams []string + + driver := PCIeSwitchUpstreamPort + + deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID)) + deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus)) + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + return qemuParams +} + +// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete. +func (b PCIeSwitchUpstreamPortDevice) Valid() bool { + if b.ID == "" { + return false + } + if b.Bus == "" { + return false + } + return true +} + +// PCIeSwitchDownstreamPortDevice is the port connecting to the root port +type PCIeSwitchDownstreamPortDevice struct { + ID string // format: sup{n}, n>=0 + Bus string // default is rp0 + Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00 + Slot string // >=0, default is 0x00 + // This to work needs patches to QEMU + BusReserve string + // Pref64 and Pref32 are not allowed to be set simultaneously + Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit + Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit + MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only* + IOReserve string // IO reservation + +} + +// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice. 
+func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string { + var qemuParams []string + var deviceParams []string + driver := PCIeSwitchDownstreamPort + + deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID)) + deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus)) + deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis)) + deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot)) + if b.BusReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve)) + } + + if b.Pref64Reserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve)) + } + + if b.Pref32Reserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve)) + } + + if b.MemReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve)) + } + + if b.IOReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve)) + } + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + return qemuParams +} + +// Valid returns true if the PCIeSwitchUpstremPortDevice structure is valid and complete. +func (b PCIeSwitchDownstreamPortDevice) Valid() bool { + if b.ID == "" { + return false + } + if b.Bus == "" { + return false + } + if b.Chassis == "" { + return false + } + if b.Slot == "" { + return false + } + return true +} + // VFIODevice represents a qemu vfio device meant for direct access by guest OS. 
type VFIODevice struct { // Bus-Device-Function of device diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 6b020103d..5630ff9de 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -656,6 +656,7 @@ func (q *QMP) executeCommand(ctx context.Context, name string, args map[string]i filter *qmpEventFilter) error { _, err := q.executeCommandWithResponse(ctx, name, args, nil, filter) + return err } @@ -1191,6 +1192,7 @@ func (q *QMP) ExecutePCIVFIODeviceAdd(ctx context.Context, devID, bdf, addr, bus if bus != "" { args["bus"] = bus } + return q.executeCommand(ctx, "device_add", args, nil) } diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index 482b7e9e2..735ac089b 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -58,7 +58,6 @@ func (p PCIePort) String() string { type HypervisorState struct { BlockIndexMap map[int]struct{} - // Type of hypervisor, E.g. qemu/firecracker/acrn. 
Type string UUID string @@ -68,6 +67,7 @@ type HypervisorState struct { // Belows are qemu specific // Refs: virtcontainers/qemu.go:QemuState Bridges []Bridge + // HotpluggedCPUs is the list of CPUs that were hot-added HotpluggedVCPUs []CPUDevice @@ -75,6 +75,8 @@ type HypervisorState struct { VirtiofsDaemonPid int Pid int PCIeRootPort int + PCIeSwitchPort int ColdPlugVFIO PCIePort + HotPlugVFIO PCIePort HotplugVFIOOnRootBus bool } diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 139d54826..0de02ce40 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -109,3 +109,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" var defaultRuntimeConfiguration = "@CONFIG_PATH@" const defaultColdPlugVFIO = hv.NoPort +const defaultHotPlugVFIO = hv.BridgePort +const defaultPCIeSwitchPort = 0 diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 6b08f4afe..e38526fae 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -76,7 +76,6 @@ type factory struct { VMCacheNumber uint `toml:"vm_cache_number"` Template bool `toml:"enable_template"` } - type hypervisor struct { Path string `toml:"path"` JailerPath string `toml:"jailer_path"` @@ -131,6 +130,7 @@ type hypervisor struct { MemSlots uint32 `toml:"memory_slots"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` + PCIeSwitchPort uint32 `toml:"pcie_switch_port"` PCIeRootPort uint32 `toml:"pcie_root_port"` NumVCPUs int32 `toml:"default_vcpus"` BlockDeviceCacheSet bool `toml:"block_device_cache_set"` @@ -149,6 +149,7 @@ type hypervisor struct { EnableIOThreads bool `toml:"enable_iothreads"` DisableImageNvdimm bool `toml:"disable_image_nvdimm"` HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + HotPlugVFIO hv.PCIePort `toml:"hotplug_vfio"` ColdPlugVFIO 
hv.PCIePort `toml:"cold_plug_vfio"` DisableVhostNet bool `toml:"disable_vhost_net"` GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` @@ -293,6 +294,12 @@ func (h hypervisor) coldPlugVFIO() hv.PCIePort { } return h.ColdPlugVFIO } +func (h hypervisor) hotPlugVFIO() hv.PCIePort { + if h.HotPlugVFIO == "" { + return defaultHotPlugVFIO + } + return h.HotPlugVFIO +} func (h hypervisor) firmwareVolume() (string, error) { p := h.FirmwareVolume @@ -864,7 +871,9 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), + HotPlugVFIO: h.HotPlugVFIO, PCIeRootPort: h.PCIeRootPort, + PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, VhostUserStorePath: h.vhostUserStorePath(), @@ -877,7 +886,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { GuestMemoryDumpPath: h.GuestMemoryDumpPath, GuestMemoryDumpPaging: h.GuestMemoryDumpPaging, ConfidentialGuest: h.ConfidentialGuest, - SevSnpGuest: h.SevSnpGuest, GuestSwap: h.GuestSwap, Rootless: h.Rootless, LegacySerial: h.LegacySerial, @@ -1059,7 +1067,9 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), + HotPlugVFIO: h.hotPlugVFIO(), PCIeRootPort: h.PCIeRootPort, + PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: true, GuestHookPath: h.guestHookPath(), VirtioFSExtraArgs: h.VirtioFSExtraArgs, @@ -1290,6 +1300,8 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { Msize9p: defaultMsize9p, HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, ColdPlugVFIO: defaultColdPlugVFIO, + HotPlugVFIO: defaultHotPlugVFIO, + PCIeSwitchPort: defaultPCIeSwitchPort, PCIeRootPort: defaultPCIeRootPort, GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, diff 
--git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 171f011b8..fb506ba6a 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -566,6 +566,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VirtioFSCache: defaultVirtioFSCacheMode, + HotPlugVFIO: defaultHotPlugVFIO, BlockDeviceAIO: defaultBlockDeviceAIO, DisableGuestSeLinux: defaultDisableGuestSeLinux, ColdPlugVFIO: defaultColdPlugVFIO, diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 7b2c10f2f..f3899f1b8 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -821,22 +821,22 @@ func TestRegexpContains(t *testing.T) { //nolint: govet type testData struct { - regexps []string toMatch string + regexps []string expected bool } data := []testData{ - {[]string{}, "", false}, - {[]string{}, "nonempty", false}, - {[]string{"simple"}, "simple", true}, - {[]string{"simple"}, "some_simple_text", true}, - {[]string{"simple"}, "simp", false}, - {[]string{"one", "two"}, "one", true}, - {[]string{"one", "two"}, "two", true}, - {[]string{"o*"}, "oooo", true}, - {[]string{"o*"}, "oooa", true}, - {[]string{"^o*$"}, "oooa", false}, + {regexps: []string{}, toMatch: "", expected: false}, + {regexps: []string{}, toMatch: "nonempty", expected: false}, + {regexps: []string{"simple"}, toMatch: "simple", expected: true}, + {regexps: []string{"simple"}, toMatch: "some_simple_text", expected: true}, + {regexps: []string{"simple"}, toMatch: "simp", expected: false}, + {regexps: []string{"one", "two"}, toMatch: "one", expected: true}, + {regexps: []string{"one", "two"}, toMatch: "two", expected: true}, + {regexps: []string{"o*"}, toMatch: "oooo", expected: true}, + {regexps: []string{"o*"}, toMatch: "oooa", expected: true}, + {regexps: []string{"^o*$"}, toMatch: "oooa", expected: false}, } for 
_, d := range data { @@ -850,25 +850,25 @@ func TestCheckPathIsInGlobs(t *testing.T) { //nolint: govet type testData struct { - globs []string toMatch string + globs []string expected bool } data := []testData{ - {[]string{}, "", false}, - {[]string{}, "nonempty", false}, - {[]string{"simple"}, "simple", false}, - {[]string{"simple"}, "some_simple_text", false}, - {[]string{"/bin/ls"}, "/bin/ls", true}, - {[]string{"/bin/ls", "/bin/false"}, "/bin/ls", true}, - {[]string{"/bin/ls", "/bin/false"}, "/bin/false", true}, - {[]string{"/bin/ls", "/bin/false"}, "/bin/bar", false}, - {[]string{"/bin/*ls*"}, "/bin/ls", true}, - {[]string{"/bin/*ls*"}, "/bin/false", true}, - {[]string{"bin/ls"}, "/bin/ls", false}, - {[]string{"./bin/ls"}, "/bin/ls", false}, - {[]string{"*/bin/ls"}, "/bin/ls", false}, + {globs: []string{}, toMatch: "", expected: false}, + {globs: []string{}, toMatch: "nonempty", expected: false}, + {globs: []string{"simple"}, toMatch: "simple", expected: false}, + {globs: []string{"simple"}, toMatch: "some_simple_text", expected: false}, + {globs: []string{"/bin/ls"}, toMatch: "/bin/ls", expected: true}, + {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/ls", expected: true}, + {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/false", expected: true}, + {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/bar", expected: false}, + {globs: []string{"/bin/*ls*"}, toMatch: "/bin/ls", expected: true}, + {globs: []string{"/bin/*ls*"}, toMatch: "/bin/false", expected: true}, + {globs: []string{"bin/ls"}, toMatch: "/bin/ls", expected: false}, + {globs: []string{"./bin/ls"}, toMatch: "/bin/ls", expected: false}, + {globs: []string{"*/bin/ls"}, toMatch: "/bin/ls", expected: false}, } for _, d := range data { @@ -923,10 +923,10 @@ func TestParseAnnotationUintConfiguration(t *testing.T) { // nolint: govet testCases := []struct { - annotations map[string]string - expected uint64 err error + annotations map[string]string validFunc func(uint64) error + 
expected uint64 }{ { annotations: map[string]string{key: ""}, @@ -1007,10 +1007,10 @@ func TestParseAnnotationBoolConfiguration(t *testing.T) { // nolint: govet testCases := []struct { + err error annotationKey string annotationValueList []string expected bool - err error }{ { annotationKey: boolKey, @@ -1207,8 +1207,8 @@ func TestNewMount(t *testing.T) { assert := assert.New(t) testCases := []struct { - out vc.Mount in specs.Mount + out vc.Mount }{ { in: specs.Mount{ diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index 35c71a6d6..735b20019 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -45,10 +45,10 @@ type AcrnState struct { // Acrn is an Hypervisor interface implementation for the Linux acrn hypervisor. type Acrn struct { - sandbox *Sandbox ctx context.Context arch acrnArch store persistapi.PersistDriver + sandbox *Sandbox id string acrnConfig Config config HypervisorConfig diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 6ae99d673..cec2e1c63 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -246,15 +246,15 @@ func (s *CloudHypervisorState) reset() { } type cloudHypervisor struct { + vmconfig chclient.VmConfig console console.Console virtiofsDaemon VirtiofsDaemon - APIClient clhClient ctx context.Context - id string + APIClient clhClient netDevices *[]chclient.NetConfig devicesIds map[string]string netDevicesFiles map[string][]*os.File - vmconfig chclient.VmConfig + id string state CloudHypervisorState config HypervisorConfig stopped int32 @@ -860,12 +860,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { defer cancel() // Create the clh device config via the constructor to ensure default values are properly assigned - clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev()) + clhDevice := *chclient.NewDeviceConfig(device.SysfsDev) pciInfo, _, err := 
cl.VmAddDevicePut(ctx, clhDevice) if err != nil { return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } - clh.devicesIds[*(*device).GetID()] = pciInfo.GetId() + clh.devicesIds[device.ID] = pciInfo.GetId() // clh doesn't use bridges, so the PCI path is simply the slot // number of the device. This will break if clh starts using @@ -882,14 +882,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) } - guestPciPath, err := types.PciPathFromString(tokens[0]) - - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { + if device.Type == config.VFIOAPDeviceMediatedType { return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device) } - pciDevice.GuestPciPath = guestPciPath - *device = pciDevice + + device.GuestPciPath, err = types.PciPathFromString(tokens[0]) return err } @@ -933,7 +930,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int case BlockDev: deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index) case VfioDev: - deviceID = *devInfo.(config.VFIODev).GetID() + deviceID = devInfo.(*config.VFIODev).ID default: clh.Logger().WithFields(log.Fields{"devInfo": devInfo, "deviceType": devType}).Error("HotplugRemoveDevice: unsupported device") diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index b5c800e95..0ab7ef5b9 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -188,13 +188,13 @@ func TestCloudHypervisorAddNetCheckEnpointTypes(t *testing.T) { } // nolint: govet tests := []struct { - name string args args + name string wantErr bool }{ - {"TapEndpoint", args{e: &TapEndpoint{}}, true}, - {"Empty VethEndpoint", args{e: &VethEndpoint{}}, true}, - {"Valid VethEndpoint", args{e: validVeth}, false}, + {name: "TapEndpoint", args: args{e: &TapEndpoint{}}, wantErr: 
true}, + {name: "Empty VethEndpoint", args: args{e: &VethEndpoint{}}, wantErr: true}, + {name: "Valid VethEndpoint", args: args{e: validVeth}, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) { _, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev) assert.NoError(err, "Hotplug remove block device expected no error") - _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev) + _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev) assert.NoError(err, "Hotplug remove vfio block device expected no error") _, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 0a490ef57..73dbc6656 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -505,13 +505,24 @@ type HypervisorConfig struct { // MemOffset specifies memory space for nvdimm device MemOffset uint64 - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 + // RawDevices are used to get PCIe device info early before the sandbox + // is started to make better PCIe topology decisions + RawDevices []config.DeviceInfo + + // HotPlugVFIO is used to indicate if devices need to be hotplugged on the + // root port or a switch + HotPlugVFIO hv.PCIePort // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port ColdPlugVFIO hv.PCIePort + // PCIeSwitchPort is used to indicate the number of PCIe Switch devices + // The PCIe Switch Port device is used to hot-plug PCIe devices + PCIeSwitchPort uint32 + + // PCIeRootPort is used to indicate the number of PCIe Root Port devices + // The PCIe Root Port device is used to 
hot-plug the PCIe device + PCIeRootPort uint32 // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 327f57343..ed2be337c 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -284,23 +284,16 @@ type KataAgentState struct { // nolint: govet type kataAgent struct { - ctx context.Context - vmSocket interface{} - - client *kataclient.AgentClient - - // lock protects the client pointer - sync.Mutex - - state KataAgentState - + ctx context.Context + vmSocket interface{} + client *kataclient.AgentClient reqHandlers map[string]reqFunc + state KataAgentState kmodules []string - + sync.Mutex dialTimout uint32 - - keepConn bool - dead bool + keepConn bool + dead bool } func (k *kataAgent) Logger() *logrus.Entry { @@ -1137,7 +1130,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * ContainerPath: dev.ContainerPath, Type: kataVfioPciDevType, Id: groupNum, - Options: nil, + Options: make([]string, len(devList)), } // We always pass the device information to the agent, since @@ -1147,16 +1140,14 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel { kataDevice.Type = kataVfioPciGuestKernelDevType } - - if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType { - kataDevice.Type = kataVfioApDevType - kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices - } else { - kataDevice.Options = make([]string, len(devList)) - for i, device := range devList { - pciDevice := (*device).(config.VFIOPCIDev) - kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath) + for i, dev := range devList { + if dev.Type == config.VFIOAPDeviceMediatedType { + kataDevice.Type = kataVfioApDevType + kataDevice.Options = dev.APDevices + } else { + kataDevice.Options[i] = 
fmt.Sprintf("0000:%s=%s", dev.BDF, dev.GuestPciPath) } + } return kataDevice @@ -1342,7 +1333,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co if _, err = k.sendReq(ctx, req); err != nil { return nil, err } - return buildProcessFromExecID(req.ExecId) } diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index c7fa059dc..8fba81837 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -235,10 +235,10 @@ func TestHandleDeviceBlockVolume(t *testing.T) { // nolint: govet tests := []struct { - BlockDeviceDriver string - inputMount Mount inputDev *drivers.BlockDevice resultVol *pb.Storage + BlockDeviceDriver string + inputMount Mount }{ { inputDev: &drivers.BlockDevice{ @@ -1024,10 +1024,10 @@ func TestKataAgentKernelParams(t *testing.T) { // nolint: govet type testData struct { + expectedParams []Param + containerPipeSize uint32 debug bool trace bool - containerPipeSize uint32 - expectedParams []Param } debugParam := Param{Key: "agent.log", Value: "debug"} @@ -1036,28 +1036,28 @@ func TestKataAgentKernelParams(t *testing.T) { containerPipeSizeParam := Param{Key: vcAnnotations.ContainerPipeSizeKernelParam, Value: "2097152"} data := []testData{ - {false, false, 0, []Param{}}, + {debug: false, trace: false, containerPipeSize: 0, expectedParams: []Param{}}, // Debug - {true, false, 0, []Param{debugParam}}, + {debug: true, trace: false, containerPipeSize: 0, expectedParams: []Param{debugParam}}, // Tracing - {false, true, 0, []Param{traceParam}}, + {debug: false, trace: true, containerPipeSize: 0, expectedParams: []Param{traceParam}}, // Debug + Tracing - {true, true, 0, []Param{debugParam, traceParam}}, + {debug: true, trace: true, containerPipeSize: 0, expectedParams: []Param{debugParam, traceParam}}, // pipesize - {false, false, 2097152, []Param{containerPipeSizeParam}}, + {debug: false, trace: false, containerPipeSize: 
2097152, expectedParams: []Param{containerPipeSizeParam}}, // Debug + pipesize - {true, false, 2097152, []Param{debugParam, containerPipeSizeParam}}, + {debug: true, trace: false, containerPipeSize: 2097152, expectedParams: []Param{debugParam, containerPipeSizeParam}}, // Tracing + pipesize - {false, true, 2097152, []Param{traceParam, containerPipeSizeParam}}, + {debug: false, trace: true, containerPipeSize: 2097152, expectedParams: []Param{traceParam, containerPipeSizeParam}}, // Debug + Tracing + pipesize - {true, true, 2097152, []Param{debugParam, traceParam, containerPipeSizeParam}}, + {debug: true, trace: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, traceParam, containerPipeSizeParam}}, } for i, d := range data { diff --git a/src/runtime/virtcontainers/monitor.go b/src/runtime/virtcontainers/monitor.go index ae7843bea..75e06fc8f 100644 --- a/src/runtime/virtcontainers/monitor.go +++ b/src/runtime/virtcontainers/monitor.go @@ -22,15 +22,12 @@ var monitorLog = virtLog.WithField("subsystem", "virtcontainers/monitor") // nolint: govet type monitor struct { - watchers []chan error - sandbox *Sandbox - - wg sync.WaitGroup - sync.Mutex - + sandbox *Sandbox stopCh chan bool + watchers []chan error checkInterval time.Duration - + wg sync.WaitGroup + sync.Mutex running bool } diff --git a/src/runtime/virtcontainers/mount.go b/src/runtime/virtcontainers/mount.go index 6c2e20420..acf4f05f6 100644 --- a/src/runtime/virtcontainers/mount.go +++ b/src/runtime/virtcontainers/mount.go @@ -239,40 +239,32 @@ func evalMountPath(source, destination string) (string, string, error) { // Mount describes a container mount. // nolint: govet type Mount struct { + // FSGroup a group ID that the group ownership of the files for the mounted volume + // will need to be changed when set. + FSGroup *int // Source is the source of the mount. Source string // Destination is the destination of the mount (within the container). 
Destination string - - // Type specifies the type of filesystem to mount. - Type string - // HostPath used to store host side bind mount path HostPath string - // GuestDeviceMount represents the path within the VM that the device // is mounted. Only relevant for block devices. This is tracked in the event // runtime wants to query the agent for mount stats. GuestDeviceMount string - // BlockDeviceID represents block device that is attached to the // VM in case this mount is a block device file or a directory // backed by a block device. BlockDeviceID string - - // Options list all the mount options of the filesystem. - Options []string - - // ReadOnly specifies if the mount should be read only or not - ReadOnly bool - - // FSGroup a group ID that the group ownership of the files for the mounted volume - // will need to be changed when set. - FSGroup *int - + // Type specifies the type of filesystem to mount. + Type string // FSGroupChangePolicy specifies the policy that will be used when applying // group id ownership change for a volume. FSGroupChangePolicy volume.FSGroupChangePolicy + // Options list all the mount options of the filesystem. 
+ Options []string + // ReadOnly specifies if the mount should be read only or not + ReadOnly bool } func isSymlink(path string) bool { diff --git a/src/runtime/virtcontainers/nydusd_test.go b/src/runtime/virtcontainers/nydusd_test.go index a8ec6dc9b..712113015 100644 --- a/src/runtime/virtcontainers/nydusd_test.go +++ b/src/runtime/virtcontainers/nydusd_test.go @@ -57,13 +57,13 @@ func TestNydusdStart(t *testing.T) { // nolint: govet tests := []struct { - name string fields fields + name string wantErr bool }{ - {"empty config", fields{}, true}, - {"directory source path not exist", SourcePathNoExist, true}, - {"valid config", validConfig, false}, + {name: "empty config", fields: fields{}, wantErr: true}, + {name: "directory source path not exist", fields: SourcePathNoExist, wantErr: true}, + {name: "valid config", fields: validConfig, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index cbba44e60..574270c80 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -246,6 +246,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort, + PCIeSwitchPort: sconfig.HypervisorConfig.PCIeSwitchPort, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index e4facc6b9..6457478c5 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -135,6 +135,10 @@ type HypervisorConfig struct { // The PCIe Root Port device is used to 
hot-plug the PCIe device PCIeRootPort uint32 + // PCIeSwitchPort is used to indicate the number of PCIe Switch Downstream Port + // devices. The PCIe Switch Downstream Port is used to hot-plug the PCIe devices. + PCIeSwitchPort uint32 + // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index d856490e5..05476ae85 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -75,13 +75,14 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { - UUID string - Bridges []types.Bridge - // HotpluggedCPUs is the list of CPUs that were hot-added + UUID string + HotPlugVFIO hv.PCIePort + Bridges []types.Bridge HotpluggedVCPUs []hv.CPUDevice HotpluggedMemory int VirtiofsDaemonPid int PCIeRootPort int + PCIeSwitchPort int HotplugVFIOOnRootBus bool ColdPlugVFIO hv.PCIePort } @@ -286,6 +287,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.PCIeRootPort = int(q.config.PCIeRootPort) + q.state.PCIeSwitchPort = int(q.config.PCIeSwitchPort) + q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, // we have to create it since the sandbox has not created it yet. @@ -701,25 +704,11 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi } } - // Add PCIe Root Port devices to hypervisor - // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port. 
- // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt - memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory() - - if hypervisorConfig.PCIeRootPort > 0 { - qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit) - } - - // The default OVMF MMIO aperture is too small for some PCIe devices - // with huge BARs so we need to increase it. - // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string - if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { - pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) - fwCfg := govmmQemu.FwCfg{ - Name: "opt/ovmf/X-PciMmio64Mb", - Str: pciMmio64Mb, + if machine.Type == QemuQ35 { + if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { + q.Logger().WithError(err).Errorf("Cannot create PCIe topology") + return err } - qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } q.qemuConfig = qemuConfig @@ -747,6 +736,104 @@ func (q *qemu) checkBpfEnabled() { } } +// If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need +// to hotplug 32 devices. We do not have enough PCIe root bus slots to +// accomplish this task. Kata will use already some slots for vfio-xxxx-pci +// devices. +// Max PCI slots per root bus is 32 +// Max PCIe root ports is 16 +// Max PCIe switch ports is 16 +// There is only 64kB of IO memory each root,switch port will consume 4k hence +// only 16 ports possible. 
+func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error { + // We do not need to do anything if we want to hotplug a VFIO to a + // pcie-pci-bridge, just return + if hypervisorConfig.HotPlugVFIO == hv.BridgePort { + return nil + } + // Add PCIe Root Port or PCIe Switches to the hypervisor + // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged + // into a PCIe Root Port or PCIe Switch. + // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt + + // Deduce the right values for mem-reserve and pref-64-reserve memory regions + memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory() + + // The default OVMF MMIO aperture is too small for some PCIe devices + // with huge BARs so we need to increase it. + // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string + if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { + pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) + fwCfg := govmmQemu.FwCfg{ + Name: "opt/ovmf/X-PciMmio64Mb", + Str: pciMmio64Mb, + } + qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) + } + + // Get the number of hotpluggable ports needed from the provided devices + var numOfHotPluggablePorts uint32 = 0 + for _, dev := range hypervisorConfig.RawDevices { + hostPath, _ := config.GetHostPath(dev, false, "") + if hostPath == "" { + continue + } + + vfioNumberOrContainer := filepath.Base(hostPath) + // If we want to have VFIO inside of the VM we need to passthrough + // additionally /dev/vfio/vfio which is the VFIO "container". + // Ignore it and handle the remaining devices. 
+ if strings.Compare(vfioNumberOrContainer, "vfio") == 0 { + continue + } + iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioNumberOrContainer, "devices") + deviceFiles, err := os.ReadDir(iommuDevicesPath) + if err != nil { + return err + } + + for _, deviceFile := range deviceFiles { + deviceBDF, _, _, err := drivers.GetVFIODetails(deviceFile.Name(), iommuDevicesPath) + if err != nil { + return err + } + if drivers.IsPCIeDevice(deviceBDF) { + numOfHotPluggablePorts = numOfHotPluggablePorts + 1 + } + } + } + + // If number of PCIe root ports > 16 then bail out otherwise we may + // use up all slots or IO memory on the root bus and vfio-XXX-pci devices + // cannot be added which are crucial for Kata max slots on root bus is 32 + // max slots on the complete pci(e) topology is 256 in QEMU + if hypervisorConfig.PCIeRootPort > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceed allowed max of %d", maxPCIeRootPort) + } + if hypervisorConfig.PCIeSwitchPort > maxPCIeSwitchPort { + return fmt.Errorf("Number of PCIe Switch Ports exceed allowed max of %d", maxPCIeSwitchPort) + } + + // If the user provided more root ports than we have detected + // use the user provided number of PCIe root ports + if numOfHotPluggablePorts < hypervisorConfig.PCIeRootPort { + numOfHotPluggablePorts = hypervisorConfig.PCIeRootPort + } + // If the user provided more switch ports than we have detected + // use the user provided number of PCIe switch ports + if numOfHotPluggablePorts < hypervisorConfig.PCIeSwitchPort { + numOfHotPluggablePorts = hypervisorConfig.PCIeSwitchPort + } + + if q.state.HotPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfHotPluggablePorts, memSize32bit, memSize64bit) + } + if q.state.HotPlugVFIO == hv.SwitchPort { + qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfHotPluggablePorts, memSize32bit, memSize64bit) + } + 
return nil +} + func (q *qemu) vhostFSSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket) } @@ -1552,10 +1639,10 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) + bridgeID := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) drivers.AllPCIeDevs[devID] = true - bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId) + bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) bridgeSlot, err := q.qomGetSlot(bridgeQomPath) if err != nil { return err @@ -1571,7 +1658,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V return err } - if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil { + if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil { return err } @@ -1685,41 +1772,122 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) { // Query QMP to find a device's PCI path given its QOM path or ID func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { - // XXX: For now we assume there's exactly one bridge, since - // that's always how we configure qemu from Kata for now. 
It - // would be good to generalize this to different PCI - // topologies + + var slots []types.PciSlot + devSlot, err := q.qomGetSlot(qemuID) if err != nil { return types.PciPath{}, err } + slots = append(slots, devSlot) - busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus") + var parentPath = qemuID + // We do not want to use a forever loop here, a deeper PCIe topology + // than 5 is already not advisable just for the sake of having enough + // buffer we limit ourselves to 10 and leave the loop early if we hit + // the root bus. + for i := 1; i <= 10; i++ { + parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") + if err != nil { + return types.PciPath{}, err + } + + busQOM, ok := parenBusQOM.(string) + if !ok { + return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM) + } + + // If we hit /machine/q35/pcie.0 we're done this is the root bus + // we climbed the complete hierarchy + if strings.Contains(busQOM, "/machine/q35/pcie.0") { + break + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI parent_bus which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(busQOM, "/") + if idx == -1 { + return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) + } + parentBus := busQOM[:idx] + + parentSlot, err := q.qomGetSlot(parentBus) + if err != nil { + return types.PciPath{}, err + } + + // Prepend the slots, since we're climbing the hierarchy + slots = append([]types.PciSlot{parentSlot}, slots...) + parentPath = parentBus + } + return types.PciPathFromSlots(slots...) +} + +func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { + + if device.IsPCIe && (q.state.PCIeRootPort <= 0) { + q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device." 
+ + "It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") + return fmt.Errorf("VFIO device is a PCIe device. Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", + q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) + } + + device.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, device.Rank) + return q.executeVFIODeviceAdd(device) +} + +func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) { + + if device.IsPCIe && (q.state.PCIeSwitchPort <= 0) { + q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device." + + "It's recommended to add the PCIe Switch Port by setting the pcie_switch_port parameter in the configuration for q35") + return fmt.Errorf("VFIO device is a PCIe device. Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", + q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) + } + device.Bus = fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, device.Rank) + return q.executeVFIODeviceAdd(device) +} + +func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) { + addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI) if err != nil { - return types.PciPath{}, err + return err } - bus, ok := busq.(string) - if !ok { - return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq) - } + defer func() { + if err != nil { + q.arch.removeDeviceFromBridge(device.ID) + } + }() + return q.executePCIVFIODeviceAdd(device, addr, bridge.ID) +} - // `bus` is the QOM path of the QOM bus object, but we need - // the PCI bridge which manages that bus. There doesn't seem - // to be a way to get that other than to simply drop the last - // path component. 
- idx := strings.LastIndex(bus, "/") - if idx == -1 { - return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus) +func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error { + switch device.Type { + case config.VFIOPCIDeviceNormalType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile) + case config.VFIOPCIDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) + case config.VFIOAPDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + default: + return fmt.Errorf("Incorrect VFIO device type found") } - bridge := bus[:idx] +} - bridgeSlot, err := q.qomGetSlot(bridge) - if err != nil { - return types.PciPath{}, err +func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { + switch device.Type { + case config.VFIOPCIDeviceNormalType: + return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile) + case config.VFIOPCIDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) + case config.VFIOAPDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + default: + return fmt.Errorf("Incorrect VFIO device type found") } - - return types.PciPathFromSlots(bridgeSlot, devSlot) } func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) { @@ -1727,109 +1895,56 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op return err } - devID := *(*device).GetID() - machineType := q.HypervisorConfig().HypervisorMachineType - if op == AddDevice { buf, _ := json.Marshal(device) q.Logger().WithFields(logrus.Fields{ - 
"machine-type": machineType, - "hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus, - "pcie-root-port": q.state.PCIeRootPort, - "device-info": string(buf), + "machine-type": q.HypervisorConfig().HypervisorMachineType, + "hotplug-vfio": q.state.HotPlugVFIO, + "pcie-root-port": q.state.PCIeRootPort, + "pcie-switch-port": q.state.PCIeSwitchPort, + "device-info": string(buf), }).Info("Start hot-plug VFIO device") - - // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus - // for pc machine type instead of bridge. This is useful for devices that require - // a large PCI BAR which is a currently a limitation with PCI bridges. - if q.state.HotplugVFIOOnRootBus { - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - // In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port. - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - switch machineType { - case QemuQ35: - if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 { - q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. 
It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") - pciDevice.Bus = "" - } - default: - pciDevice.Bus = "" - } - *device = pciDevice - - if pciDevice.Type == config.VFIOPCIDeviceNormalType { - err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile) - } else { - err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile) - } - case config.VFIOAPDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev()) - } + // In case MachineType is q35, a PCIe device is hotplugged on + // a PCIe Root Port or alternatively on a PCIe Switch Port + if q.HypervisorConfig().HypervisorMachineType != QemuQ35 { + device.Bus = "" } else { - addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) + var err error + // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus + // for pc machine type instead of bridge. This is useful for devices that require + // a large PCI BAR which is a currently a limitation with PCI bridges. 
+ if q.state.HotPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + err = q.hotplugVFIODeviceRootPort(ctx, device) + } else if q.state.HotPlugVFIO == hv.SwitchPort { + err = q.hotplugVFIODeviceSwitchPort(ctx, device) + } else { + err = q.hotplugVFIODeviceBridgePort(ctx, device) + } if err != nil { return err } - - defer func() { - if err != nil { - q.arch.removeDeviceFromBridge(devID) - } - }() - - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType: - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile) - case config.VFIOPCIDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile) - case config.VFIOAPDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev()) - default: - return fmt.Errorf("Incorrect VFIO device type found") - } - } - if err != nil { - return err - } - - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - // XXX: Depending on whether we're doing root port or - // bridge hotplug, and how the bridge is set up in - // other parts of the code, we may or may not already - // have information about the slot number of the - // bridge and or the device. 
For simplicity, just - // query both of them back from qemu - guestPciPath, err := q.qomGetPciPath(devID) - pciDevice.GuestPciPath = guestPciPath - *device = pciDevice - return err } + // XXX: Depending on whether we're doing root port or + // bridge hotplug, and how the bridge is set up in + // other parts of the code, we may or may not already + // have information about the slot number of the + // bridge and or the device. For simplicity, just + // query both of them back from qemu + device.GuestPciPath, err = q.qomGetPciPath(device.ID) return err - } else { - q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") - - if !q.state.HotplugVFIOOnRootBus { - if err := q.arch.removeDeviceFromBridge(devID); err != nil { - return err - } - } - - return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID) } + + q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") + + if !q.state.HotplugVFIOOnRootBus { + if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { + return err + } + } + + return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) + } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { @@ -2541,6 +2656,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin return devices } +// gollangci-lint enforces multi-line comments to be a block comment +// not multiple single line comments ... 
+/* pcie.0 bus
+// -------------------------------------------------
+// |
+// -------------
+// | Root Port |
+// -------------
+// -------------------------|------------------------
+// | ----------------- |
+// | PCI Express | Upstream Port | |
+// | Switch ----------------- |
+// | | | |
+// | ------------------- ------------------- |
+// | | Downstream Port | | Downstream Port | |
+// | ------------------- ------------------- |
+// -------------|-----------------------|------------
+// ------------- --------------
+// | GPU/ACCEL | | IB/ETH NIC |
+// ------------- --------------
+*/
+// genericAppendPCIeSwitchPort adds a PCIe Switch
+func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
+
+ // Q35 has the correct PCIe support,
+ // hence ignore all other machines
+ if machineType != QemuQ35 {
+ return devices
+ }
+
+ // Using our own ID for the root port, so we do not clash with already
+ // existing root ports adding "s" for switch prefix
+ pcieRootPort := govmmQemu.PCIeRootPortDevice{
+ ID: fmt.Sprintf("%s%s%d", pcieSwitchPrefix, pcieRootPortPrefix, 0),
+ Bus: defaultBridgeBus,
+ Chassis: "0",
+ Slot: strconv.FormatUint(uint64(0), 10),
+ Multifunction: false,
+ Addr: "0",
+ MemReserve: fmt.Sprintf("%dB", memSize32bit),
+ Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
+ }
+
+ devices = append(devices, pcieRootPort)
+
+ pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
+ ID: fmt.Sprintf("%s%d", pcieSwitchUpstreamPortPrefix, 0),
+ Bus: pcieRootPort.ID,
+ }
+ devices = append(devices, pcieSwitchUpstreamPort)
+
+ currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
+ if err != nil {
+ return devices
+ }
+ nextChassis := currentChassis + 1
+
+ for i := uint32(0); i < number; i++ {
+
+ pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
+ ID: fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, i),
+ Bus: 
pcieSwitchUpstreamPort.ID, + Chassis: fmt.Sprintf("%d", nextChassis), + Slot: strconv.FormatUint(uint64(i), 10), + // TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit), + // TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit), + } + devices = append(devices, pcieSwitchDownstreamPort) + } + + return devices +} + func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) { span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() @@ -2717,6 +2905,7 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.HotpluggedMemory = q.state.HotpluggedMemory s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus s.PCIeRootPort = q.state.PCIeRootPort + s.PCIeSwitchPort = q.state.PCIeSwitchPort for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2741,6 +2930,7 @@ func (q *qemu) Load(s hv.HypervisorState) { q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid q.state.PCIeRootPort = s.PCIeRootPort + q.state.PCIeSwitchPort = s.PCIeSwitchPort for _, bridge := range s.Bridges { q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index 4ecf0804a..cb99ec1ac 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -123,7 +123,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { legacySerial: config.LegacySerial, }, vmFactory: factory, - snpGuest: config.SevSnpGuest, + snpGuest: config.ConfidentialGuest, } if config.ConfidentialGuest { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 9aff1e76c..9a288d85a 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -140,6 +140,9 @@ 
type qemuArch interface { // appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device + // appendPCIeSwitch appends a ioh3420 device to a pcie-root-port + appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device + // append vIOMMU device appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error) @@ -177,13 +180,18 @@ type qemuArchBase struct { } const ( - defaultCores uint32 = 1 - defaultThreads uint32 = 1 - defaultCPUModel = "host" - defaultBridgeBus = "pcie.0" - defaultPCBridgeBus = "pci.0" - maxDevIDSize = 31 - pcieRootPortPrefix = "rp" + defaultCores uint32 = 1 + defaultThreads uint32 = 1 + defaultCPUModel = "host" + defaultBridgeBus = "pcie.0" + defaultPCBridgeBus = "pci.0" + maxDevIDSize = 31 + maxPCIeRootPort = 16 // Limitation from QEMU + maxPCIeSwitchPort = 16 // Limitation from QEMU + pcieRootPortPrefix = "rp" + pcieSwitchPrefix = "sw" + pcieSwitchUpstreamPortPrefix = "swup" + pcieSwitchDownstreamPortPrefix = "swdp" ) // This is the PCI start address assigned to the first bridge that @@ -675,17 +683,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm } func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device { - pciDevice := vfioDev.(config.VFIOPCIDev) - if pciDevice.BDF == "" { + + if vfioDev.BDF == "" { return devices } devices = append(devices, govmmQemu.VFIODevice{ - BDF: pciDevice.BDF, - VendorID: pciDevice.VendorID, - DeviceID: pciDevice.DeviceID, - Bus: pciDevice.Bus, + BDF: vfioDev.BDF, + VendorID: vfioDev.VendorID, + DeviceID: vfioDev.DeviceID, + Bus: vfioDev.Bus, }, ) @@ -801,6 +809,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, 
memSize32bit, memSize64bit) } +// appendPCIeSwitchPortDevice appends a PCIe Switch with ports +func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { + return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit) +} + +// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the +// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs. func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) { pci := nvpci.New() diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 51c11bd91..37611bb5b 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -463,7 +463,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) { }, } - vfDevice := config.VFIOPCIDev{ + vfDevice := config.VFIODev{ BDF: bdf, } @@ -483,7 +483,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) { }, } - vfDevice := config.VFIOPCIDev{ + vfDevice := config.VFIODev{ BDF: bdf, VendorID: vendorID, DeviceID: deviceID, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9bd5f402e..bda931b43 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -101,15 +101,10 @@ type HypervisorPidKey struct{} // SandboxStatus describes a sandbox status. type SandboxStatus struct { - ContainersStatus []ContainerStatus - - // Annotations allow clients to store arbitrary values, - // for example to add additional status values required - // to support particular specifications. 
- Annotations map[string]string - + Annotations map[string]string ID string Hypervisor HypervisorType + ContainersStatus []ContainerStatus State types.SandboxState HypervisorConfig HypervisorConfig } @@ -176,10 +171,8 @@ type SandboxConfig struct { // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool - // SystemdCgroup enables systemd cgroup support SystemdCgroup bool - // SandboxCgroupOnly enables cgroup only at podlevel in the host SandboxCgroupOnly bool @@ -620,6 +613,12 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor if err := validateHypervisorConfig(&sandboxConfig.HypervisorConfig); err != nil { return nil, err } + // Aggregate all the container devices and update the HV config + var devices []config.DeviceInfo + for _, ct := range sandboxConfig.Containers { + devices = append(devices, ct.DeviceInfos...) + } + sandboxConfig.HypervisorConfig.RawDevices = devices // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. 
@@ -1709,7 +1708,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { defer span.End() for i := range s.config.Containers { - c, err := newContainer(ctx, s, &s.config.Containers[i]) if err != nil { return err @@ -1728,7 +1726,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { if err := s.updateResources(ctx); err != nil { return err } - if err := s.resourceControllerUpdate(ctx); err != nil { return err } @@ -1740,7 +1737,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { if err := s.storeSandbox(ctx); err != nil { return err } - return nil } @@ -1904,15 +1900,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy // adding a group of VFIO devices for _, dev := range vfioDevices { if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil { - bdf := "" - if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDevice.BDF - } s.Logger(). WithFields(logrus.Fields{ "sandbox": s.id, - "vfio-device-ID": (*dev).GetID(), - "vfio-device-BDF": bdf, + "vfio-device-ID": dev.ID, + "vfio-device-BDF": dev.BDF, }).WithError(err).Error("failed to hotplug VFIO device") return err } @@ -1961,15 +1953,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de // remove a group of VFIO devices for _, dev := range vfioDevices { if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil { - bdf := "" - if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDevice.BDF - } s.Logger().WithError(err). 
WithFields(logrus.Fields{ "sandbox": s.id, - "vfio-device-ID": (*dev).GetID(), - "vfio-device-BDF": bdf, + "vfio-device-ID": dev.ID, + "vfio-device-BDF": dev.BDF, }).Error("failed to hot unplug VFIO device") return err } diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index de3b1885c..db1aa7f84 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) { _, err = os.Create(deviceFile) assert.Nil(t, err) - savedIOMMUPath := config.SysIOMMUPath - config.SysIOMMUPath = tmpDir + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir defer func() { - config.SysIOMMUPath = savedIOMMUPath + config.SysIOMMUGroupPath = savedIOMMUPath }() dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil) @@ -650,8 +650,8 @@ func TestSandboxCreateAssets(t *testing.T) { // nolint: govet type testData struct { - assetType types.AssetType annotations map[string]string + assetType types.AssetType } tmpfile, err := os.CreateTemp("", "virtcontainers-test-") @@ -687,50 +687,50 @@ func TestSandboxCreateAssets(t *testing.T) { data := []testData{ { - types.FirmwareAsset, - map[string]string{ + assetType: types.FirmwareAsset, + annotations: map[string]string{ annotations.FirmwarePath: filename, annotations.FirmwareHash: assetContentHash, }, }, { - types.HypervisorAsset, - map[string]string{ + assetType: types.HypervisorAsset, + annotations: map[string]string{ annotations.HypervisorPath: filename, annotations.HypervisorHash: assetContentHash, }, }, { - types.HypervisorCtlAsset, - map[string]string{ + assetType: types.HypervisorCtlAsset, + annotations: map[string]string{ annotations.HypervisorCtlPath: filename, annotations.HypervisorCtlHash: assetContentHash, }, }, { - types.ImageAsset, - map[string]string{ + assetType: types.ImageAsset, + annotations: map[string]string{ annotations.ImagePath: 
filename, annotations.ImageHash: assetContentHash, }, }, { - types.InitrdAsset, - map[string]string{ + assetType: types.InitrdAsset, + annotations: map[string]string{ annotations.InitrdPath: filename, annotations.InitrdHash: assetContentHash, }, }, { - types.JailerAsset, - map[string]string{ + assetType: types.JailerAsset, + annotations: map[string]string{ annotations.JailerPath: filename, annotations.JailerHash: assetContentHash, }, }, { - types.KernelAsset, - map[string]string{ + assetType: types.KernelAsset, + annotations: map[string]string{ annotations.KernelPath: filename, annotations.KernelHash: assetContentHash, }, @@ -1407,58 +1407,58 @@ func TestSandbox_Cgroups(t *testing.T) { // nolint: govet tests := []struct { - name string s *Sandbox + name string wantErr bool needRoot bool }{ { - "New sandbox", - &Sandbox{}, - true, - false, + name: "New sandbox", + s: &Sandbox{}, + wantErr: true, + needRoot: false, }, { - "New sandbox, new config", - &Sandbox{config: &SandboxConfig{}}, - false, - true, + name: "New sandbox, new config", + s: &Sandbox{config: &SandboxConfig{}}, + wantErr: false, + needRoot: true, }, { - "sandbox, container no sandbox type", - &Sandbox{ + name: "sandbox, container no sandbox type", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ {}, }}}, - false, - true, + wantErr: false, + needRoot: true, }, { - "sandbox, container sandbox type", - &Sandbox{ + name: "sandbox, container sandbox type", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ sandboxContainer, }}}, - false, - true, + wantErr: false, + needRoot: true, }, { - "sandbox, empty linux json", - &Sandbox{ + name: "sandbox, empty linux json", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ emptyJSONLinux, }}}, - false, - true, + wantErr: false, + needRoot: true, }, { - "sandbox, successful config", - &Sandbox{ + name: "sandbox, successful config", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ 
successfulContainer, }}}, - false, - true, + wantErr: false, + needRoot: true, }, } for _, tt := range tests { diff --git a/src/runtime/virtcontainers/veth_endpoint_test.go b/src/runtime/virtcontainers/veth_endpoint_test.go index dc6f03270..341185b00 100644 --- a/src/runtime/virtcontainers/veth_endpoint_test.go +++ b/src/runtime/virtcontainers/veth_endpoint_test.go @@ -85,16 +85,16 @@ func TestCreateVethNetworkEndpointChooseIfaceName(t *testing.T) { func TestCreateVethNetworkEndpointInvalidArgs(t *testing.T) { // nolint: govet type endpointValues struct { - idx int ifName string + idx int } assert := assert.New(t) // all elements are expected to result in failure failingValues := []endpointValues{ - {-1, "bar"}, - {-1, ""}, + {idx: -1, ifName: "bar"}, + {idx: -1, ifName: ""}, } for _, d := range failingValues { diff --git a/src/runtime/virtcontainers/virtiofsd_test.go b/src/runtime/virtcontainers/virtiofsd_test.go index a4252a2ba..55eb3fb1a 100644 --- a/src/runtime/virtcontainers/virtiofsd_test.go +++ b/src/runtime/virtcontainers/virtiofsd_test.go @@ -17,13 +17,13 @@ import ( func TestVirtiofsdStart(t *testing.T) { // nolint: govet type fields struct { - path string + ctx context.Context socketPath string cache string - extraArgs []string sourcePath string + path string + extraArgs []string PID int - ctx context.Context } sourcePath := t.TempDir() @@ -41,13 +41,13 @@ func TestVirtiofsdStart(t *testing.T) { // nolint: govet tests := []struct { - name string fields fields + name string wantErr bool }{ - {"empty config", fields{}, true}, - {"Directory socket does not exist", NoDirectorySocket, true}, - {"valid config", validConfig, false}, + {name: "empty config", fields: fields{}, wantErr: true}, + {name: "Directory socket does not exist", fields: NoDirectorySocket, wantErr: true}, + {name: "valid config", fields: validConfig, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/tools/osbuilder/image-builder/image_builder.sh 
b/tools/osbuilder/image-builder/image_builder.sh index 3e7f0babc..f88f3aac0 100755 --- a/tools/osbuilder/image-builder/image_builder.sh +++ b/tools/osbuilder/image-builder/image_builder.sh @@ -271,7 +271,7 @@ calculate_required_disk_size() { readonly image="$(mktemp)" readonly mount_dir="$(mktemp -d)" readonly max_tries=20 - readonly increment=10 + readonly increment=100 for i in $(seq 1 $max_tries); do local img_size="$((rootfs_size_mb + (i * increment)))" diff --git a/tools/packaging/scripts/configure-hypervisor.sh b/tools/packaging/scripts/configure-hypervisor.sh index 8bdfc94de..9dfc9bb32 100755 --- a/tools/packaging/scripts/configure-hypervisor.sh +++ b/tools/packaging/scripts/configure-hypervisor.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # # Copyright (c) 2018 Intel Corporation # @@ -14,6 +14,8 @@ # been specified. #--------------------------------------------------------------------- +set -x + script_name=${0##*/} arch="${3:-$(uname -m)}" From da42801c38f8ee226132273a257de45fe1e38cd5 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Mon, 8 May 2023 08:00:39 +0000 Subject: [PATCH 06/87] gpu: Add config settings tests for hot-plug Updated all references and config settings for hot-plug to match cold-plug Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 5 ++++- src/runtime/cmd/kata-runtime/kata-env_test.go | 7 +++++++ src/runtime/pkg/containerd-shim-v2/create_test.go | 5 +++++ src/runtime/pkg/hypervisors/hypervisor_state.go | 10 +++++----- src/runtime/pkg/katatestutils/utils.go | 3 +++ src/runtime/pkg/katautils/config-settings.go.in | 6 ++++-- src/runtime/pkg/katautils/config.go | 15 ++++++++++----- src/runtime/pkg/katautils/config_test.go | 9 ++++++++- .../virtcontainers/documentation/api/1.0/api.md | 12 ++++++++++-- src/runtime/virtcontainers/hypervisor.go | 7 ++++--- src/runtime/virtcontainers/persist.go | 2 ++ src/runtime/virtcontainers/persist/api/config.go | 6 +++++- src/runtime/virtcontainers/qemu.go | 3 ++- 
13 files changed, 69 insertions(+), 21 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index 45ddfbf3e..e5e7e87ed 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -114,8 +114,9 @@ type HypervisorInfo struct { Msize9p uint32 MemorySlots uint32 PCIeRootPort uint32 - ColdPlugVFIO hv.PCIePort PCIeSwitchPort uint32 + HotPlugVFIO hv.PCIePort + ColdPlugVFIO hv.PCIePort HotplugVFIOOnRootBus bool Debug bool } @@ -318,9 +319,11 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { EntropySource: config.HypervisorConfig.EntropySource, SharedFS: config.HypervisorConfig.SharedFS, VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, PCIeRootPort: config.HypervisorConfig.PCIeRootPort, + PCIeSwitchPort: config.HypervisorConfig.PCIeSwitchPort, SocketPath: socketPath, }, nil } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 3760104d0..d897a0c4e 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -75,6 +75,7 @@ func createConfig(configPath string, fileData string) error { } func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) { + var hotPlugVFIO hv.PCIePort var coldPlugVFIO hv.PCIePort const logPath = "/log/path" hypervisorPath := filepath.Join(prefixDir, "hypervisor") @@ -88,6 +89,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) + pcieSwitchPort := uint32(2) + hotPlugVFIO = hv.BridgePort coldPlugVFIO = hv.NoPort disableNewNetNs := false sharedFS := "virtio-9p" @@ -132,8 +135,10 @@ func 
makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs, @@ -278,6 +283,8 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, PCIeRootPort: config.HypervisorConfig.PCIeRootPort, + PCIeSwitchPort: config.HypervisorConfig.PCIeSwitchPort, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, } diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index e3e8e9369..dc37d72dd 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -309,6 +309,7 @@ func TestCreateContainerConfigFail(t *testing.T) { } func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) { + var hotPlugVFIO hv.PCIePort var coldPlugVFIO hv.PCIePort if dir == "" { return "", fmt.Errorf("BUG: need directory") @@ -331,9 +332,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) + pcieSwitchPort := uint32(3) disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") + hotPlugVFIO = hv.BridgePort coldPlugVFIO = hv.RootPort configFileOptions := ktu.RuntimeConfigOptions{ @@ -350,9 +353,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, 
DisableNewNetNs: disableNewNetNs, SharedFS: sharedFS, VirtioFSDaemon: virtioFSdaemon, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, } diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index 735ac089b..2eed91cae 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -45,13 +45,13 @@ const ( func (p PCIePort) String() string { switch p { case RootPort: - return "root-port" + fallthrough case SwitchPort: - return "switch-port" + fallthrough case BridgePort: - return "bridge-port" + fallthrough case NoPort: - return "no-port" + return string(p) } return fmt.Sprintf("", string(p)) } @@ -76,7 +76,7 @@ type HypervisorState struct { Pid int PCIeRootPort int PCIeSwitchPort int - ColdPlugVFIO PCIePort HotPlugVFIO PCIePort + ColdPlugVFIO PCIePort HotplugVFIOOnRootBus bool } diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index 4c8257a40..ef650ea0d 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -225,6 +225,8 @@ type RuntimeConfigOptions struct { JaegerPassword string PFlash []string PCIeRootPort uint32 + PCIeSwitchPort uint32 + HotPlugVFIO hv.PCIePort ColdPlugVFIO hv.PCIePort DefaultVCPUCount uint32 DefaultMaxVCPUCount uint32 @@ -319,6 +321,7 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + ` + pcie_switch_port = ` + strconv.FormatUint(uint64(config.PCIeSwitchPort), 10) + ` cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `" msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` diff --git 
a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 0de02ce40..a6825a8a0 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -83,6 +83,7 @@ const defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 const defaultHotplugVFIOOnRootBus bool = false const defaultPCIeRootPort = 0 +const defaultPCIeSwitchPort = 0 const defaultEntropySource = "/dev/urandom" const defaultGuestHookPath string = "" const defaultVirtioFSCacheMode = "never" @@ -108,6 +109,7 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" // Default config file used by stateless systems. var defaultRuntimeConfiguration = "@CONFIG_PATH@" -const defaultColdPlugVFIO = hv.NoPort const defaultHotPlugVFIO = hv.BridgePort -const defaultPCIeSwitchPort = 0 +const defaultColdPlugVFIO = hv.NoPort + + diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index e38526fae..c0a0372df 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -149,7 +149,7 @@ type hypervisor struct { EnableIOThreads bool `toml:"enable_iothreads"` DisableImageNvdimm bool `toml:"disable_image_nvdimm"` HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` - HotPlugVFIO hv.PCIePort `toml:"hotplug_vfio"` + HotPlugVFIO hv.PCIePort `toml:"hot_plug_vfio"` ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"` DisableVhostNet bool `toml:"disable_vhost_net"` GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` @@ -870,8 +870,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), - HotPlugVFIO: h.HotPlugVFIO, PCIeRootPort: h.PCIeRootPort, PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: h.DisableVhostNet, @@ 
-1674,11 +1674,15 @@ func checkConfig(config oci.RuntimeConfig) error { return err } + hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil { return err } + if err := checkPCIeConfig(hotPlugVFIO, machineType); err != nil { + return err + } return nil } @@ -1692,12 +1696,13 @@ func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error { if machineType != "q35" { return nil } - if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort { + if vfioPort == hv.NoPort || vfioPort == hv.BridgePort || + vfioPort == hv.RootPort || vfioPort == hv.SwitchPort { return nil } - return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s", - vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort) + return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s", + vfioPort, hv.NoPort, hv.BridgePort, hv.RootPort, hv.SwitchPort) } // checkNetNsConfig performs sanity checks on disable_new_netns config. 
diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index fb506ba6a..f30a7a27d 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -71,6 +71,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf if hypervisor == "" { return config, fmt.Errorf("BUG: need hypervisor") } + var hotPlugVFIO hv.PCIePort var coldPlugVFIO hv.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") @@ -86,6 +87,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) + pcieSwitchPort := uint32(3) + hotPlugVFIO = hv.BridgePort coldPlugVFIO = hv.RootPort disableNewNetNs := false sharedFS := "virtio-9p" @@ -109,6 +112,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: defaultVCPUCount, @@ -173,6 +178,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, PCIeRootPort: pcieRootPort, + PCIeSwitchPort: pcieSwitchPort, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, Msize9p: defaultMsize9p, MemSlots: defaultMemSlots, @@ -566,9 +573,9 @@ func TestMinimalRuntimeConfig(t *testing.T) { GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VirtioFSCache: defaultVirtioFSCacheMode, - HotPlugVFIO: defaultHotPlugVFIO, BlockDeviceAIO: defaultBlockDeviceAIO, DisableGuestSeLinux: defaultDisableGuestSeLinux, + HotPlugVFIO: defaultHotPlugVFIO, ColdPlugVFIO: defaultColdPlugVFIO, } diff --git 
a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md index d3071a86f..a26e653ce 100644 --- a/src/runtime/virtcontainers/documentation/api/1.0/api.md +++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md @@ -289,11 +289,19 @@ type HypervisorConfig struct { HotplugVFIOOnRootBus bool // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device + // The PCIe Root Port device is used to hot(cold)-plug the PCIe device PCIeRootPort uint32 + // PCIeSwitchPort is used to indicate the number of PCIe Switch Ports + // The PCIe Switch port is used to hot(cold)-plug the PCIe device + PCIeSwitchPort uint32 + + // HotPlugVFIO is used to indicate if devices need to be hotplugged on the + // root port, switch, bridge or no port + HotPlugVFIO hv.PCIePort + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the - // root port, switch or no port + // root port, switch, bridge or no port ColdPlugVFIO hv.PCIePort // BootToBeTemplate used to indicate if the VM is created to be a template VM diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 73dbc6656..7d14ab448 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -516,14 +516,15 @@ type HypervisorConfig struct { // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port ColdPlugVFIO hv.PCIePort - // PCIeSwitchPort is used to indicate the number of PCIe Switch devices - // The PCIe Switch Port device is sued to hot-plug PCIe devices - PCIeSwitchPort uint32 // PCIeRootPort is used to indicate the number of PCIe Root Port devices // The PCIe Root Port device is used to hot-plug the PCIe device PCIeRootPort uint32 + // PCIeSwitchPort is used to indicate the number of PCIe Switch devices + // The PCIe Switch Port device is 
used to hot-plug PCIe devices + PCIeSwitchPort uint32 + // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 574270c80..02cdc8387 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -488,8 +488,10 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { DisableNestingChecks: hconf.DisableNestingChecks, DisableImageNvdimm: hconf.DisableImageNvdimm, HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, + HotPlugVFIO: hconf.HotPlugVFIO, ColdPlugVFIO: hconf.ColdPlugVFIO, PCIeRootPort: hconf.PCIeRootPort, + PCIeSwitchPort: hconf.PCIeSwitchPort, BootToBeTemplate: hconf.BootToBeTemplate, BootFromTemplate: hconf.BootFromTemplate, DisableVhostNet: hconf.DisableVhostNet, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 6457478c5..8cd752093 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -203,8 +203,12 @@ type HypervisorConfig struct { // root bus instead of a bridge. 
HotplugVFIOOnRootBus bool + // HotPlugVFIO is used to indicate if devices need to be hotplugged on the + // root, switch, bridge or no-port + HotPlugVFIO hv.PCIePort + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the - // root port or a switch or no-port + // root, bridge, switch or no-port ColdPlugVFIO hv.PCIePort // BootToBeTemplate used to indicate if the VM is created to be a template VM diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 05476ae85..06e35d3e3 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -84,6 +84,7 @@ type QemuState struct { PCIeRootPort int PCIeSwitchPort int HotplugVFIOOnRootBus bool + HotPlugVFIO hv.PCIePort ColdPlugVFIO hv.PCIePort } @@ -283,7 +284,7 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.Logger().Debug("Creating UUID") q.state.UUID = uuid.Generate().String() - + q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.PCIeRootPort = int(q.config.PCIeRootPort) From 55a66eb7fba9d31c7bb33215c8b7a222bfe70a67 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Mon, 8 May 2023 08:04:43 +0000 Subject: [PATCH 07/87] gpu: Add config to TOML Update cold-plug and hot-plug setting to include bridge, root and switch-port Signed-off-by: Zvonko Kaiser --- src/runtime/config/configuration-qemu.toml.in | 9 ++- src/runtime/pkg/device/drivers/utils.go | 10 +-- src/runtime/pkg/device/drivers/vfio.go | 2 +- .../pkg/katautils/config-settings.go.in | 2 +- src/runtime/pkg/katautils/config.go | 25 ++++-- src/runtime/pkg/oci/utils_test.go | 58 +++++++------- src/runtime/virtcontainers/acrn.go | 2 +- src/runtime/virtcontainers/clh.go | 6 +- src/runtime/virtcontainers/clh_test.go | 8 +- src/runtime/virtcontainers/hypervisor.go | 2 +- src/runtime/virtcontainers/kata_agent.go | 23 ++++-- 
src/runtime/virtcontainers/kata_agent_test.go | 24 +++--- src/runtime/virtcontainers/monitor.go | 13 +-- src/runtime/virtcontainers/qemu.go | 67 +++++++--------- src/runtime/virtcontainers/sandbox.go | 25 +++--- src/runtime/virtcontainers/sandbox_test.go | 80 +++++++++---------- .../virtcontainers/veth_endpoint_test.go | 6 +- src/runtime/virtcontainers/virtiofsd_test.go | 14 ++-- .../osbuilder/image-builder/image_builder.sh | 2 +- .../packaging/scripts/configure-hypervisor.sh | 4 +- 20 files changed, 199 insertions(+), 183 deletions(-) diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index f58f87141..594261c3d 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -352,8 +352,15 @@ pflashes = [] # Default false #hotplug_vfio_on_root_bus = true +# Enable hot-plugging of VFIO devices to a bridge-port, +# root-port or switch-port. +# The default setting is "no-port" +#hot_plug_vfio = "root-port" + # In a confidential compute environment hot-plugging can compromise -# security. Enable cold-plugging of VFIO devices to a root-port. +# security. +# Enable cold-plugging of VFIO devices to a bridge-port, +# root-port or switch-port. # The default setting is "no-port", which means disabled. 
#cold_plug_vfio = "root-port" diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 19ac99c30..f0c210954 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -47,7 +47,7 @@ func deviceLogger() *logrus.Entry { return api.DeviceLogger() } -// IsPCIeDevice Identify PCIe device by reading the size of the PCI config space +// IsPCIeDevice Identifies PCIe device by reading the size of the PCI config space // Plain PCI device have 256 bytes of config space where PCIe devices have 4K func IsPCIeDevice(bdf string) bool { if len(strings.Split(bdf, ":")) == 2 { @@ -157,9 +157,7 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo // GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group // We can reuse this function at various levels, sandbox, container. -// Only the VFIO module is allowed to do bus assignments, all other modules need to -// ignore it if used as helper function to get VFIO information. 
-func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) { +func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) { vfioDevs := []*config.VFIODev{} @@ -207,8 +205,8 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme Class: pciClass, Rank: -1, } - if isPCIe && !ignoreBusAssignment { - vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) + if isPCIe { + vfioPCI.Rank = len(AllPCIeDevs) AllPCIeDevs[deviceBDF] = true } vfio = vfioPCI diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 4231f0d30..3604a355d 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -73,7 +73,7 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } }() - device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false) + device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo) if err != nil { return err } diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index a6825a8a0..521f7b60f 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -109,7 +109,7 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" // Default config file used by stateless systems. 
var defaultRuntimeConfiguration = "@CONFIG_PATH@" -const defaultHotPlugVFIO = hv.BridgePort +const defaultHotPlugVFIO = hv.NoPort const defaultColdPlugVFIO = hv.NoPort diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index c0a0372df..75a246984 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -76,6 +76,7 @@ type factory struct { VMCacheNumber uint `toml:"vm_cache_number"` Template bool `toml:"enable_template"` } + type hypervisor struct { Path string `toml:"path"` JailerPath string `toml:"jailer_path"` @@ -886,6 +887,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { GuestMemoryDumpPath: h.GuestMemoryDumpPath, GuestMemoryDumpPaging: h.GuestMemoryDumpPaging, ConfidentialGuest: h.ConfidentialGuest, + SevSnpGuest: h.SevSnpGuest, GuestSwap: h.GuestSwap, Rootless: h.Rootless, LegacySerial: h.LegacySerial, @@ -1677,10 +1679,7 @@ func checkConfig(config oci.RuntimeConfig) error { hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType - if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil { - return err - } - if err := checkPCIeConfig(hotPlugVFIO, machineType); err != nil { + if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil { return err } @@ -1690,19 +1689,29 @@ func checkConfig(config oci.RuntimeConfig) error { // checkPCIeConfig ensures the PCIe configuration is valid. 
// Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port -func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error { +func checkPCIeConfig(coldPlug hv.PCIePort, hotPlug hv.PCIePort, machineType string) error { // Currently only QEMU q35 supports advanced PCIe topologies // firecracker, dragonball do not have right now any PCIe support if machineType != "q35" { return nil } - if vfioPort == hv.NoPort || vfioPort == hv.BridgePort || - vfioPort == hv.RootPort || vfioPort == hv.SwitchPort { + + if coldPlug != hv.NoPort && hotPlug != hv.NoPort { + return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug) + } + var port hv.PCIePort + if coldPlug != hv.NoPort { + port = coldPlug + } + if hotPlug != hv.NoPort { + port = hotPlug + } + if port == hv.NoPort || port == hv.BridgePort || port == hv.RootPort || port == hv.SwitchPort { return nil } return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s", - vfioPort, hv.NoPort, hv.BridgePort, hv.RootPort, hv.SwitchPort) + coldPlug, hv.NoPort, hv.BridgePort, hv.RootPort, hv.SwitchPort) } // checkNetNsConfig performs sanity checks on disable_new_netns config. 
diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index f3899f1b8..7b2c10f2f 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -821,22 +821,22 @@ func TestRegexpContains(t *testing.T) { //nolint: govet type testData struct { - toMatch string regexps []string + toMatch string expected bool } data := []testData{ - {regexps: []string{}, toMatch: "", expected: false}, - {regexps: []string{}, toMatch: "nonempty", expected: false}, - {regexps: []string{"simple"}, toMatch: "simple", expected: true}, - {regexps: []string{"simple"}, toMatch: "some_simple_text", expected: true}, - {regexps: []string{"simple"}, toMatch: "simp", expected: false}, - {regexps: []string{"one", "two"}, toMatch: "one", expected: true}, - {regexps: []string{"one", "two"}, toMatch: "two", expected: true}, - {regexps: []string{"o*"}, toMatch: "oooo", expected: true}, - {regexps: []string{"o*"}, toMatch: "oooa", expected: true}, - {regexps: []string{"^o*$"}, toMatch: "oooa", expected: false}, + {[]string{}, "", false}, + {[]string{}, "nonempty", false}, + {[]string{"simple"}, "simple", true}, + {[]string{"simple"}, "some_simple_text", true}, + {[]string{"simple"}, "simp", false}, + {[]string{"one", "two"}, "one", true}, + {[]string{"one", "two"}, "two", true}, + {[]string{"o*"}, "oooo", true}, + {[]string{"o*"}, "oooa", true}, + {[]string{"^o*$"}, "oooa", false}, } for _, d := range data { @@ -850,25 +850,25 @@ func TestCheckPathIsInGlobs(t *testing.T) { //nolint: govet type testData struct { - toMatch string globs []string + toMatch string expected bool } data := []testData{ - {globs: []string{}, toMatch: "", expected: false}, - {globs: []string{}, toMatch: "nonempty", expected: false}, - {globs: []string{"simple"}, toMatch: "simple", expected: false}, - {globs: []string{"simple"}, toMatch: "some_simple_text", expected: false}, - {globs: []string{"/bin/ls"}, toMatch: "/bin/ls", expected: true}, - {globs: []string{"/bin/ls", 
"/bin/false"}, toMatch: "/bin/ls", expected: true}, - {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/false", expected: true}, - {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/bar", expected: false}, - {globs: []string{"/bin/*ls*"}, toMatch: "/bin/ls", expected: true}, - {globs: []string{"/bin/*ls*"}, toMatch: "/bin/false", expected: true}, - {globs: []string{"bin/ls"}, toMatch: "/bin/ls", expected: false}, - {globs: []string{"./bin/ls"}, toMatch: "/bin/ls", expected: false}, - {globs: []string{"*/bin/ls"}, toMatch: "/bin/ls", expected: false}, + {[]string{}, "", false}, + {[]string{}, "nonempty", false}, + {[]string{"simple"}, "simple", false}, + {[]string{"simple"}, "some_simple_text", false}, + {[]string{"/bin/ls"}, "/bin/ls", true}, + {[]string{"/bin/ls", "/bin/false"}, "/bin/ls", true}, + {[]string{"/bin/ls", "/bin/false"}, "/bin/false", true}, + {[]string{"/bin/ls", "/bin/false"}, "/bin/bar", false}, + {[]string{"/bin/*ls*"}, "/bin/ls", true}, + {[]string{"/bin/*ls*"}, "/bin/false", true}, + {[]string{"bin/ls"}, "/bin/ls", false}, + {[]string{"./bin/ls"}, "/bin/ls", false}, + {[]string{"*/bin/ls"}, "/bin/ls", false}, } for _, d := range data { @@ -923,10 +923,10 @@ func TestParseAnnotationUintConfiguration(t *testing.T) { // nolint: govet testCases := []struct { - err error annotations map[string]string - validFunc func(uint64) error expected uint64 + err error + validFunc func(uint64) error }{ { annotations: map[string]string{key: ""}, @@ -1007,10 +1007,10 @@ func TestParseAnnotationBoolConfiguration(t *testing.T) { // nolint: govet testCases := []struct { - err error annotationKey string annotationValueList []string expected bool + err error }{ { annotationKey: boolKey, @@ -1207,8 +1207,8 @@ func TestNewMount(t *testing.T) { assert := assert.New(t) testCases := []struct { - in specs.Mount out vc.Mount + in specs.Mount }{ { in: specs.Mount{ diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index 
735b20019..35c71a6d6 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -45,10 +45,10 @@ type AcrnState struct { // Acrn is an Hypervisor interface implementation for the Linux acrn hypervisor. type Acrn struct { + sandbox *Sandbox ctx context.Context arch acrnArch store persistapi.PersistDriver - sandbox *Sandbox id string acrnConfig Config config HypervisorConfig diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index cec2e1c63..05aa15b15 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -246,15 +246,15 @@ func (s *CloudHypervisorState) reset() { } type cloudHypervisor struct { - vmconfig chclient.VmConfig console console.Console virtiofsDaemon VirtiofsDaemon - ctx context.Context APIClient clhClient + ctx context.Context + id string netDevices *[]chclient.NetConfig devicesIds map[string]string netDevicesFiles map[string][]*os.File - id string + vmconfig chclient.VmConfig state CloudHypervisorState config HypervisorConfig stopped int32 diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index 0ab7ef5b9..8d47b731e 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -188,13 +188,13 @@ func TestCloudHypervisorAddNetCheckEnpointTypes(t *testing.T) { } // nolint: govet tests := []struct { - args args name string + args args wantErr bool }{ - {name: "TapEndpoint", args: args{e: &TapEndpoint{}}, wantErr: true}, - {name: "Empty VethEndpoint", args: args{e: &VethEndpoint{}}, wantErr: true}, - {name: "Valid VethEndpoint", args: args{e: validVeth}, wantErr: false}, + {"TapEndpoint", args{e: &TapEndpoint{}}, true}, + {"Empty VethEndpoint", args{e: &VethEndpoint{}}, true}, + {"Valid VethEndpoint", args{e: validVeth}, false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/src/runtime/virtcontainers/hypervisor.go 
b/src/runtime/virtcontainers/hypervisor.go index 7d14ab448..43974adf6 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -507,7 +507,7 @@ type HypervisorConfig struct { // RawDevics are used to get PCIe device info early before the sandbox // is started to make better PCIe topology decisions - RawDevices []config.DeviceInfo + VFIODevices []config.DeviceInfo // HotplugVFIO is used to indicate if devices need to be hotplugged on the // root port or a switch diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index ed2be337c..a350c1a48 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -284,16 +284,23 @@ type KataAgentState struct { // nolint: govet type kataAgent struct { - ctx context.Context - vmSocket interface{} - client *kataclient.AgentClient - reqHandlers map[string]reqFunc - state KataAgentState - kmodules []string + ctx context.Context + vmSocket interface{} + + client *kataclient.AgentClient + + // lock protects the client pointer sync.Mutex + + state KataAgentState + + reqHandlers map[string]reqFunc + kmodules []string + dialTimout uint32 - keepConn bool - dead bool + + keepConn bool + dead bool } func (k *kataAgent) Logger() *logrus.Entry { diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index 8fba81837..3653674ca 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -235,10 +235,10 @@ func TestHandleDeviceBlockVolume(t *testing.T) { // nolint: govet tests := []struct { - inputDev *drivers.BlockDevice - resultVol *pb.Storage BlockDeviceDriver string inputMount Mount + inputDev *drivers.BlockDevice + resultVol *pb.Storage }{ { inputDev: &drivers.BlockDevice{ @@ -1024,10 +1024,10 @@ func TestKataAgentKernelParams(t *testing.T) { // nolint: govet type testData struct { - expectedParams []Param 
- containerPipeSize uint32 debug bool trace bool + containerPipeSize uint32 + expectedParams []Param } debugParam := Param{Key: "agent.log", Value: "debug"} @@ -1036,28 +1036,28 @@ func TestKataAgentKernelParams(t *testing.T) { containerPipeSizeParam := Param{Key: vcAnnotations.ContainerPipeSizeKernelParam, Value: "2097152"} data := []testData{ - {debug: false, trace: false, containerPipeSize: 0, expectedParams: []Param{}}, + {false, false, 0, []Param{}}, // Debug - {debug: true, trace: false, containerPipeSize: 0, expectedParams: []Param{debugParam}}, + {true, false, 0, []Param{debugParam}}, // Tracing - {debug: false, trace: true, containerPipeSize: 0, expectedParams: []Param{traceParam}}, + {false, true, 0, []Param{traceParam}}, // Debug + Tracing - {debug: true, trace: true, containerPipeSize: 0, expectedParams: []Param{debugParam, traceParam}}, + {true, true, 0, []Param{debugParam, traceParam}}, // pipesize - {debug: false, trace: false, containerPipeSize: 2097152, expectedParams: []Param{containerPipeSizeParam}}, + {false, false, 2097152, []Param{containerPipeSizeParam}}, // Debug + pipesize - {debug: true, trace: false, containerPipeSize: 2097152, expectedParams: []Param{debugParam, containerPipeSizeParam}}, + {true, false, 2097152, []Param{debugParam, containerPipeSizeParam}}, // Tracing + pipesize - {debug: false, trace: true, containerPipeSize: 2097152, expectedParams: []Param{traceParam, containerPipeSizeParam}}, + {false, true, 2097152, []Param{traceParam, containerPipeSizeParam}}, // Debug + Tracing + pipesize - {debug: true, trace: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, traceParam, containerPipeSizeParam}}, + {true, true, 2097152, []Param{debugParam, traceParam, containerPipeSizeParam}}, } for i, d := range data { diff --git a/src/runtime/virtcontainers/monitor.go b/src/runtime/virtcontainers/monitor.go index 75e06fc8f..ae7843bea 100644 --- a/src/runtime/virtcontainers/monitor.go +++ b/src/runtime/virtcontainers/monitor.go @@ -22,12 +22,15 @@ 
var monitorLog = virtLog.WithField("subsystem", "virtcontainers/monitor") // nolint: govet type monitor struct { - sandbox *Sandbox - stopCh chan bool - watchers []chan error - checkInterval time.Duration - wg sync.WaitGroup + watchers []chan error + sandbox *Sandbox + + wg sync.WaitGroup sync.Mutex + + stopCh chan bool + checkInterval time.Duration + running bool } diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 06e35d3e3..514cc9ee3 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -747,9 +747,8 @@ func (q *qemu) checkBpfEnabled() { // There is only 64kB of IO memory each root,switch port will consume 4k hence // only 16 ports possible. func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error { - // We do not need to do anything if we want to hotplug a VFIO to a - // pcie-pci-bridge, just return - if hypervisorConfig.HotPlugVFIO == hv.BridgePort { + // If no-port set just return no need to add PCIe Root Port or PCIe Switches + if hypervisorConfig.HotPlugVFIO == hv.NoPort && hypervisorConfig.ColdPlugVFIO == hv.NoPort { return nil } // Add PCIe Root Port or PCIe Switches to the hypervisor @@ -772,37 +771,27 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } - // Get the number of hotpluggable ports needed from the provided devices - var numOfHotPluggablePorts uint32 = 0 - for _, dev := range hypervisorConfig.RawDevices { - hostPath, _ := config.GetHostPath(dev, false, "") - if hostPath == "" { - continue - } - - vfioNumberOrContainer := filepath.Base(hostPath) - // If we want to have VFIO inside of the VM we need to passthrough - // additionally /dev/vfio/vfio which is the VFIO "container". - // Ignore it and handle the remaining devices. 
- if strings.Compare(vfioNumberOrContainer, "vfio") == 0 { - continue - } - iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioNumberOrContainer, "devices") - deviceFiles, err := os.ReadDir(iommuDevicesPath) + // Get the number of hot(cold)-pluggable ports needed from the provided devices + var numOfPluggablePorts uint32 = 0 + for _, dev := range hypervisorConfig.VFIODevices { + var err error + dev.HostPath, err = config.GetHostPath(dev, false, "") if err != nil { - return err + return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err) } - - for _, deviceFile := range deviceFiles { - deviceBDF, _, _, err := drivers.GetVFIODetails(deviceFile.Name(), iommuDevicesPath) - if err != nil { - return err - } - if drivers.IsPCIeDevice(deviceBDF) { - numOfHotPluggablePorts = numOfHotPluggablePorts + 1 + devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + if err != nil { + return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) + } + for _, vfioDevice := range devicesPerIOMMUGroup { + if drivers.IsPCIeDevice(vfioDevice.BDF) { + numOfPluggablePorts = numOfPluggablePorts + 1 } + // Reset the Rank, the vfio module is going to assign the correct one + vfioDevice.Rank = -1 } } + drivers.AllPCIeDevs = make(map[string]bool) // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices @@ -817,20 +806,20 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig // If the user provided more root ports than we have detected // use the user provided number of PCIe root ports - if numOfHotPluggablePorts < hypervisorConfig.PCIeRootPort { - numOfHotPluggablePorts = hypervisorConfig.PCIeRootPort + if numOfPluggablePorts < hypervisorConfig.PCIeRootPort { + numOfPluggablePorts = hypervisorConfig.PCIeRootPort } // If the user provided more switch ports than we have detected // use the user 
provided number of PCIe root ports - if numOfHotPluggablePorts < hypervisorConfig.PCIeSwitchPort { - numOfHotPluggablePorts = hypervisorConfig.PCIeSwitchPort + if numOfPluggablePorts < hypervisorConfig.PCIeSwitchPort { + numOfPluggablePorts = hypervisorConfig.PCIeSwitchPort } - if q.state.HotPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { - qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfHotPluggablePorts, memSize32bit, memSize64bit) + if q.state.HotPlugVFIO == hv.RootPort || q.state.ColdPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) } - if q.state.HotPlugVFIO == hv.SwitchPort { - qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfHotPluggablePorts, memSize32bit, memSize64bit) + if q.state.HotPlugVFIO == hv.SwitchPort || q.state.ColdPlugVFIO == hv.SwitchPort { + qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) } return nil } @@ -1847,6 +1836,7 @@ func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.V return fmt.Errorf("VFIO device is a PCIe device. 
Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) } + device.Bus = fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, device.Rank) return q.executeVFIODeviceAdd(device) } @@ -1897,11 +1887,11 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op } if op == AddDevice { buf, _ := json.Marshal(device) q.Logger().WithFields(logrus.Fields{ "machine-type": q.HypervisorConfig().HypervisorMachineType, - "hotplug-vfio": q.state.HotPlugVFIO, + "hot-plug-vfio": q.state.HotPlugVFIO, "pcie-root-port": q.state.PCIeRootPort, "pcie-switch-port": q.state.PCIeSwitchPort, "device-info": string(buf), diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index bda931b43..a5e9ca4a5 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -613,22 +613,26 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor if err := validateHypervisorConfig(&sandboxConfig.HypervisorConfig); err != nil { return nil, err } - // Aggregate all the container devices and update the HV config - var devices []config.DeviceInfo - for _, ct := range sandboxConfig.Containers { - devices = append(devices, ct.DeviceInfos...) - } - sandboxConfig.HypervisorConfig.RawDevices = devices // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort) - var devs []config.DeviceInfo + // Aggregate all the container devices for hot-plug and use them to deduce + // the correct amount of ports to reserve for the hypervisor. 
+ hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != hv.NoPort) + + var vfioHotPlugDevices []config.DeviceInfo + var vfioColdPlugDevices []config.DeviceInfo + for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos { - if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) { + isVFIO := deviceManager.IsVFIO(device.ContainerPath) + if hotPlugVFIO && isVFIO { + vfioHotPlugDevices = append(vfioHotPlugDevices, device) + } + if coldPlugVFIO && isVFIO { device.ColdPlug = true - devs = append(devs, device) + vfioColdPlugDevices = append(vfioColdPlugDevices, device) // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. @@ -638,6 +642,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } } } + sandboxConfig.HypervisorConfig.VFIODevices = vfioHotPlugDevices // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { @@ -652,7 +657,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } - for _, dev := range devs { + for _, dev := range vfioColdPlugDevices { _, err := s.AddDevice(ctx, dev) if err != nil { s.Logger().WithError(err).Debug("Cannot cold-plug add device") diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index db1aa7f84..90a2af7ee 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -650,8 +650,8 @@ func TestSandboxCreateAssets(t *testing.T) { // nolint: govet type testData struct { - annotations map[string]string assetType types.AssetType + annotations map[string]string } tmpfile, err := os.CreateTemp("", "virtcontainers-test-") @@ -687,50 +687,50 @@ func TestSandboxCreateAssets(t *testing.T) { data := []testData{ { - assetType: 
types.FirmwareAsset, - annotations: map[string]string{ + types.FirmwareAsset, + map[string]string{ annotations.FirmwarePath: filename, annotations.FirmwareHash: assetContentHash, }, }, { - assetType: types.HypervisorAsset, - annotations: map[string]string{ + types.HypervisorAsset, + map[string]string{ annotations.HypervisorPath: filename, annotations.HypervisorHash: assetContentHash, }, }, { - assetType: types.HypervisorCtlAsset, - annotations: map[string]string{ + types.HypervisorCtlAsset, + map[string]string{ annotations.HypervisorCtlPath: filename, annotations.HypervisorCtlHash: assetContentHash, }, }, { - assetType: types.ImageAsset, - annotations: map[string]string{ + types.ImageAsset, + map[string]string{ annotations.ImagePath: filename, annotations.ImageHash: assetContentHash, }, }, { - assetType: types.InitrdAsset, - annotations: map[string]string{ + types.InitrdAsset, + map[string]string{ annotations.InitrdPath: filename, annotations.InitrdHash: assetContentHash, }, }, { - assetType: types.JailerAsset, - annotations: map[string]string{ + types.JailerAsset, + map[string]string{ annotations.JailerPath: filename, annotations.JailerHash: assetContentHash, }, }, { - assetType: types.KernelAsset, - annotations: map[string]string{ + types.KernelAsset, + map[string]string{ annotations.KernelPath: filename, annotations.KernelHash: assetContentHash, }, @@ -1407,58 +1407,58 @@ func TestSandbox_Cgroups(t *testing.T) { // nolint: govet tests := []struct { - s *Sandbox name string + s *Sandbox wantErr bool needRoot bool }{ { - name: "New sandbox", - s: &Sandbox{}, - wantErr: true, - needRoot: false, + "New sandbox", + &Sandbox{}, + true, + false, }, { - name: "New sandbox, new config", - s: &Sandbox{config: &SandboxConfig{}}, - wantErr: false, - needRoot: true, + "New sandbox, new config", + &Sandbox{config: &SandboxConfig{}}, + false, + true, }, { - name: "sandbox, container no sandbox type", - s: &Sandbox{ + "sandbox, container no sandbox type", + &Sandbox{ config: 
&SandboxConfig{Containers: []ContainerConfig{ {}, }}}, - wantErr: false, - needRoot: true, + false, + true, }, { - name: "sandbox, container sandbox type", - s: &Sandbox{ + "sandbox, container sandbox type", + &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ sandboxContainer, }}}, - wantErr: false, - needRoot: true, + false, + true, }, { - name: "sandbox, empty linux json", - s: &Sandbox{ + "sandbox, empty linux json", + &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ emptyJSONLinux, }}}, - wantErr: false, - needRoot: true, + false, + true, }, { - name: "sandbox, successful config", - s: &Sandbox{ + "sandbox, successful config", + &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ successfulContainer, }}}, - wantErr: false, - needRoot: true, + false, + true, }, } for _, tt := range tests { diff --git a/src/runtime/virtcontainers/veth_endpoint_test.go b/src/runtime/virtcontainers/veth_endpoint_test.go index 341185b00..dc6f03270 100644 --- a/src/runtime/virtcontainers/veth_endpoint_test.go +++ b/src/runtime/virtcontainers/veth_endpoint_test.go @@ -85,16 +85,16 @@ func TestCreateVethNetworkEndpointChooseIfaceName(t *testing.T) { func TestCreateVethNetworkEndpointInvalidArgs(t *testing.T) { // nolint: govet type endpointValues struct { - ifName string idx int + ifName string } assert := assert.New(t) // all elements are expected to result in failure failingValues := []endpointValues{ - {idx: -1, ifName: "bar"}, - {idx: -1, ifName: ""}, + {-1, "bar"}, + {-1, ""}, } for _, d := range failingValues { diff --git a/src/runtime/virtcontainers/virtiofsd_test.go b/src/runtime/virtcontainers/virtiofsd_test.go index 55eb3fb1a..a4252a2ba 100644 --- a/src/runtime/virtcontainers/virtiofsd_test.go +++ b/src/runtime/virtcontainers/virtiofsd_test.go @@ -17,13 +17,13 @@ import ( func TestVirtiofsdStart(t *testing.T) { // nolint: govet type fields struct { - ctx context.Context + path string socketPath string cache string - sourcePath string - 
path string extraArgs []string + sourcePath string PID int + ctx context.Context } sourcePath := t.TempDir() @@ -41,13 +41,13 @@ func TestVirtiofsdStart(t *testing.T) { // nolint: govet tests := []struct { - fields fields name string + fields fields wantErr bool }{ - {name: "empty config", fields: fields{}, wantErr: true}, - {name: "Directory socket does not exist", fields: NoDirectorySocket, wantErr: true}, - {name: "valid config", fields: validConfig, wantErr: false}, + {"empty config", fields{}, true}, + {"Directory socket does not exist", NoDirectorySocket, true}, + {"valid config", validConfig, false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/tools/osbuilder/image-builder/image_builder.sh b/tools/osbuilder/image-builder/image_builder.sh index f88f3aac0..3e7f0babc 100755 --- a/tools/osbuilder/image-builder/image_builder.sh +++ b/tools/osbuilder/image-builder/image_builder.sh @@ -271,7 +271,7 @@ calculate_required_disk_size() { readonly image="$(mktemp)" readonly mount_dir="$(mktemp -d)" readonly max_tries=20 - readonly increment=100 + readonly increment=10 for i in $(seq 1 $max_tries); do local img_size="$((rootfs_size_mb + (i * increment)))" diff --git a/tools/packaging/scripts/configure-hypervisor.sh b/tools/packaging/scripts/configure-hypervisor.sh index 9dfc9bb32..8bdfc94de 100755 --- a/tools/packaging/scripts/configure-hypervisor.sh +++ b/tools/packaging/scripts/configure-hypervisor.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # # Copyright (c) 2018 Intel Corporation # @@ -14,8 +14,6 @@ # been specified. 
#--------------------------------------------------------------------- -set -x - script_name=${0##*/} arch="${3:-$(uname -m)}" From b1aa8c8a24827983f450daab4f0fffc8b8571f63 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Mon, 8 May 2023 12:59:36 +0000 Subject: [PATCH 08/87] gpu: Moved the PCIe configs to drivers The hypervisor_state file was the wrong location for the PCIe Port settings, moved everything under device umbrella, where it can be consumed more easily and we do not get into circular deps. Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 6 +- src/runtime/cmd/kata-runtime/kata-env_test.go | 16 +- .../pkg/containerd-shim-v2/create_test.go | 12 +- src/runtime/pkg/device/config/config.go | 65 ++++++ src/runtime/pkg/device/drivers/utils.go | 2 + src/runtime/pkg/device/drivers/vfio.go | 1 - .../pkg/hypervisors/hypervisor_state.go | 34 +--- src/runtime/pkg/katatestutils/utils.go | 6 +- .../pkg/katautils/config-settings.go.in | 6 +- src/runtime/pkg/katautils/config.go | 187 +++++++++--------- src/runtime/pkg/katautils/config_test.go | 32 +-- src/runtime/virtcontainers/hypervisor.go | 4 +- src/runtime/virtcontainers/kata_agent_test.go | 8 +- src/runtime/virtcontainers/nydusd_test.go | 8 +- .../virtcontainers/persist/api/config.go | 6 +- src/runtime/virtcontainers/qemu.go | 37 ++-- src/runtime/virtcontainers/qemu_arch_base.go | 20 +- src/runtime/virtcontainers/sandbox.go | 10 +- 18 files changed, 249 insertions(+), 211 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index e5e7e87ed..d87e426bf 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -17,7 +17,7 @@ import ( "github.com/prometheus/procfs" "github.com/urfave/cli" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" 
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" @@ -115,8 +115,8 @@ type HypervisorInfo struct { MemorySlots uint32 PCIeRootPort uint32 PCIeSwitchPort uint32 - HotPlugVFIO hv.PCIePort - ColdPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort HotplugVFIOOnRootBus bool Debug bool } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index d897a0c4e..4de2fb09e 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -19,12 +19,12 @@ import ( "testing" "github.com/BurntSushi/toml" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/urfave/cli" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" @@ -74,9 +74,9 @@ func createConfig(configPath string, fileData string) error { return nil } -func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) { - var hotPlugVFIO hv.PCIePort - var coldPlugVFIO hv.PCIePort +func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) { + var hotPlugVFIO config.PCIePort + var coldPlugVFIO config.PCIePort const logPath = "/log/path" hypervisorPath := filepath.Join(prefixDir, "hypervisor") kernelPath := filepath.Join(prefixDir, "kernel") @@ -90,8 +90,8 @@ func makeRuntimeConfig(prefixDir 
string) (configFile string, config oci.RuntimeC hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) pcieSwitchPort := uint32(2) - hotPlugVFIO = hv.BridgePort - coldPlugVFIO = hv.NoPort + hotPlugVFIO = config.BridgePort + coldPlugVFIO = config.NoPort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd") @@ -161,12 +161,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC return "", oci.RuntimeConfig{}, err } - _, config, err = katautils.LoadConfiguration(configFile, true) + _, ociConfig, err = katautils.LoadConfiguration(configFile, true) if err != nil { return "", oci.RuntimeConfig{}, err } - return configFile, config, nil + return configFile, ociConfig, nil } func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) { diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index dc37d72dd..f95b501b3 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -20,7 +20,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" @@ -308,9 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) { assert.Error(err) } -func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) { - var hotPlugVFIO hv.PCIePort - var coldPlugVFIO hv.PCIePort +func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) { + var hotPlugVFIO config.PCIePort + var 
coldPlugVFIO config.PCIePort if dir == "" { return "", fmt.Errorf("BUG: need directory") } @@ -336,8 +336,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") - hotPlugVFIO = hv.BridgePort - coldPlugVFIO = hv.RootPort + hotPlugVFIO = config.BridgePort + coldPlugVFIO = config.RootPort configFileOptions := ktu.RuntimeConfigOptions{ Hypervisor: "qemu", diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index f3a0e879a..2431fa36f 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -122,6 +122,68 @@ var SysBusPciDevicesPath = "/sys/bus/pci/devices" var getSysDevPath = getSysDevPathImpl +// PCIePortBusPrefix gives us the correct bus nameing dependeing on the port +// used to hot(cold)-plug the device +type PCIePortBusPrefix string + +const ( + PCIeRootPortPrefix PCIePortBusPrefix = "rp" + PCIeSwitchPortPrefix PCIePortBusPrefix = "sw" + PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup" + PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp" + PCIBridgePortPrefix PCIePortBusPrefix = "bp" +) + +func (p PCIePortBusPrefix) String() string { + switch p { + case PCIeRootPortPrefix: + fallthrough + case PCIeSwitchPortPrefix: + fallthrough + case PCIeSwitchUpstreamPortPrefix: + fallthrough + case PCIeSwitchhDownstreamPortPrefix: + fallthrough + case PCIBridgePortPrefix: + return string(p) + } + return fmt.Sprintf("", string(p)) +} + +// PCIePort distinguish only between root and switch port +type PCIePort string + +const ( + // RootPort attach VFIO devices to a root-port + RootPort PCIePort = "root-port" + // SwitchPort attach VFIO devices to a switch-port + SwitchPort = "switch-port" + // BridgePort is the default + BridgePort = "bridge-port" + // NoPort is for disabling VFIO hotplug/coldplug + NoPort = "no-port" +) + +func (p PCIePort) String() 
string { + switch p { + case RootPort: + fallthrough + case SwitchPort: + fallthrough + case BridgePort: + fallthrough + case NoPort: + return string(p) + } + return fmt.Sprintf("", string(p)) +} + +var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{ + RootPort: PCIeRootPortPrefix, + SwitchPort: PCIeSwitchhDownstreamPortPrefix, + BridgePort: PCIBridgePortPrefix, +} + // DeviceInfo is an embedded type that contains device data common to all types of devices. type DeviceInfo struct { // DriverOptions is specific options for each device driver @@ -167,6 +229,9 @@ type DeviceInfo struct { // ColdPlug specifies whether the device must be cold plugged (true) // or hot plugged (false). ColdPlug bool + + // Specifies the PCIe port type to which the device is attached + Port PCIePort } // BlockDrive represents a block storage drive which may be used in case the storage diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index f0c210954..ec1609fe9 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -208,7 +208,9 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe if isPCIe { vfioPCI.Rank = len(AllPCIeDevs) AllPCIeDevs[deviceBDF] = true + vfioPCI.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[device.Port], vfioPCI.Rank) } + vfio = vfioPCI case config.VFIOAPDeviceMediatedType: devices, err := GetAPVFIODevices(deviceSysfsDev) diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 3604a355d..ad6ec5cc3 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -28,7 +28,6 @@ const ( vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id" iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group" vfioDevPath = "/dev/vfio/%s" - pcieRootPortPrefix = "rp" vfioAPSysfsDir = "/sys/devices/vfio_ap" ) diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go 
b/src/runtime/pkg/hypervisors/hypervisor_state.go index 2eed91cae..20d6c7d65 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -5,7 +5,7 @@ package hypervisors -import "fmt" +import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" // Bridge is a bridge where devices can be hot plugged type Bridge struct { @@ -28,34 +28,6 @@ type CPUDevice struct { ID string } -// PCIePort distinguish only between root and switch port -type PCIePort string - -const ( - // RootPort attach VFIO devices to a root-port - RootPort PCIePort = "root-port" - // SwitchPort attach VFIO devices to a switch-port - SwitchPort = "switch-port" - // BridgePort is the default - BridgePort = "bridge-port" - // NoPort is for disabling VFIO hotplug/coldplug - NoPort = "no-port" -) - -func (p PCIePort) String() string { - switch p { - case RootPort: - fallthrough - case SwitchPort: - fallthrough - case BridgePort: - fallthrough - case NoPort: - return string(p) - } - return fmt.Sprintf("", string(p)) -} - type HypervisorState struct { BlockIndexMap map[int]struct{} // Type of hypervisor, E.g. qemu/firecracker/acrn. 
@@ -76,7 +48,7 @@ type HypervisorState struct { Pid int PCIeRootPort int PCIeSwitchPort int - HotPlugVFIO PCIePort - ColdPlugVFIO PCIePort + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort HotplugVFIOOnRootBus bool } diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index ef650ea0d..bd76ae9a8 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -14,7 +14,7 @@ import ( "strconv" "testing" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" ) @@ -226,8 +226,8 @@ type RuntimeConfigOptions struct { PFlash []string PCIeRootPort uint32 PCIeSwitchPort uint32 - HotPlugVFIO hv.PCIePort - ColdPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort DefaultVCPUCount uint32 DefaultMaxVCPUCount uint32 DefaultMemSize uint32 diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 521f7b60f..f3ce570aa 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -10,7 +10,7 @@ package katautils import ( - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" ) // name is the name of the runtime @@ -109,7 +109,7 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" // Default config file used by stateless systems. 
var defaultRuntimeConfiguration = "@CONFIG_PATH@" -const defaultHotPlugVFIO = hv.NoPort -const defaultColdPlugVFIO = hv.NoPort +const defaultHotPlugVFIO = config.NoPort +const defaultColdPlugVFIO = config.NoPort diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 75a246984..d34ede93b 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -20,7 +20,6 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" @@ -78,90 +77,90 @@ type factory struct { } type hypervisor struct { - Path string `toml:"path"` - JailerPath string `toml:"jailer_path"` - Kernel string `toml:"kernel"` - CtlPath string `toml:"ctlpath"` - Initrd string `toml:"initrd"` - Image string `toml:"image"` - RootfsType string `toml:"rootfs_type"` - Firmware string `toml:"firmware"` - FirmwareVolume string `toml:"firmware_volume"` - MachineAccelerators string `toml:"machine_accelerators"` - CPUFeatures string `toml:"cpu_features"` - KernelParams string `toml:"kernel_params"` - MachineType string `toml:"machine_type"` - BlockDeviceDriver string `toml:"block_device_driver"` - EntropySource string `toml:"entropy_source"` - SharedFS string `toml:"shared_fs"` - VirtioFSDaemon string `toml:"virtio_fs_daemon"` - VirtioFSCache string `toml:"virtio_fs_cache"` - VhostUserStorePath string `toml:"vhost_user_store_path"` - FileBackedMemRootDir string `toml:"file_mem_backend"` - GuestHookPath string `toml:"guest_hook_path"` - GuestMemoryDumpPath string 
`toml:"guest_memory_dump_path"` - SeccompSandbox string `toml:"seccompsandbox"` - BlockDeviceAIO string `toml:"block_device_aio"` - HypervisorPathList []string `toml:"valid_hypervisor_paths"` - JailerPathList []string `toml:"valid_jailer_paths"` - CtlPathList []string `toml:"valid_ctlpaths"` - VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` - VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` - PFlashList []string `toml:"pflashes"` - VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` - FileBackedMemRootList []string `toml:"valid_file_mem_backends"` - EntropySourceList []string `toml:"valid_entropy_sources"` - EnableAnnotations []string `toml:"enable_annotations"` - RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` - TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` - MemOffset uint64 `toml:"memory_offset"` - DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` - DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` - DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` - DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` - DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` - NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` - NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` - NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` - NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` - VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` - VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` - DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` - MemorySize uint32 `toml:"default_memory"` - MemSlots uint32 `toml:"memory_slots"` - DefaultBridges uint32 `toml:"default_bridges"` - Msize9p uint32 `toml:"msize_9p"` - PCIeSwitchPort uint32 `toml:"pcie_switch_port"` - PCIeRootPort uint32 `toml:"pcie_root_port"` - 
NumVCPUs int32 `toml:"default_vcpus"` - BlockDeviceCacheSet bool `toml:"block_device_cache_set"` - BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` - BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` - EnableVhostUserStore bool `toml:"enable_vhost_user_store"` - VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` - DisableBlockDeviceUse bool `toml:"disable_block_device_use"` - MemPrealloc bool `toml:"enable_mem_prealloc"` - HugePages bool `toml:"enable_hugepages"` - VirtioMem bool `toml:"enable_virtio_mem"` - IOMMU bool `toml:"enable_iommu"` - IOMMUPlatform bool `toml:"enable_iommu_platform"` - Debug bool `toml:"enable_debug"` - DisableNestingChecks bool `toml:"disable_nesting_checks"` - EnableIOThreads bool `toml:"enable_iothreads"` - DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` - HotPlugVFIO hv.PCIePort `toml:"hot_plug_vfio"` - ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"` - DisableVhostNet bool `toml:"disable_vhost_net"` - GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` - ConfidentialGuest bool `toml:"confidential_guest"` - SevSnpGuest bool `toml:"sev_snp_guest"` - GuestSwap bool `toml:"enable_guest_swap"` - Rootless bool `toml:"rootless"` - DisableSeccomp bool `toml:"disable_seccomp"` - DisableSeLinux bool `toml:"disable_selinux"` - DisableGuestSeLinux bool `toml:"disable_guest_selinux"` - LegacySerial bool `toml:"use_legacy_serial"` + Path string `toml:"path"` + JailerPath string `toml:"jailer_path"` + Kernel string `toml:"kernel"` + CtlPath string `toml:"ctlpath"` + Initrd string `toml:"initrd"` + Image string `toml:"image"` + RootfsType string `toml:"rootfs_type"` + Firmware string `toml:"firmware"` + FirmwareVolume string `toml:"firmware_volume"` + MachineAccelerators string `toml:"machine_accelerators"` + CPUFeatures string `toml:"cpu_features"` + KernelParams string `toml:"kernel_params"` + MachineType string 
`toml:"machine_type"` + BlockDeviceDriver string `toml:"block_device_driver"` + EntropySource string `toml:"entropy_source"` + SharedFS string `toml:"shared_fs"` + VirtioFSDaemon string `toml:"virtio_fs_daemon"` + VirtioFSCache string `toml:"virtio_fs_cache"` + VhostUserStorePath string `toml:"vhost_user_store_path"` + FileBackedMemRootDir string `toml:"file_mem_backend"` + GuestHookPath string `toml:"guest_hook_path"` + GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` + SeccompSandbox string `toml:"seccompsandbox"` + BlockDeviceAIO string `toml:"block_device_aio"` + HypervisorPathList []string `toml:"valid_hypervisor_paths"` + JailerPathList []string `toml:"valid_jailer_paths"` + CtlPathList []string `toml:"valid_ctlpaths"` + VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` + VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` + PFlashList []string `toml:"pflashes"` + VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` + FileBackedMemRootList []string `toml:"valid_file_mem_backends"` + EntropySourceList []string `toml:"valid_entropy_sources"` + EnableAnnotations []string `toml:"enable_annotations"` + RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` + TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` + MemOffset uint64 `toml:"memory_offset"` + DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` + DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` + DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` + DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` + DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` + NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` + NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` + NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` + NetRateLimiterOpsOneTimeBurst int64 
`toml:"net_rate_limiter_ops_one_time_burst"` + VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` + VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` + DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` + MemorySize uint32 `toml:"default_memory"` + MemSlots uint32 `toml:"memory_slots"` + DefaultBridges uint32 `toml:"default_bridges"` + Msize9p uint32 `toml:"msize_9p"` + PCIeSwitchPort uint32 `toml:"pcie_switch_port"` + PCIeRootPort uint32 `toml:"pcie_root_port"` + NumVCPUs int32 `toml:"default_vcpus"` + BlockDeviceCacheSet bool `toml:"block_device_cache_set"` + BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` + BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` + EnableVhostUserStore bool `toml:"enable_vhost_user_store"` + VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` + DisableBlockDeviceUse bool `toml:"disable_block_device_use"` + MemPrealloc bool `toml:"enable_mem_prealloc"` + HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` + IOMMU bool `toml:"enable_iommu"` + IOMMUPlatform bool `toml:"enable_iommu_platform"` + Debug bool `toml:"enable_debug"` + DisableNestingChecks bool `toml:"disable_nesting_checks"` + EnableIOThreads bool `toml:"enable_iothreads"` + DisableImageNvdimm bool `toml:"disable_image_nvdimm"` + HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` + ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` + DisableVhostNet bool `toml:"disable_vhost_net"` + GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` + SevSnpGuest bool `toml:"sev_snp_guest"` + GuestSwap bool `toml:"enable_guest_swap"` + Rootless bool `toml:"rootless"` + DisableSeccomp bool `toml:"disable_seccomp"` + DisableSeLinux bool `toml:"disable_selinux"` + DisableGuestSeLinux bool `toml:"disable_guest_selinux"` + LegacySerial bool `toml:"use_legacy_serial"` } type runtime 
struct { @@ -289,13 +288,13 @@ func (h hypervisor) firmware() (string, error) { return ResolvePath(p) } -func (h hypervisor) coldPlugVFIO() hv.PCIePort { +func (h hypervisor) coldPlugVFIO() config.PCIePort { if h.ColdPlugVFIO == "" { return defaultColdPlugVFIO } return h.ColdPlugVFIO } -func (h hypervisor) hotPlugVFIO() hv.PCIePort { +func (h hypervisor) hotPlugVFIO() config.PCIePort { if h.HotPlugVFIO == "" { return defaultHotPlugVFIO } @@ -1689,29 +1688,29 @@ func checkConfig(config oci.RuntimeConfig) error { // checkPCIeConfig ensures the PCIe configuration is valid. // Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port -func checkPCIeConfig(coldPlug hv.PCIePort, hotPlug hv.PCIePort, machineType string) error { +func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error { // Currently only QEMU q35 supports advanced PCIe topologies // firecracker, dragonball do not have right now any PCIe support if machineType != "q35" { return nil } - if coldPlug != hv.NoPort && hotPlug != hv.NoPort { + if coldPlug != config.NoPort && hotPlug != config.NoPort { return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug) } - var port hv.PCIePort - if coldPlug != hv.NoPort { + var port config.PCIePort + if coldPlug != config.NoPort { port = coldPlug } - if hotPlug != hv.NoPort { + if hotPlug != config.NoPort { port = hotPlug } - if port == hv.NoPort || port == hv.BridgePort || port == hv.RootPort || port == hv.SwitchPort { + if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort { return nil } return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s", - coldPlug, hv.NoPort, hv.BridgePort, hv.RootPort, hv.SwitchPort) + coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort) } // checkNetNsConfig performs sanity checks on disable_new_netns config. 
diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index f30a7a27d..2f062bd08 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -18,8 +18,8 @@ import ( "syscall" "testing" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" @@ -63,16 +63,16 @@ func createConfig(configPath string, fileData string) error { // createAllRuntimeConfigFiles creates all files necessary to call // loadConfiguration(). -func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) { +func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) { if dir == "" { - return config, fmt.Errorf("BUG: need directory") + return testConfig, fmt.Errorf("BUG: need directory") } if hypervisor == "" { - return config, fmt.Errorf("BUG: need hypervisor") + return testConfig, fmt.Errorf("BUG: need hypervisor") } - var hotPlugVFIO hv.PCIePort - var coldPlugVFIO hv.PCIePort + var hotPlugVFIO config.PCIePort + var coldPlugVFIO config.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") kernelParams := "foo=bar xyz" @@ -88,8 +88,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) pcieSwitchPort := uint32(3) - hotPlugVFIO = hv.BridgePort - coldPlugVFIO = hv.RootPort + hotPlugVFIO = config.BridgePort + coldPlugVFIO = config.RootPort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -139,7 
+139,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf configPath := path.Join(dir, "runtime.toml") err = createConfig(configPath, runtimeConfigFileData) if err != nil { - return config, err + return testConfig, err } configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml") @@ -147,7 +147,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf // create a link to the config file err = syscall.Symlink(configPath, configPathLink) if err != nil { - return config, err + return testConfig, err } files := []string{hypervisorPath, kernelPath, imagePath} @@ -156,7 +156,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf // create the resource (which must be >0 bytes) err := WriteFile(file, "foo", testFileMode) if err != nil { - return config, err + return testConfig, err } } @@ -223,10 +223,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf err = SetKernelParams(&runtimeConfig) if err != nil { - return config, err + return testConfig, err } - config = testRuntimeConfig{ + rtimeConfig := testRuntimeConfig{ RuntimeConfig: runtimeConfig, RuntimeConfigFile: configPath, ConfigPath: configPath, @@ -235,7 +235,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf LogPath: logPath, } - return config, nil + return rtimeConfig, nil } // testLoadConfiguration accepts an optional function that can be used @@ -610,7 +610,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { func TestNewQemuHypervisorConfig(t *testing.T) { dir := t.TempDir() - var coldPlugVFIO hv.PCIePort + var coldPlugVFIO config.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") imagePath := path.Join(dir, "image") @@ -619,7 +619,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { enableIOThreads := true hotplugVFIOOnRootBus := true pcieRootPort := uint32(2) - coldPlugVFIO = 
hv.RootPort + coldPlugVFIO = config.RootPort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" defer func() { diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 43974adf6..62a8d866f 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -511,11 +511,11 @@ type HypervisorConfig struct { // HotplugVFIO is used to indicate if devices need to be hotplugged on the // root port or a switch - HotPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port - ColdPlugVFIO hv.PCIePort + ColdPlugVFIO config.PCIePort // PCIeRootPort is used to indicate the number of PCIe Root Port devices // The PCIe Root Port device is used to hot-plug the PCIe device diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index 3653674ca..c7fa059dc 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -1048,16 +1048,16 @@ func TestKataAgentKernelParams(t *testing.T) { {true, true, 0, []Param{debugParam, traceParam}}, // pipesize - {false, false, 0, []Param{containerPipeSizeParam}}, + {false, false, 2097152, []Param{containerPipeSizeParam}}, // Debug + pipesize - {true, false, 0, []Param{debugParam, containerPipeSizeParam}}, + {true, false, 2097152, []Param{debugParam, containerPipeSizeParam}}, // Tracing + pipesize - {false, true, 0, []Param{traceParam, containerPipeSizeParam}}, + {false, true, 2097152, []Param{traceParam, containerPipeSizeParam}}, // Debug + Tracing + pipesize - {true, true, 0, []Param{debugParam, traceParam, containerPipeSizeParam}}, + {true, true, 2097152, []Param{debugParam, traceParam, containerPipeSizeParam}}, } for i, d := range data { diff --git a/src/runtime/virtcontainers/nydusd_test.go 
b/src/runtime/virtcontainers/nydusd_test.go index 712113015..a8ec6dc9b 100644 --- a/src/runtime/virtcontainers/nydusd_test.go +++ b/src/runtime/virtcontainers/nydusd_test.go @@ -57,13 +57,13 @@ func TestNydusdStart(t *testing.T) { // nolint: govet tests := []struct { - fields fields name string + fields fields wantErr bool }{ - {name: "empty config", fields: fields{}, wantErr: true}, - {name: "directory source path not exist", fields: SourcePathNoExist, wantErr: true}, - {name: "valid config", fields: validConfig, wantErr: false}, + {"empty config", fields{}, true}, + {"directory source path not exist", SourcePathNoExist, true}, + {"valid config", validConfig, false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 8cd752093..aeb347127 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -7,7 +7,7 @@ package persistapi import ( - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/opencontainers/runc/libcontainer/configs" specs "github.com/opencontainers/runtime-spec/specs-go" ) @@ -205,11 +205,11 @@ type HypervisorConfig struct { // HotPlugVFIO is used to indicate if devices need to be hotplugged on the // root, switch, bridge or no-port - HotPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root, bridge, switch or no-port - ColdPlugVFIO hv.PCIePort + ColdPlugVFIO config.PCIePort // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 514cc9ee3..acd5fcc7e 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go 
@@ -76,7 +76,7 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { UUID string - HotPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort Bridges []types.Bridge HotpluggedVCPUs []hv.CPUDevice HotpluggedMemory int @@ -84,8 +84,8 @@ type QemuState struct { PCIeRootPort int PCIeSwitchPort int HotplugVFIOOnRootBus bool - HotplugVFIO hv.PCIePort - ColdPlugVFIO hv.PCIePort + HotplugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. @@ -748,7 +748,7 @@ func (q *qemu) checkBpfEnabled() { // only 16 ports possible. func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error { // If no-port set just return no need to add PCIe Root Port or PCIe Switches - if hypervisorConfig.HotPlugVFIO == hv.NoPort && hypervisorConfig.ColdPlugVFIO == hv.NoPort { + if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort { return nil } // Add PCIe Root Port or PCIe Switches to the hypervisor @@ -787,11 +787,14 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if drivers.IsPCIeDevice(vfioDevice.BDF) { numOfPluggablePorts = numOfPluggablePorts + 1 } - // Reset the Rank, the vfio module is going to assign the correct one - vfioDevice.Rank = -1 + // Reset the Rank and AllPCIeDevsthe vfio module is going + // to assign the correct one in the case of hot-plug + if hypervisorConfig.HotPlugVFIO != config.NoPort { + vfioDevice.Rank = -1 + drivers.AllPCIeDevs = map[string]bool{} + } } } - drivers.AllPCIeDevs = make(map[string]bool) // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices @@ -815,10 +818,10 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig numOfPluggablePorts = hypervisorConfig.PCIeSwitchPort } - if q.state.HotPlugVFIO == hv.RootPort || 
q.state.ColdPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + if q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) } - if q.state.HotPlugVFIO == hv.SwitchPort || q.state.ColdPlugVFIO == hv.SwitchPort { + if q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort { qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) } return nil @@ -1629,7 +1632,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeID := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) + bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(drivers.AllPCIeDevs)) drivers.AllPCIeDevs[devID] = true bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) @@ -1824,7 +1827,6 @@ func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFI q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) } - device.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, device.Rank) return q.executeVFIODeviceAdd(device) } @@ -1837,7 +1839,6 @@ func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.V q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) } - device.Bus = fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, device.Rank) return q.executeVFIODeviceAdd(device) } @@ -1904,9 +1905,9 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus // for pc machine type instead of bridge. This is useful for devices that require // a large PCI BAR which is a currently a limitation with PCI bridges. 
- if q.state.HotPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { err = q.hotplugVFIODeviceRootPort(ctx, device) - } else if q.state.HotPlugVFIO == hv.SwitchPort { + } else if q.state.HotPlugVFIO == config.SwitchPort { err = q.hotplugVFIODeviceSwitchPort(ctx, device) } else { err = q.hotplugVFIODeviceBridgePort(ctx, device) @@ -2632,7 +2633,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin for i := uint32(0); i < number; i++ { devices = append(devices, govmmQemu.PCIeRootPortDevice{ - ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i), + ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i), Bus: bus, Chassis: chassis, Slot: strconv.FormatUint(uint64(i), 10), @@ -2679,7 +2680,7 @@ func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, mach // Using an own ID for the root port, so we do not clash with already // existing root ports adding "s" for switch prefix pcieRootPort := govmmQemu.PCIeRootPortDevice{ - ID: fmt.Sprintf("%s%s%d", pcieSwitchPrefix, pcieRootPortPrefix, 0), + ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0), Bus: defaultBridgeBus, Chassis: "0", Slot: strconv.FormatUint(uint64(0), 10), @@ -2692,7 +2693,7 @@ func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, mach devices = append(devices, pcieRootPort) pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{ - ID: fmt.Sprintf("%s%d", pcieSwitchUpstreamPortPrefix, 0), + ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0), Bus: pcieRootPort.ID, } devices = append(devices, pcieSwitchUpstreamPort) @@ -2706,7 +2707,7 @@ func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, mach for i := uint32(0); i < number; i++ { pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{ - ID: fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, i), + ID: 
fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i), Bus: pcieSwitchUpstreamPort.ID, Chassis: fmt.Sprintf("%d", nextChassis), Slot: strconv.FormatUint(uint64(i), 10), diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 9a288d85a..36a23371f 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -180,18 +180,14 @@ type qemuArchBase struct { } const ( - defaultCores uint32 = 1 - defaultThreads uint32 = 1 - defaultCPUModel = "host" - defaultBridgeBus = "pcie.0" - defaultPCBridgeBus = "pci.0" - maxDevIDSize = 31 - maxPCIeRootPort = 16 // Limitation from QEMU - maxPCIeSwitchPort = 16 // Limitation from QEMU - pcieRootPortPrefix = "rp" - pcieSwitchPrefix = "sw" - pcieSwitchUpstreamPortPrefix = "swup" - pcieSwitchDownstreamPortPrefix = "swdp" + defaultCores uint32 = 1 + defaultThreads uint32 = 1 + defaultCPUModel = "host" + defaultBridgeBus = "pcie.0" + defaultPCBridgeBus = "pci.0" + maxDevIDSize = 31 + maxPCIeRootPort = 16 // Limitation from QEMU + maxPCIeSwitchPort = 16 // Limitation from QEMU ) // This is the PCI start address assigned to the first bridge that diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index a5e9ca4a5..9fe88b28c 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -32,7 +32,6 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol" exp 
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" @@ -616,10 +615,10 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. - coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort) + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) // Aggregate all the containner devices for hot-plug and use them to dedcue // the correct amount of ports to reserve for the hypervisor. - hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != hv.NoPort) + hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) var vfioHotPlugDevices []config.DeviceInfo var vfioColdPlugDevices []config.DeviceInfo @@ -629,9 +628,11 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor isVFIO := deviceManager.IsVFIO(device.ContainerPath) if hotPlugVFIO && isVFIO { vfioHotPlugDevices = append(vfioHotPlugDevices, device) + sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO } if coldPlugVFIO && isVFIO { device.ColdPlug = true + device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO vfioColdPlugDevices = append(vfioColdPlugDevices, device) // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent @@ -639,6 +640,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor infos := sandboxConfig.Containers[cnt].DeviceInfos infos = append(infos[:dev], infos[dev+1:]...) 
sandboxConfig.Containers[cnt].DeviceInfos = infos + } } } @@ -2015,7 +2017,9 @@ func (s *Sandbox) AppendDevice(ctx context.Context, device api.Device) error { return s.hypervisor.AddDevice(ctx, device.GetDeviceInfo().(*config.VhostUserDeviceAttrs), VhostuserDev) case config.DeviceVFIO: vfioDevs := device.GetDeviceInfo().([]*config.VFIODev) + s.Logger().Info("### vfioDevs: ", vfioDevs) for _, d := range vfioDevs { + s.Logger().Info("### vfioDev: ", d) return s.hypervisor.AddDevice(ctx, *d, VfioDev) } default: From b5c4677e0ec1c39c6a04a74a9bc2f0daa8779db6 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 9 May 2023 08:46:30 +0000 Subject: [PATCH 09/87] vfio: Rearrange the bus assignment Refactor the bus assignment so that the call to GetAllVFIODevicesFromIOMMUGroup can be used by any module without affecting the topology. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/config/config.go | 3 +++ src/runtime/pkg/device/drivers/utils.go | 12 +++--------- src/runtime/pkg/device/drivers/vfio.go | 7 +++++++ src/runtime/pkg/govmm/qemu/qmp.go | 2 -- src/runtime/pkg/hypervisors/hypervisor_state.go | 1 - src/runtime/virtcontainers/qemu.go | 8 -------- src/runtime/virtcontainers/sandbox.go | 3 --- 7 files changed, 13 insertions(+), 23 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index 2431fa36f..f6ac00557 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -370,6 +370,9 @@ type VFIODev struct { // Rank identifies a device in a IOMMU group Rank int + + // Port is the PCIe port type to which the device is attached + Port PCIePort } // RNGDev represents a random number generator device diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index ec1609fe9..ddb1a0ded 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -194,24 +194,18 @@ func 
GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - isPCIe := IsPCIeDevice(deviceBDF) // Do not directly assign to `vfio` -- need to access field still - vfioPCI := config.VFIODev{ + vfio = config.VFIODev{ ID: id, Type: vfioDeviceType, BDF: deviceBDF, SysfsDev: deviceSysfsDev, - IsPCIe: isPCIe, + IsPCIe: IsPCIeDevice(deviceBDF), Class: pciClass, Rank: -1, - } - if isPCIe { - vfioPCI.Rank = len(AllPCIeDevs) - AllPCIeDevs[deviceBDF] = true - vfioPCI.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[device.Port], vfioPCI.Rank) + Port: device.Port, } - vfio = vfioPCI case config.VFIOAPDeviceMediatedType: devices, err := GetAPVFIODevices(deviceSysfsDev) if err != nil { diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index ad6ec5cc3..8f63b2fc6 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -76,6 +76,13 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece if err != nil { return err } + for _, vfio := range device.VfioDevs { + if vfio.IsPCIe { + vfio.Rank = len(AllPCIeDevs) + AllPCIeDevs[vfio.BDF] = true + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], vfio.Rank) + } + } coldPlug := device.DeviceInfo.ColdPlug deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device") diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 5630ff9de..6b020103d 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -656,7 +656,6 @@ func (q *QMP) executeCommand(ctx context.Context, name string, args map[string]i filter *qmpEventFilter) error { _, err := q.executeCommandWithResponse(ctx, name, args, nil, filter) - return err } @@ -1192,7 +1191,6 @@ func (q *QMP) ExecutePCIVFIODeviceAdd(ctx context.Context, devID, bdf, addr, bus if 
bus != "" { args["bus"] = bus } - return q.executeCommand(ctx, "device_add", args, nil) } diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index 20d6c7d65..db7e47c72 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -39,7 +39,6 @@ type HypervisorState struct { // Belows are qemu specific // Refs: virtcontainers/qemu.go:QemuState Bridges []Bridge - // HotpluggedCPUs is the list of CPUs that were hot-added HotpluggedVCPUs []CPUDevice diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index acd5fcc7e..c1cde33c5 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -704,14 +704,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi return err } } - if machine.Type == QemuQ35 { if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { q.Logger().WithError(err).Errorf("Cannot create PCIe topology") return err } } - q.qemuConfig = qemuConfig q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) @@ -787,12 +785,6 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if drivers.IsPCIeDevice(vfioDevice.BDF) { numOfPluggablePorts = numOfPluggablePorts + 1 } - // Reset the Rank and AllPCIeDevsthe vfio module is going - // to assign the correct one in the case of hot-plug - if hypervisorConfig.HotPlugVFIO != config.NoPort { - vfioDevice.Rank = -1 - drivers.AllPCIeDevs = map[string]bool{} - } } } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9fe88b28c..9c227ad56 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -640,7 +640,6 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor infos := sandboxConfig.Containers[cnt].DeviceInfos infos = 
append(infos[:dev], infos[dev+1:]...) sandboxConfig.Containers[cnt].DeviceInfos = infos - } } } @@ -2017,9 +2016,7 @@ func (s *Sandbox) AppendDevice(ctx context.Context, device api.Device) error { return s.hypervisor.AddDevice(ctx, device.GetDeviceInfo().(*config.VhostUserDeviceAttrs), VhostuserDev) case config.DeviceVFIO: vfioDevs := device.GetDeviceInfo().([]*config.VFIODev) - s.Logger().Info("### vfioDevs: ", vfioDevs) for _, d := range vfioDevs { - s.Logger().Info("### vfioDev: ", d) return s.hypervisor.AddDevice(ctx, *d, VfioDev) } default: From 8f0d4e2612b6d06e5a5805b2d61efab88ae2d0b6 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Tue, 9 May 2023 19:18:06 +0000 Subject: [PATCH 10/87] vfio: Cleanup of Cold and Hot Plug Removed the configuration of PCIeRootPort and PCIeSwitchPort, those values can be deduced in createPCIeTopology Signed-off-by: Zvonko Kaiser --- src/runtime/cmd/kata-runtime/kata-env.go | 4 -- src/runtime/cmd/kata-runtime/kata-env_test.go | 6 --- .../pkg/containerd-shim-v2/create_test.go | 4 -- src/runtime/pkg/govmm/qemu/qemu.go | 2 +- .../pkg/hypervisors/hypervisor_state.go | 2 - src/runtime/pkg/katatestutils/utils.go | 4 -- .../pkg/katautils/config-settings.go.in | 2 - src/runtime/pkg/katautils/config.go | 11 ++-- src/runtime/pkg/katautils/config_test.go | 14 ----- src/runtime/pkg/oci/utils.go | 6 --- .../documentation/api/1.0/api.md | 8 --- src/runtime/virtcontainers/hypervisor.go | 8 --- src/runtime/virtcontainers/persist.go | 4 -- .../virtcontainers/persist/api/config.go | 8 --- .../pkg/annotations/annotations.go | 4 -- src/runtime/virtcontainers/qemu.go | 52 +++---------------- src/runtime/virtcontainers/qemu_amd64.go | 1 + 17 files changed, 13 insertions(+), 127 deletions(-) diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index d87e426bf..f74cb9f89 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -113,8 +113,6 @@ type HypervisorInfo 
struct { SocketPath string Msize9p uint32 MemorySlots uint32 - PCIeRootPort uint32 - PCIeSwitchPort uint32 HotPlugVFIO config.PCIePort ColdPlugVFIO config.PCIePort HotplugVFIOOnRootBus bool @@ -322,8 +320,6 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - PCIeRootPort: config.HypervisorConfig.PCIeRootPort, - PCIeSwitchPort: config.HypervisorConfig.PCIeSwitchPort, SocketPath: socketPath, }, nil } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 4de2fb09e..c8d4d0ea9 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -88,8 +88,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti blockStorageDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) - pcieSwitchPort := uint32(2) hotPlugVFIO = config.BridgePort coldPlugVFIO = config.NoPort disableNewNetNs := false @@ -137,8 +135,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, - PCIeRootPort: pcieRootPort, - PCIeSwitchPort: pcieSwitchPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs, @@ -282,8 +278,6 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - PCIeRootPort: config.HypervisorConfig.PCIeRootPort, - PCIeSwitchPort: config.HypervisorConfig.PCIeSwitchPort, HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, } diff 
--git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index f95b501b3..c24e3ced3 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -331,8 +331,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, blockDeviceDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) - pcieSwitchPort := uint32(3) disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -352,8 +350,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, BlockDeviceDriver: blockDeviceDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, - PCIeSwitchPort: pcieSwitchPort, DisableNewNetNs: disableNewNetNs, SharedFS: sharedFS, VirtioFSDaemon: virtioFSdaemon, diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 7b4b9037f..453012169 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -244,6 +244,7 @@ const ( // SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object SecExecGuest ObjectType = "s390-pv-guest" + // PEFGuest represent ppc64le PEF(Protected Execution Facility) object. 
PEFGuest ObjectType = "pef-guest" ) @@ -377,7 +378,6 @@ func (object Object) QemuParams(config *Config) []string { deviceParams = append(deviceParams, string(object.Driver)) deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID)) deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File)) - } if len(deviceParams) > 0 { diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index db7e47c72..f0ba941de 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -45,8 +45,6 @@ type HypervisorState struct { HotpluggedMemory int VirtiofsDaemonPid int Pid int - PCIeRootPort int - PCIeSwitchPort int HotPlugVFIO config.PCIePort ColdPlugVFIO config.PCIePort HotplugVFIOOnRootBus bool diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index bd76ae9a8..ec1d85c3a 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -224,8 +224,6 @@ type RuntimeConfigOptions struct { JaegerUser string JaegerPassword string PFlash []string - PCIeRootPort uint32 - PCIeSwitchPort uint32 HotPlugVFIO config.PCIePort ColdPlugVFIO config.PCIePort DefaultVCPUCount uint32 @@ -320,8 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + ` enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` - pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + ` - pcie_switch_port = ` + strconv.FormatUint(uint64(config.PCIeSwitchPort), 10) + ` cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `" msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` diff --git 
a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index f3ce570aa..d662f910f 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -82,8 +82,6 @@ const defaultEnableDebug bool = false const defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 const defaultHotplugVFIOOnRootBus bool = false -const defaultPCIeRootPort = 0 -const defaultPCIeSwitchPort = 0 const defaultEntropySource = "/dev/urandom" const defaultGuestHookPath string = "" const defaultVirtioFSCacheMode = "never" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index d34ede93b..297c7ca56 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -130,8 +130,6 @@ type hypervisor struct { MemSlots uint32 `toml:"memory_slots"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` - PCIeSwitchPort uint32 `toml:"pcie_switch_port"` - PCIeRootPort uint32 `toml:"pcie_root_port"` NumVCPUs int32 `toml:"default_vcpus"` BlockDeviceCacheSet bool `toml:"block_device_cache_set"` BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` @@ -872,8 +870,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), - PCIeRootPort: h.PCIeRootPort, - PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, VhostUserStorePath: h.vhostUserStorePath(), @@ -1069,8 +1065,6 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), HotPlugVFIO: h.hotPlugVFIO(), - PCIeRootPort: h.PCIeRootPort, - PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: true, GuestHookPath: h.guestHookPath(), VirtioFSExtraArgs: 
h.VirtioFSExtraArgs, @@ -1302,8 +1296,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, ColdPlugVFIO: defaultColdPlugVFIO, HotPlugVFIO: defaultHotPlugVFIO, - PCIeSwitchPort: defaultPCIeSwitchPort, - PCIeRootPort: defaultPCIeRootPort, GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect, @@ -1698,6 +1690,9 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT if coldPlug != config.NoPort && hotPlug != config.NoPort { return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug) } + if coldPlug == config.NoPort && hotPlug == config.NoPort { + return nil + } var port config.PCIePort if coldPlug != config.NoPort { port = coldPlug diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 2f062bd08..9ebdc7d57 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -86,8 +86,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime blockDeviceAIO := "io_uring" enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) - pcieSwitchPort := uint32(3) hotPlugVFIO = config.BridgePort coldPlugVFIO = config.RootPort disableNewNetNs := false @@ -111,8 +109,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime BlockDeviceAIO: blockDeviceAIO, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, - PCIeSwitchPort: pcieSwitchPort, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, @@ -177,8 +173,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime DefaultBridges: defaultBridgesCount, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, 
- PCIeRootPort: pcieRootPort, - PCIeSwitchPort: pcieSwitchPort, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, Msize9p: defaultMsize9p, @@ -618,7 +612,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) coldPlugVFIO = config.RootPort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" @@ -638,7 +631,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, ColdPlugVFIO: coldPlugVFIO, RxRateLimiterMaxRate: rxRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate, @@ -694,10 +686,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus) } - if config.PCIeRootPort != pcieRootPort { - t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort) - } - if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate { t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate) } @@ -820,7 +808,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) hypervisor := hypervisor{ Path: hypervisorPath, @@ -831,7 +818,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, } _, err := newQemuHypervisorConfig(hypervisor) diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index ff5c1912b..bcbf30f8f 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -508,12 +508,6 @@ func 
addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } - if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) { - config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort) - }); err != nil { - return err - } - if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok { if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) { return fmt.Errorf("entropy source %v required from annotation is not valid", value) diff --git a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md index a26e653ce..ca5cb4a1a 100644 --- a/src/runtime/virtcontainers/documentation/api/1.0/api.md +++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md @@ -288,14 +288,6 @@ type HypervisorConfig struct { // root bus instead of a bridge. HotplugVFIOOnRootBus bool - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot(cold)-plug the PCIe device - PCIeRootPort uint32 - - // PCIeSwitchPort is used to indicate the number of PCIe Switch Ports - // The PCIe Switch port is used to hot(cold)-plug the PCIe device - PCIeSwitchPort uint32 - // HotPlugVFIO is used to indicate if devices need to be hotplugged on the // root port, switch, bridge or no port HotPlugVFIO hv.PCIePort diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 62a8d866f..bb9c86b21 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -517,14 +517,6 @@ type HypervisorConfig struct { // root port, switch or no port ColdPlugVFIO config.PCIePort - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 - - // PCIeSwitchPort is used to indicate the number of PCIe Switch devices - // The PCIe 
Switch Port device is sued to hot-plug PCIe devices - PCIeSwitchPort uint32 - // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 02cdc8387..91ab51ebf 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -245,8 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks, DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, - PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort, - PCIeSwitchPort: sconfig.HypervisorConfig.PCIeSwitchPort, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, @@ -490,8 +488,6 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, HotPlugVFIO: hconf.HotPlugVFIO, ColdPlugVFIO: hconf.ColdPlugVFIO, - PCIeRootPort: hconf.PCIeRootPort, - PCIeSwitchPort: hconf.PCIeSwitchPort, BootToBeTemplate: hconf.BootToBeTemplate, BootFromTemplate: hconf.BootFromTemplate, DisableVhostNet: hconf.DisableVhostNet, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index aeb347127..6ca5ee690 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -131,14 +131,6 @@ type HypervisorConfig struct { // Enable SGX. Hardware-based isolation and memory encryption. SGXEPCSize int64 - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 - - // PCIeSwitchPort is used to indicate the number of PCIe Switch Downstream Port - // devices. 
The PCIe Switch Downstream Port is used to hot-plug the PCIe devices. - PCIeSwitchPort uint32 - // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 3584ccd70..238ab2890 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -126,10 +126,6 @@ const ( // root bus instead of a bridge. HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port" - // EntropySource is a sandbox annotation to specify the path to a host source of // entropy (/dev/random, /dev/urandom or real hardware RNG device) EntropySource = kataAnnotHypervisorPrefix + "entropy_source" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index c1cde33c5..685cc791b 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -81,8 +81,6 @@ type QemuState struct { HotpluggedVCPUs []hv.CPUDevice HotpluggedMemory int VirtiofsDaemonPid int - PCIeRootPort int - PCIeSwitchPort int HotplugVFIOOnRootBus bool HotplugVFIO config.PCIePort ColdPlugVFIO config.PCIePort @@ -287,8 +285,6 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus - q.state.PCIeRootPort = int(q.config.PCIeRootPort) - q.state.PCIeSwitchPort = int(q.config.PCIeSwitchPort) q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, @@ -704,6 +700,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, 
network Network, hypervi return err } } + if machine.Type == QemuQ35 { if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { q.Logger().WithError(err).Errorf("Cannot create PCIe topology") @@ -792,24 +789,13 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig // use up all slots or IO memory on the root bus and vfio-XXX-pci devices // cannot be added which are crucial for Kata max slots on root bus is 32 // max slots on the complete pci(e) topology is 256 in QEMU - if hypervisorConfig.PCIeRootPort > maxPCIeRootPort { + if numOfPluggablePorts > maxPCIeRootPort { return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) } - if hypervisorConfig.PCIeSwitchPort > maxPCIeSwitchPort { + if numOfPluggablePorts > maxPCIeSwitchPort { return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeRootPort) } - // If the user provided more root ports than we have detected - // use the user provided number of PCIe root ports - if numOfPluggablePorts < hypervisorConfig.PCIeRootPort { - numOfPluggablePorts = hypervisorConfig.PCIeRootPort - } - // If the user provided more switch ports than we have detected - // use the user provided number of PCIe root ports - if numOfPluggablePorts < hypervisorConfig.PCIeSwitchPort { - numOfPluggablePorts = hypervisorConfig.PCIeSwitchPort - } - if q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) } @@ -1616,8 +1602,8 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V switch machineType { case QemuVirt: - if q.state.PCIeRootPort <= 0 { - return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. 
Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt") + if q.state.ColdPlugVFIO.String() != "true" { + return fmt.Errorf("TODO: Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt") } //The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0 @@ -1811,26 +1797,10 @@ func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { } func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { - - if device.IsPCIe && (q.state.PCIeRootPort <= 0) { - q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device." + - "It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") - return fmt.Errorf("VFIO device is a PCIe device. Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", - q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) - } - return q.executeVFIODeviceAdd(device) } func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) { - - if device.IsPCIe && (q.state.PCIeSwitchPort <= 0) { - q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device." + - "It's recommended to add the PCIe Switch Port by setting the pcie_switch_port parameter in the configuration for q35") - return fmt.Errorf("VFIO device is a PCIe device. 
Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", - q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) - } - return q.executeVFIODeviceAdd(device) } @@ -1882,11 +1852,9 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op if op == AddDevice { buf, _ := json.Marshal(device) q.Logger().WithFields(logrus.Fields{ - "machine-type": q.HypervisorConfig().HypervisorMachineType, - "hot-plug-vfio": q.state.HotPlugVFIO, - "pcie-root-port": q.state.PCIeRootPort, - "pcie-switch-port": q.state.PCIeSwitchPort, - "device-info": string(buf), + "machine-type": q.HypervisorConfig().HypervisorMachineType, + "hot-plug-vfio": q.state.HotPlugVFIO, + "device-info": string(buf), }).Info("Start hot-plug VFIO device") // In case MachineType is q35, a PCIe device is hotplugged on // a PCIe Root Port or alternatively on a PCIe Switch Port @@ -2887,8 +2855,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.UUID = q.state.UUID s.HotpluggedMemory = q.state.HotpluggedMemory s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus - s.PCIeRootPort = q.state.PCIeRootPort - s.PCIeSwitchPort = q.state.PCIeSwitchPort for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2912,8 +2878,6 @@ func (q *qemu) Load(s hv.HypervisorState) { q.state.HotpluggedMemory = s.HotpluggedMemory q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid - q.state.PCIeRootPort = s.PCIeRootPort - q.state.PCIeSwitchPort = s.PCIeSwitchPort for _, bridge := range s.Bridges { q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index cb99ec1ac..7efb5c753 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -305,6 +305,7 @@ func (q *qemuAmd64) 
appendProtectionDevice(devices []govmmQemu.Device, firmware, ReducedPhysBits: 1, }), "", nil case noneProtection: + return devices, firmware, nil default: From 40101ea7dbf957f81640aa7c48d3c6c858c0fe19 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Wed, 10 May 2023 07:04:36 +0000 Subject: [PATCH 11/87] vfio: Added annotation for hot(cold) plug Now it is possible to configure the PCIe topology via annotations and added a simple test, checking for Invalid and RootPort Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/config/config.go | 32 +++++++ .../pkg/katautils/config-settings.go.in | 2 - src/runtime/pkg/oci/utils.go | 31 +++++++ src/runtime/pkg/oci/utils_test.go | 88 ++++++++++--------- .../pkg/annotations/annotations.go | 6 ++ src/runtime/virtcontainers/qemu.go | 7 +- src/runtime/virtcontainers/sandbox.go | 12 +-- 7 files changed, 126 insertions(+), 52 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index f6ac00557..dcb0e93b8 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -162,6 +162,8 @@ const ( BridgePort = "bridge-port" // NoPort is for disabling VFIO hotplug/coldplug NoPort = "no-port" + // InvalidPort is for invalid port + InvalidPort = "invalid-port" ) func (p PCIePort) String() string { @@ -173,6 +175,8 @@ func (p PCIePort) String() string { case BridgePort: fallthrough case NoPort: + fallthrough + case InvalidPort: return string(p) } return fmt.Sprintf("", string(p)) } @@ -184,6 +188,34 @@ var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{ BridgePort: PCIBridgePortPrefix, } +func (p PCIePort) InValid() bool { + switch p { + case RootPort: + fallthrough + case SwitchPort: + fallthrough + case BridgePort: + fallthrough + case NoPort: + return false + } + return true +} + +func (p PCIePort) Valid() bool { + switch p { + case RootPort: + fallthrough + case SwitchPort: + fallthrough + case BridgePort: + fallthrough + case NoPort: 
+ return true + } + return false +} + // DeviceInfo is an embedded type that contains device data common to all types of devices. type DeviceInfo struct { // DriverOptions is specific options for each device driver diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index d662f910f..1a4b7da04 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -109,5 +109,3 @@ var defaultRuntimeConfiguration = "@CONFIG_PATH@" const defaultHotPlugVFIO = config.NoPort const defaultColdPlugVFIO = config.NoPort - - diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index bcbf30f8f..e53fc8e2c 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -453,6 +453,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } + if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil { + return err + } + if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok { if value != "" { config.HypervisorConfig.HypervisorMachineType = value @@ -570,6 +574,33 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru return nil } +func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) { + if value == "" { + return config.NoPort, nil + } + port := config.PCIePort(value) + if port.InValid() { + return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value) + } + return port, nil +} + +func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error { + + var err error + if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok { + if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil { + return err + } + } + if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok { + if 
sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil { + return err + } + } + return nil +} + func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error { if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 7b2c10f2f..deb62cc14 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) { func TestAddHypervisorAnnotations(t *testing.T) { assert := assert.New(t) - config := vc.SandboxConfig{ + sbConfig := vc.SandboxConfig{ Annotations: make(map[string]string), } @@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"} ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on" - addHypervisorConfigOverrides(ocispec, &config, runtimeConfig) - assert.Exactly(expectedHyperConfig, config.HypervisorConfig) + addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig) + assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1" ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1" @@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" - ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2" + ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = string(config.InvalidPort) + ocispec.Annotations[vcAnnotations.HotPlugVFIO] = string(config.RootPort) ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true" ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi" 
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true" @@ -668,55 +669,56 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000" ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000" - addAnnotations(ocispec, &config, runtimeConfig) - assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1)) - assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1)) - assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024)) - assert.Equal(config.HypervisorConfig.MemSlots, uint32(20)) - assert.Equal(config.HypervisorConfig.MemOffset, uint64(512)) - assert.Equal(config.HypervisorConfig.VirtioMem, true) - assert.Equal(config.HypervisorConfig.MemPrealloc, true) - assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm") - assert.Equal(config.HypervisorConfig.HugePages, true) - assert.Equal(config.HypervisorConfig.IOMMU, true) - assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi") - assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring") - assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true) - assert.Equal(config.HypervisorConfig.EnableIOThreads, true) - assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true) - assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true) - assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true) - assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs") - assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false") - assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto") - assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"}) - assert.Equal(config.HypervisorConfig.Msize9p, uint32(512)) - assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35") - assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw") - assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off") - 
assert.Equal(config.HypervisorConfig.DisableVhostNet, true) - assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/") - assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true) - assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true) - assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2)) - assert.Equal(config.HypervisorConfig.IOMMUPlatform, true) - assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864)) - assert.Equal(config.HypervisorConfig.LegacySerial, true) - assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000)) - assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000)) + addAnnotations(ocispec, &sbConfig, runtimeConfig) + assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1)) + assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1)) + assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024)) + assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20)) + assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512)) + assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true) + assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true) + assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm") + assert.Equal(sbConfig.HypervisorConfig.HugePages, true) + assert.Equal(sbConfig.HypervisorConfig.IOMMU, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi") + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring") + assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true) + assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true) + assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs") + assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false") + 
assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto") + assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"}) + assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512)) + assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35") + assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw") + assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off") + assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true) + assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/") + assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true) + assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true) + assert.Equal(sbConfig.HypervisorConfig.ColdPlugVFIO, config.InvalidPort) + assert.Equal(sbConfig.HypervisorConfig.HotPlugVFIO, config.RootPort) + assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true) + assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864)) + assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true) + assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000)) + assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000)) // In case an absurd large value is provided, the config value if not over-ridden ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536" - err := addAnnotations(ocispec, &config, runtimeConfig) + err := addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1" - err = addAnnotations(ocispec, &config, runtimeConfig) + err = addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1" ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1" - err = addAnnotations(ocispec, &config, runtimeConfig) + err = addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1" diff --git 
a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 238ab2890..bad1f66ee 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -126,6 +126,12 @@ const ( // root bus instead of a bridge. HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" + // ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged. + ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio" + + // HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged. + HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio" + // EntropySource is a sandbox annotation to specify the path to a host source of // entropy (/dev/random, /dev/urandom or real hardware RNG device) EntropySource = kataAnnotHypervisorPrefix + "entropy_source" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 685cc791b..284ce2d79 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -746,6 +746,8 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort { return nil } + + q.Logger().Info("### PCIe Topology ###") // Add PCIe Root Port or PCIe Switches to the hypervisor // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged // into a PCIe Root Port or PCIe Switch. 
@@ -778,12 +780,15 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if err != nil { return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) } + q.Logger().Info("### PCIe Topology devices ", devicesPerIOMMUGroup) for _, vfioDevice := range devicesPerIOMMUGroup { + q.Logger().Info("### PCIe Topology vfioDevice ", vfioDevice) if drivers.IsPCIeDevice(vfioDevice.BDF) { numOfPluggablePorts = numOfPluggablePorts + 1 } } } + q.Logger().Info("### PCIe Topology numOfPluggablePorts ", numOfPluggablePorts) // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices @@ -2642,7 +2647,7 @@ func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, mach pcieRootPort := govmmQemu.PCIeRootPortDevice{ ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0), Bus: defaultBridgeBus, - Chassis: "0", + Chassis: "1", Slot: strconv.FormatUint(uint64(0), 10), Multifunction: false, Addr: "0", diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9c227ad56..5244fdf00 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -620,20 +620,20 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // the correct amount of ports to reserve for the hypervisor. 
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) - var vfioHotPlugDevices []config.DeviceInfo - var vfioColdPlugDevices []config.DeviceInfo + var vfioDevices []config.DeviceInfo for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos { isVFIO := deviceManager.IsVFIO(device.ContainerPath) if hotPlugVFIO && isVFIO { - vfioHotPlugDevices = append(vfioHotPlugDevices, device) + vfioDevices = append(vfioDevices, device) sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO } if coldPlugVFIO && isVFIO { + s.Logger().Info("### coldplug and vfio ", device, "coldplug ", sandboxConfig.HypervisorConfig.ColdPlugVFIO) device.ColdPlug = true device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO - vfioColdPlugDevices = append(vfioColdPlugDevices, device) + vfioDevices = append(vfioDevices, device) // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. 
@@ -643,7 +643,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } } } - sandboxConfig.HypervisorConfig.VFIODevices = vfioHotPlugDevices + sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { @@ -658,7 +658,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } - for _, dev := range vfioColdPlugDevices { + for _, dev := range vfioDevices { _, err := s.AddDevice(ctx, dev) if err != nil { s.Logger().WithError(err).Debug("Cannot cold-plug add device") From b11246c3aab45b4bb37a8c8acdf7e0a3612a212d Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 12 May 2023 09:58:54 +0000 Subject: [PATCH 12/87] gpu: Various fixes for virt machine type The PCI qom path was not deduced correctly added regex for correct path walking. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/vfio.go | 10 ++++++--- src/runtime/virtcontainers/kata_agent.go | 5 ++++- src/runtime/virtcontainers/mount.go | 26 ++++++++++++++++-------- src/runtime/virtcontainers/qemu.go | 19 ++++++++--------- src/runtime/virtcontainers/sandbox.go | 13 ++++++++---- 5 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 8f63b2fc6..5e5f2a10b 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -214,14 +214,18 @@ func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS switch vfioDeviceType { case config.VFIOPCIDeviceNormalType: // Get bdf of device eg. 0000:00:1c.0 - deviceBDF = getBDF(deviceFileName) + //deviceBDF = getBDF(deviceFileName) + // The old implementation did not consider the case where + // vfio devices are located on differente root busses. 
The + // kata-agent will handle the case now, here use the full PCI addr + deviceBDF = deviceFileName // Get sysfs path used by cloud-hypervisor deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName) case config.VFIOPCIDeviceMediatedType: // Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4 sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName) deviceSysfsDev, err = GetSysfsDev(sysfsDevStr) - deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev)) + deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev)) case config.VFIOAPDeviceMediatedType: sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName) deviceSysfsDev, err = GetSysfsDev(sysfsDevStr) @@ -244,7 +248,7 @@ func getMediatedBDF(deviceSysfsDev string) string { // getBDF returns the BDF of pci device // Expected input string format is []:[][].[] eg. 0000:02:10.0 -func getBDF(deviceSysStr string) string { +func GetBDF(deviceSysStr string) string { tokens := strings.SplitN(deviceSysStr, ":", 2) if len(tokens) == 1 { return "" diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index a350c1a48..a3e5f5d42 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -21,6 +21,7 @@ import ( "github.com/docker/go-units" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/api" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/uuid" @@ -1152,7 +1153,9 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * kataDevice.Type = kataVfioApDevType kataDevice.Options = dev.APDevices } else { - 
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", dev.BDF, dev.GuestPciPath) + + devBDF := drivers.GetBDF(dev.BDF) + kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath) } } diff --git a/src/runtime/virtcontainers/mount.go b/src/runtime/virtcontainers/mount.go index acf4f05f6..6c2e20420 100644 --- a/src/runtime/virtcontainers/mount.go +++ b/src/runtime/virtcontainers/mount.go @@ -239,32 +239,40 @@ func evalMountPath(source, destination string) (string, string, error) { // Mount describes a container mount. // nolint: govet type Mount struct { - // FSGroup a group ID that the group ownership of the files for the mounted volume - // will need to be changed when set. - FSGroup *int // Source is the source of the mount. Source string // Destination is the destination of the mount (within the container). Destination string + + // Type specifies the type of filesystem to mount. + Type string + // HostPath used to store host side bind mount path HostPath string + // GuestDeviceMount represents the path within the VM that the device // is mounted. Only relevant for block devices. This is tracked in the event // runtime wants to query the agent for mount stats. GuestDeviceMount string + // BlockDeviceID represents block device that is attached to the // VM in case this mount is a block device file or a directory // backed by a block device. BlockDeviceID string - // Type specifies the type of filesystem to mount. - Type string + + // Options list all the mount options of the filesystem. + Options []string + + // ReadOnly specifies if the mount should be read only or not + ReadOnly bool + + // FSGroup a group ID that the group ownership of the files for the mounted volume + // will need to be changed when set. + FSGroup *int + // FSGroupChangePolicy specifies the policy that will be used when applying // group id ownership change for a volume. FSGroupChangePolicy volume.FSGroupChangePolicy - // Options list all the mount options of the filesystem. 
- Options []string - // ReadOnly specifies if the mount should be read only or not - ReadOnly bool } func isSymlink(path string) bool { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 284ce2d79..79dfb286e 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -701,7 +701,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi } } - if machine.Type == QemuQ35 { + if machine.Type == QemuQ35 || machine.Type == QemuVirt { if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { q.Logger().WithError(err).Errorf("Cannot create PCIe topology") return err @@ -747,7 +747,6 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig return nil } - q.Logger().Info("### PCIe Topology ###") // Add PCIe Root Port or PCIe Switches to the hypervisor // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged // into a PCIe Root Port or PCIe Switch. 
@@ -780,16 +779,12 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if err != nil { return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) } - q.Logger().Info("### PCIe Topology devices ", devicesPerIOMMUGroup) for _, vfioDevice := range devicesPerIOMMUGroup { - q.Logger().Info("### PCIe Topology vfioDevice ", vfioDevice) if drivers.IsPCIeDevice(vfioDevice.BDF) { numOfPluggablePorts = numOfPluggablePorts + 1 } } } - q.Logger().Info("### PCIe Topology numOfPluggablePorts ", numOfPluggablePorts) - // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices // cannot be added which are crucial for Kata max slots on root bus is 32 @@ -798,7 +793,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) } if numOfPluggablePorts > maxPCIeSwitchPort { - return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeRootPort) + return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort) } if q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { @@ -1757,6 +1752,8 @@ func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { } slots = append(slots, devSlot) + r, _ := regexp.Compile(`^/machine/.*/pcie.0`) + var parentPath = qemuID // We do not want to use a forever loop here, a deeper PCIe topology // than 5 is already not advisable just for the sake of having enough @@ -1775,7 +1772,7 @@ func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { // If we hit /machine/q35/pcie.0 we're done this is the root bus // we climbed the complete hierarchy - if strings.Contains(busQOM, "/machine/q35/pcie.0") { + if r.Match([]byte(busQOM)) { break } @@ -1863,7 +1860,7 @@ func (q 
*qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op }).Info("Start hot-plug VFIO device") // In case MachineType is q35, a PCIe device is hotplugged on // a PCIe Root Port or alternatively on a PCIe Switch Port - if q.HypervisorConfig().HypervisorMachineType != QemuQ35 { + if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt { device.Bus = "" } else { var err error @@ -2636,9 +2633,9 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin // genericAppendPCIeSwitch adds a PCIe Swtich func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { - // Q35 has the correct PCIe support, + // Q35, Virt have the correct PCIe support, // hence ignore all other machines - if machineType != QemuQ35 { + if machineType != QemuQ35 && machineType != QemuVirt { return devices } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 5244fdf00..ed03c092d 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -630,18 +630,23 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO } if coldPlugVFIO && isVFIO { - s.Logger().Info("### coldplug and vfio ", device, "coldplug ", sandboxConfig.HypervisorConfig.ColdPlugVFIO) device.ColdPlug = true device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO vfioDevices = append(vfioDevices, device) // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. - infos := sandboxConfig.Containers[cnt].DeviceInfos - infos = append(infos[:dev], infos[dev+1:]...) 
- sandboxConfig.Containers[cnt].DeviceInfos = infos + sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" } } + var filteredDevices []config.DeviceInfo + for _, device := range containers.DeviceInfos { + if device.ID != "remove-we-are-cold-plugging" { + filteredDevices = append(filteredDevices, device) + } + } + sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices + } sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices From fbacc09646aa3668164a3744c1097629491d0e63 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 15 Jun 2023 16:26:20 +0000 Subject: [PATCH 13/87] gpu: PCIe topology, consider vhost-user-block in Virt In Virt the vhost-user-block is a PCIe device so we need to make sure to consider it as well. We're keeping track of vhost-user-block devices and deduce the correct amount of PCIe root ports. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/config/config.go | 9 ++++ src/runtime/pkg/device/drivers/vfio.go | 21 +++----- src/runtime/pkg/device/manager/manager.go | 8 ++- src/runtime/pkg/device/manager/utils.go | 2 +- src/runtime/pkg/device/manager/utils_test.go | 2 +- src/runtime/virtcontainers/hypervisor.go | 6 ++- src/runtime/virtcontainers/qemu.go | 57 +++++++++++++------- src/runtime/virtcontainers/qemu_test.go | 4 -- src/runtime/virtcontainers/sandbox.go | 10 ++++ 9 files changed, 78 insertions(+), 41 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index dcb0e93b8..c65e86123 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -216,6 +216,15 @@ func (p PCIePort) Valid() bool { return false } +type PCIePortMapping map[string]bool + +var ( + // Each of these structures keeps track of the devices attached to the + // different types of PCI ports. We can deduce the Bus number from it + // and eliminate duplicates being assigned.
+ PCIeDevices = map[PCIePort]PCIePortMapping{} +) + // DeviceInfo is an embedded type that contains device data common to all types of devices. type DeviceInfo struct { // DriverOptions is specific options for each device driver diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 5e5f2a10b..a9875f7e7 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -31,13 +31,6 @@ const ( vfioAPSysfsDir = "/sys/devices/vfio_ap" ) -var ( - // AllPCIeDevs deduces the correct bus number. The BDF keeps track that - // we're not accounting for the very same device, if a user provides the - // devices multiple times. - AllPCIeDevs = map[string]bool{} -) - // VFIODevice is a vfio device meant to be passed to the hypervisor // to be used by the Virtual Machine. type VFIODevice struct { @@ -78,9 +71,11 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } for _, vfio := range device.VfioDevs { if vfio.IsPCIe { - vfio.Rank = len(AllPCIeDevs) - AllPCIeDevs[vfio.BDF] = true - vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], vfio.Rank) + //vfio.Rank = len(AllPCIeDevs) + //AllPCIeDevs[vfio.BDF] = true + busIndex := len(config.PCIeDevices[vfio.Port]) + vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) + config.PCIeDevices[vfio.Port][vfio.BDF] = true } } @@ -214,10 +209,10 @@ func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS switch vfioDeviceType { case config.VFIOPCIDeviceNormalType: // Get bdf of device eg. 0000:00:1c.0 - //deviceBDF = getBDF(deviceFileName) + // OLD IMPL: deviceBDF = getBDF(deviceFileName) // The old implementation did not consider the case where - // vfio devices are located on differente root busses. The - // kata-agent will handle the case now, here use the full PCI addr + // vfio devices are located on different root busses. 
The + // kata-agent will handle the case now, here, use the full PCI addr deviceBDF = deviceFileName // Get sysfs path used by cloud-hypervisor deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName) diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index baf1209a7..735061d9e 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS dm.blockDriver = config.VirtioSCSI } - drivers.AllPCIeDevs = make(map[string]bool) + config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping) + + config.PCIeDevices[config.RootPort] = make(map[string]bool) + config.PCIeDevices[config.SwitchPort] = make(map[string]bool) + config.PCIeDevices[config.BridgePort] = make(map[string]bool) for _, dev := range devices { dm.devices[dev.DeviceID()] = dev @@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device } if IsVFIO(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil - } else if isVhostUserBlk(devInfo) { + } else if IsVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { devInfo.DriverOptions = make(map[string]string) } diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go index e78205d0c..a9e4ee8c6 100644 --- a/src/runtime/pkg/device/manager/utils.go +++ b/src/runtime/pkg/device/manager/utils.go @@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool { } // isVhostUserBlk checks if the device is a VhostUserBlk device. 
-func isVhostUserBlk(devInfo config.DeviceInfo) bool { +func IsVhostUserBlk(devInfo config.DeviceInfo) bool { return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor } diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go index b57992b3d..6752719dd 100644 --- a/src/runtime/pkg/device/manager/utils_test.go +++ b/src/runtime/pkg/device/manager/utils_test.go @@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) { } for _, d := range data { - isVhostUserBlk := isVhostUserBlk( + isVhostUserBlk := IsVhostUserBlk( config.DeviceInfo{ DevType: d.devType, Major: d.major, diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index bb9c86b21..43a631c1d 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -505,9 +505,13 @@ type HypervisorConfig struct { // MemOffset specifies memory space for nvdimm device MemOffset uint64 - // RawDevics are used to get PCIe device info early before the sandbox + // VFIODevices are used to get PCIe device info early before the sandbox // is started to make better PCIe topology decisions VFIODevices []config.DeviceInfo + // VhostUserBlkDevices are handled differently in Q35 and Virt machine + // type. 
capture them early before the sandbox to make better PCIe topology + // decisions + VhostUserBlkDevices []config.DeviceInfo // HotplugVFIO is used to indicate if devices need to be hotplugged on the // root port or a switch diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 79dfb286e..ad94923ff 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -702,7 +702,7 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi } if machine.Type == QemuQ35 || machine.Type == QemuVirt { - if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { + if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil { q.Logger().WithError(err).Errorf("Cannot create PCIe topology") return err } @@ -741,9 +741,10 @@ func (q *qemu) checkBpfEnabled() { // Max PCIe switch ports is 16 // There is only 64kB of IO memory each root,switch port will consume 4k hence // only 16 ports possible. 
-func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error { +func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error { + // If no-port set just return no need to add PCIe Root Port or PCIe Switches - if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort { + if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 { return nil } @@ -767,7 +768,8 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } - // Get the number of hot(cold)-pluggable ports needed from the provided devices + // Get the number of hot(cold)-pluggable ports needed from the provided + // VFIO devices and VhostUserBlockDevices var numOfPluggablePorts uint32 = 0 for _, dev := range hypervisorConfig.VFIODevices { var err error @@ -785,22 +787,42 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig } } } + vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus) + vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) + + numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices) + // If number of PCIe root ports > 16 then bail out otherwise we may // use up all slots or IO memory on the root bus and vfio-XXX-pci devices // cannot be added which are crucial for Kata max slots on root bus is 32 // max slots on the complete pci(e) topology is 256 in QEMU - if numOfPluggablePorts > maxPCIeRootPort { - return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) - } - if numOfPluggablePorts > maxPCIeSwitchPort { - return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", 
maxPCIeSwitchPort) - } - - if q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { + if vfioOnRootPort { + // On Arm the vhost-user-block device is a PCIe device we need + // to account for it in the number of pluggable ports + if machineType == QemuVirt { + numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices) + } + if numOfPluggablePorts > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) + } qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) + return nil } - if q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort { + if vfioOnSwitchPort { + // On Arm the vhost-user-block device is a PCIe device we need + // to account for it in the number of pluggable ports + if machineType == QemuVirt { + numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices) + if numOfPluggableRootPorts > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) + } + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit) + } + if numOfPluggablePorts > maxPCIeSwitchPort { + return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort) + } qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) + return nil } return nil } @@ -1585,6 +1607,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri } func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) { + err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime) if err != nil { 
return err @@ -1602,16 +1625,12 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V switch machineType { case QemuVirt: - if q.state.ColdPlugVFIO.String() != "true" { - return fmt.Errorf("TODO: Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt") - } - //The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0 //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(drivers.AllPCIeDevs)) - drivers.AllPCIeDevs[devID] = true + bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort])) + config.PCIeDevices[config.RootPort][devID] = true bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) bridgeSlot, err := q.qomGetSlot(bridgeQomPath) diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index bfa348145..418075e26 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) { config6 := newQemuConfig() config6.DisableGuestSeLinux = false - config7 := newQemuConfig() - config7.PCIeRootPort = 1 - config8 := newQemuConfig() config8.EnableVhostUserStore = true config8.HugePages = true @@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) { {config3, false, true}, {config5, false, true}, {config6, false, false}, - {config7, false, true}, {config8, false, true}, {config9, true, false}, {config10, false, true}, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index ed03c092d..9949656a6 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -621,9 +621,17 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, 
factory Factor hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) var vfioDevices []config.DeviceInfo + // vhost-user-block device is a PCIe device in Virt, keep track of it + // for correct number of PCIe root ports. + var vhostUserBlkDevices []config.DeviceInfo for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos { + + if deviceManager.IsVhostUserBlk(device) { + vhostUserBlkDevices = append(vhostUserBlkDevices, device) + continue + } isVFIO := deviceManager.IsVFIO(device.ContainerPath) if hotPlugVFIO && isVFIO { vfioDevices = append(vfioDevices, device) @@ -649,6 +657,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor } sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices + sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { @@ -1930,6 +1939,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy return err case config.VhostUserBlk: vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice) + if !ok { return fmt.Errorf("device type mismatch, expect device type to be %s", devType) } From 72f2cb84e61f9b544d295beeb6f9b76f6019a411 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Thu, 15 Jun 2023 17:37:01 +0000 Subject: [PATCH 14/87] gpu: Reset cold or hot plug after overriding If we override the cold, hot plug with an annotation we need to reset the other plugging mechanism to NoPort otherwise both will be enabled. 
Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/vfio.go | 2 -- src/runtime/pkg/oci/utils.go | 4 ++++ src/runtime/virtcontainers/qemu.go | 8 +++++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index a9875f7e7..801b1a81f 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -71,8 +71,6 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece } for _, vfio := range device.VfioDevs { if vfio.IsPCIe { - //vfio.Rank = len(AllPCIeDevs) - //AllPCIeDevs[vfio.BDF] = true busIndex := len(config.PCIeDevices[vfio.Port]) vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex) config.PCIeDevices[vfio.Port][vfio.BDF] = true diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index e53fc8e2c..21e027160 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -592,11 +592,15 @@ func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.Sand if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil { return err } + // If hot-plug is specified disable cold-plug and vice versa + sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort } if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok { if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil { return err } + // If cold-plug is specified disable hot-plug and vice versa + sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort } return nil } diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index ad94923ff..80a05b3cf 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -65,6 +65,11 @@ const romFile = "" // Default value is false. 
const defaultDisableModern = false +// A deeper PCIe topology than 5 is already not advisable just for the sake +// of having enough buffer we limit ourselves to 10 and exit if we reach +// the root bus +const maxPCIeTopoDepth = 10 + type qmpChannel struct { qmp *govmmQemu.QMP ctx context.Context @@ -1771,6 +1776,7 @@ func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { } slots = append(slots, devSlot) + // This only works for Q35 and Virt r, _ := regexp.Compile(`^/machine/.*/pcie.0`) var parentPath = qemuID @@ -1778,7 +1784,7 @@ func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { // than 5 is already not advisable just for the sake of having enough // buffer we limit ourselves to 10 and leave the loop early if we hit // the root bus. - for i := 1; i <= 10; i++ { + for i := 1; i <= maxPCIeTopoDepth; i++ { parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") if err != nil { return types.PciPath{}, err From b535c7cbd8fb7def4a198c8e679bc8443b1567d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Tue, 30 May 2023 11:01:28 -0700 Subject: [PATCH 15/87] tests: Enable running k8s tests on Mariner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes the gate and lets CI run tests on Mariner. 
Fixes: #6840 Signed-off-by: Aurélien Bombo --- tests/integration/kubernetes/run_kubernetes_tests.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh index 0975ec0d5..db1e16633 100644 --- a/tests/integration/kubernetes/run_kubernetes_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_tests.sh @@ -54,10 +54,6 @@ else ) fi -if [ ${KATA_HOST_OS} == "cbl-mariner" ]; then - exit 0 -fi - # we may need to skip a few test cases when running on non-x86_64 arch arch_config_file="${kubernetes_dir}/filter_out_per_arch/${TARGET_ARCH}.yaml" if [ -f "${arch_config_file}" ]; then From 8330fb8ee7e17b8ad4598e29d44358f115b265cc Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 16 Jun 2023 08:01:10 +0000 Subject: [PATCH 16/87] gpu: Update unit tests Some tests are now failing due to the changes in how PCIe is handled. Update the tests accordingly. Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/drivers/vfio_test.go | 2 +- src/runtime/pkg/katautils/config_test.go | 6 +++--- src/runtime/pkg/oci/utils_test.go | 14 ++++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/runtime/pkg/device/drivers/vfio_test.go b/src/runtime/pkg/device/drivers/vfio_test.go index 2ded0f850..9a03fa030 100644 --- a/src/runtime/pkg/device/drivers/vfio_test.go +++ b/src/runtime/pkg/device/drivers/vfio_test.go @@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) { } data := []testData{ - {"0000:02:10.0", "02:10.0"}, + {"0000:02:10.0", "0000:02:10.0"}, {"0000:0210.0", ""}, {"f79944e4-5a3d-11e8-99ce-", ""}, {"f79944e4-5a3d-11e8-99ce", ""}, diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 9ebdc7d57..059c1d69e 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -86,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime
blockDeviceAIO := "io_uring" enableIOThreads := true hotplugVFIOOnRootBus := true - hotPlugVFIO = config.BridgePort - coldPlugVFIO = config.RootPort + hotPlugVFIO = config.NoPort + coldPlugVFIO = config.BridgePort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -612,7 +612,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true - coldPlugVFIO = config.RootPort + coldPlugVFIO = config.BridgePort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" defer func() { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index deb62cc14..d54e7092e 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -660,8 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" - ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = string(config.InvalidPort) - ocispec.Annotations[vcAnnotations.HotPlugVFIO] = string(config.RootPort) + ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort + ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true" ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi" ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true" @@ -669,7 +669,9 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000" ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000" - addAnnotations(ocispec, &sbConfig, runtimeConfig) + err := addAnnotations(ocispec, &sbConfig, runtimeConfig) + assert.NoError(err) + assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1)) assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, 
uint32(1)) assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024)) @@ -699,8 +701,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/") assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true) assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true) - assert.Equal(sbConfig.HypervisorConfig.ColdPlugVFIO, config.InvalidPort) - assert.Equal(sbConfig.HypervisorConfig.HotPlugVFIO, config.RootPort) + assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort)) + assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort)) assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true) assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864)) assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true) @@ -709,7 +711,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { // In case an absurd large value is provided, the config value if not over-ridden ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536" - err := addAnnotations(ocispec, &sbConfig, runtimeConfig) + err = addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1" From beb70636830ec56dd6c4d56b0558ac0ba44bba30 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 28 Jun 2023 16:09:19 +0000 Subject: [PATCH 17/87] metrics: Uniformity across function names This PR adds the word function before the function names in order to have uniformity across the script as some are using this and some are not. 
Fixes #7196 Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index cdff6c766..e92a8c698 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -13,7 +13,7 @@ kata_tarball_dir=${2:-kata-artifacts} metrics_dir="$(dirname "$(readlink -f "$0")")" source "${metrics_dir}/../common.bash" -create_symbolic_links() { +function create_symbolic_links() { hypervisor="${1:-qemu}" local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml" local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml" @@ -26,7 +26,7 @@ create_symbolic_links() { } # Configures containerd -overwrite_containerd_config() { +function overwrite_containerd_config() { containerd_config="/etc/containerd/config.toml" sudo rm "${containerd_config}" sudo tee "${containerd_config}" << EOF @@ -44,7 +44,7 @@ version = 2 EOF } -install_kata() { +function install_kata() { local kata_tarball="kata-static.tar.xz" declare -r katadir="/opt/kata" declare -r destdir="/" @@ -66,7 +66,7 @@ install_kata() { restart_containerd_service } -check_containerd_config_for_kata() { +function check_containerd_config_for_kata() { # check containerd config declare -r line1="default_runtime_name = \"kata\"" declare -r line2="runtime_type = \"io.containerd.kata.v2\"" From f680fc52bed87a3dfdb31c857108199fae9a8f82 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Wed, 12 Oct 2022 22:03:11 +0100 Subject: [PATCH 18/87] agent: change `AGENT_CONFIG`'s lazy type to just `AgentConfig` Since it is never modified, it doesn't really need a lock of any kind. Removing the `RwLock` wrapper allows us to remove all `.read().await` calls when accessing it. Additionally, `AGENT_CONFIG` already has a static lifetime, so there is no need to wrap it in a ref-counted heap allocation. 
Fixes: #5409 Signed-off-by: Wedson Almeida Filho --- src/agent/src/main.rs | 15 +++++++-------- src/agent/src/rpc.rs | 10 +++------- src/agent/src/uevent.rs | 2 +- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs index f82904804..58c406fe1 100644 --- a/src/agent/src/main.rs +++ b/src/agent/src/main.rs @@ -65,7 +65,7 @@ use tokio::{ io::AsyncWrite, sync::{ watch::{channel, Receiver}, - Mutex, RwLock, + Mutex, }, task::JoinHandle, }; @@ -83,12 +83,11 @@ cfg_if! { const NAME: &str = "kata-agent"; lazy_static! { - static ref AGENT_CONFIG: Arc> = Arc::new(RwLock::new( + static ref AGENT_CONFIG: AgentConfig = // Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig // clap::Parser::parse() greedily process all command line input including cargo test parameters, // so should only be used inside main. - AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap() - )); + AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap(); } #[derive(Parser)] @@ -181,13 +180,13 @@ async fn real_main() -> std::result::Result<(), Box> { lazy_static::initialize(&AGENT_CONFIG); - init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?; + init_agent_as_init(&logger, AGENT_CONFIG.unified_cgroup_hierarchy)?; drop(logger_async_guard); } else { lazy_static::initialize(&AGENT_CONFIG); } - let config = AGENT_CONFIG.read().await; + let config = &AGENT_CONFIG; let log_vport = config.log_vport as u32; let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone())); @@ -200,7 +199,7 @@ async fn real_main() -> std::result::Result<(), Box> { let (logger, logger_async_guard) = logging::create_logger(NAME, "agent", config.log_level, writer); - announce(&logger, &config); + announce(&logger, config); // This variable is required as it enables the global (and crucially static) logger, // which is required to satisfy the the 
lifetime constraints of the auto-generated gRPC code. @@ -228,7 +227,7 @@ async fn real_main() -> std::result::Result<(), Box> { let span_guard = root_span.enter(); // Start the sandbox and wait for its ttRPC server to end - start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?; + start_sandbox(&logger, config, init_mode, &mut tasks, shutdown_rx.clone()).await?; // Install a NOP logger for the remainder of the shutdown sequence // to ensure any log calls made by local crates using the scope logger diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index dede3204c..ed7729855 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -126,11 +126,7 @@ macro_rules! ttrpc_error { macro_rules! is_allowed { ($req:ident) => { - if !AGENT_CONFIG - .read() - .await - .is_allowed_endpoint($req.descriptor_dyn().name()) - { + if !AGENT_CONFIG.is_allowed_endpoint($req.descriptor_dyn().name()) { return Err(ttrpc_error!( ttrpc::Code::UNIMPLEMENTED, format!("{} is blocked", $req.descriptor_dyn().name()), @@ -240,7 +236,7 @@ impl AgentService { let mut ctr: LinuxContainer = LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl!())?; - let pipe_size = AGENT_CONFIG.read().await.container_pipe_size; + let pipe_size = AGENT_CONFIG.container_pipe_size; let p = if let Some(p) = oci.process { Process::new(&sl!(), &p, cid.as_str(), true, pipe_size)? 
@@ -374,7 +370,7 @@ impl AgentService { // Apply any necessary corrections for PCI addresses update_env_pci(&mut process.Env, &sandbox.pcimap)?; - let pipe_size = AGENT_CONFIG.read().await.container_pipe_size; + let pipe_size = AGENT_CONFIG.container_pipe_size; let ocip = rustjail::process_grpc_to_oci(&process); let p = Process::new(&sl!(), &ocip, exec_id.as_str(), false, pipe_size)?; diff --git a/src/agent/src/uevent.rs b/src/agent/src/uevent.rs index 5d1f55494..cabdd6235 100644 --- a/src/agent/src/uevent.rs +++ b/src/agent/src/uevent.rs @@ -141,7 +141,7 @@ pub async fn wait_for_uevent( info!(sl!(), "{}: waiting on channel", logprefix); - let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout; + let hotplug_timeout = AGENT_CONFIG.hotplug_timeout; let uev = match tokio::time::timeout(hotplug_timeout, rx).await { Ok(v) => v?, From 0e5d6ce6d7eb3cbff0c449e74710905ab30e935f Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Wed, 28 Jun 2023 13:04:31 -0300 Subject: [PATCH 19/87] agent: convert the `is_allowed` macro to a function Having a function allows for better error messages from the type checker and it makes it clearer to callers what can happen. For example: is_allowed!(req); Gives no indication that it may result in an early return, and no simple way for callers to modify the behaviour. It also makes it look like ownership of `req` is being transferred. On the other hand, is_allowed(&req)?; Indicates that `req` is being borrowed (immutably) and may fail. The question mark indicates that the caller wants an early return on failure. Fixes: #7201 Signed-off-by: Wedson Almeida Filho --- src/agent/src/rpc.rs | 90 ++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index ed7729855..e9081c8df 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -124,15 +124,15 @@ macro_rules! ttrpc_error { }; } -macro_rules! 
is_allowed { - ($req:ident) => { - if !AGENT_CONFIG.is_allowed_endpoint($req.descriptor_dyn().name()) { - return Err(ttrpc_error!( - ttrpc::Code::UNIMPLEMENTED, - format!("{} is blocked", $req.descriptor_dyn().name()), - )); - } - }; +fn is_allowed(req: &impl MessageDyn) -> ttrpc::Result<()> { + if !AGENT_CONFIG.is_allowed_endpoint(req.descriptor_dyn().name()) { + Err(ttrpc_error!( + ttrpc::Code::UNIMPLEMENTED, + format!("{} is blocked", req.descriptor_dyn().name()), + )) + } else { + Ok(()) + } } #[derive(Clone, Debug)] @@ -650,7 +650,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CreateContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "create_container", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_create_container(req).await { Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), @@ -663,7 +663,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::StartContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "start_container", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_start_container(req).await { Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), @@ -676,7 +676,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::RemoveContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "remove_container", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_remove_container(req).await { Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), @@ -690,7 +690,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ExecProcessRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "exec_process", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_exec_process(req).await { Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), @@ -703,7 +703,7 @@ impl agent_ttrpc::AgentService for AgentService { req: 
protocols::agent::SignalProcessRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "signal_process", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_signal_process(req).await { Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), @@ -716,7 +716,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::WaitProcessRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "wait_process", req); - is_allowed!(req); + is_allowed(&req)?; self.do_wait_process(req) .await .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) @@ -728,7 +728,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id.clone(); let res = req.resources; @@ -764,7 +764,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::StatsContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "stats_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id; let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; @@ -786,7 +786,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::PauseContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "pause_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id(); let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; @@ -810,7 +810,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ResumeContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "resume_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id(); let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; @@ -834,7 +834,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::RemoveStaleVirtiofsShareMountsRequest, ) -> ttrpc::Result { 
trace_rpc_call!(ctx, "remove_stale_virtiofs_share_mounts", req); - is_allowed!(req); + is_allowed(&req)?; let mount_infos = parse_mount_table("/proc/self/mountinfo") .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; for m in &mount_infos { @@ -856,7 +856,7 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::WriteStreamRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; self.do_write_stream(req) .await .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) @@ -867,7 +867,7 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::ReadStreamRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; self.do_read_stream(req, true) .await .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) @@ -878,7 +878,7 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::ReadStreamRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; self.do_read_stream(req, false) .await .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) @@ -890,7 +890,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CloseStdinRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "close_stdin", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id.clone(); let eid = req.exec_id; @@ -917,7 +917,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::TtyWinResizeRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "tty_win_resize", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id.clone(); let eid = req.exec_id.clone(); @@ -959,7 +959,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateInterfaceRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_interface", req); - is_allowed!(req); + is_allowed(&req)?; let interface = req.interface.into_option().ok_or_else(|| { ttrpc_error!( @@ -987,7 +987,7 @@ impl 
agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateRoutesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_routes", req); - is_allowed!(req); + is_allowed(&req)?; let new_routes = req.routes.into_option().map(|r| r.Routes).ok_or_else(|| { ttrpc_error!( @@ -1024,7 +1024,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateEphemeralMountsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_mounts", req); - is_allowed!(req); + is_allowed(&req)?; match update_ephemeral_mounts(sl!(), req.storages.to_vec(), self.sandbox.clone()).await { Ok(_) => Ok(Empty::new()), @@ -1041,7 +1041,7 @@ impl agent_ttrpc::AgentService for AgentService { req: GetIPTablesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_iptables", req); - is_allowed!(req); + is_allowed(&req)?; info!(sl!(), "get_ip_tables: request received"); @@ -1080,7 +1080,7 @@ impl agent_ttrpc::AgentService for AgentService { req: SetIPTablesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "set_iptables", req); - is_allowed!(req); + is_allowed(&req)?; info!(sl!(), "set_ip_tables request received"); @@ -1195,7 +1195,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ListInterfacesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "list_interfaces", req); - is_allowed!(req); + is_allowed(&req)?; let list = self .sandbox @@ -1223,7 +1223,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ListRoutesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "list_routes", req); - is_allowed!(req); + is_allowed(&req)?; let list = self .sandbox @@ -1246,7 +1246,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CreateSandboxRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "create_sandbox", req); - is_allowed!(req); + is_allowed(&req)?; { let sandbox = self.sandbox.clone(); @@ -1311,7 +1311,7 @@ impl agent_ttrpc::AgentService for AgentService { req: 
protocols::agent::DestroySandboxRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "destroy_sandbox", req); - is_allowed!(req); + is_allowed(&req)?; let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; @@ -1346,7 +1346,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::AddARPNeighborsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "add_arp_neighbors", req); - is_allowed!(req); + is_allowed(&req)?; let neighs = req .neighbors @@ -1380,7 +1380,7 @@ impl agent_ttrpc::AgentService for AgentService { ctx: &TtrpcContext, req: protocols::agent::OnlineCPUMemRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; let s = Arc::clone(&self.sandbox); let sandbox = s.lock().await; trace_rpc_call!(ctx, "online_cpu_mem", req); @@ -1398,7 +1398,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ReseedRandomDevRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "reseed_random_dev", req); - is_allowed!(req); + is_allowed(&req)?; random::reseed_rng(req.data.as_slice()) .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; @@ -1412,7 +1412,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::GuestDetailsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_guest_details", req); - is_allowed!(req); + is_allowed(&req)?; info!(sl!(), "get guest details!"); let mut resp = GuestDetailsResponse::new(); @@ -1446,7 +1446,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::MemHotplugByProbeRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "mem_hotplug_by_probe", req); - is_allowed!(req); + is_allowed(&req)?; do_mem_hotplug_by_probe(&req.memHotplugProbeAddr) .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; @@ -1460,7 +1460,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::SetGuestDateTimeRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "set_guest_date_time", req); - is_allowed!(req); + is_allowed(&req)?; 
do_set_guest_date_time(req.Sec, req.Usec) .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; @@ -1474,7 +1474,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CopyFileRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "copy_file", req); - is_allowed!(req); + is_allowed(&req)?; do_copy_file(&req).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; @@ -1487,7 +1487,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::GetMetricsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_metrics", req); - is_allowed!(req); + is_allowed(&req)?; match get_metrics(&req) { Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), @@ -1504,7 +1504,7 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::GetOOMEventRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; let sandbox = self.sandbox.clone(); let s = sandbox.lock().await; let event_rx = &s.event_rx.clone(); @@ -1530,7 +1530,7 @@ impl agent_ttrpc::AgentService for AgentService { req: VolumeStatsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_volume_stats", req); - is_allowed!(req); + is_allowed(&req)?; info!(sl!(), "get volume stats!"); let mut resp = VolumeStatsResponse::new(); @@ -1571,7 +1571,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::AddSwapRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "add_swap", req); - is_allowed!(req); + is_allowed(&req)?; do_add_swap(&self.sandbox, &req) .await From 0860fbd410341ec82ad89a072091107700fb03fb Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Wed, 28 Jun 2023 13:21:00 -0300 Subject: [PATCH 20/87] agent: convert the `ttrpc_error` macro to a function There is nothing in it that requires it to be a macro. Converting it to a function allows for better error messages. 
Fixes: #7201 Signed-off-by: Wedson Almeida Filho --- src/agent/src/rpc.rs | 134 +++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 68 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index e9081c8df..ef2c0f9e7 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -117,16 +117,14 @@ macro_rules! sl { }; } -// Convenience macro to wrap an error and response to ttrpc client -macro_rules! ttrpc_error { - ($code:path, $err:expr $(,)?) => { - get_rpc_status($code, format!("{:?}", $err)) - }; +// Convenience function to wrap an error and response to ttrpc client +fn ttrpc_error(code: ttrpc::Code, err: impl std::fmt::Debug) -> ttrpc::Error { + get_rpc_status(code, format!("{:?}", err)) } fn is_allowed(req: &impl MessageDyn) -> ttrpc::Result<()> { if !AGENT_CONFIG.is_allowed_endpoint(req.descriptor_dyn().name()) { - Err(ttrpc_error!( + Err(ttrpc_error( ttrpc::Code::UNIMPLEMENTED, format!("{} is blocked", req.descriptor_dyn().name()), )) @@ -652,7 +650,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "create_container", req); is_allowed(&req)?; match self.do_create_container(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -665,7 +663,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "start_container", req); is_allowed(&req)?; match self.do_start_container(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -679,7 +677,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; match self.do_remove_container(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -692,7 +690,7 @@ impl agent_ttrpc::AgentService for AgentService { 
trace_rpc_call!(ctx, "exec_process", req); is_allowed(&req)?; match self.do_exec_process(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -705,7 +703,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "signal_process", req); is_allowed(&req)?; match self.do_signal_process(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -719,7 +717,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; self.do_wait_process(req) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn update_container( @@ -736,7 +734,7 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = s.lock().await; let ctr = sandbox.get_container(&cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) @@ -748,7 +746,7 @@ impl agent_ttrpc::AgentService for AgentService { let oci_res = rustjail::resources_grpc_to_oci(res); match ctr.set(oci_res) { Err(e) => { - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } Ok(_) => return Ok(resp), @@ -770,14 +768,14 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = s.lock().await; let ctr = sandbox.get_container(&cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) })?; ctr.stats() - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn pause_container( @@ -792,14 +790,14 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = s.lock().await; let ctr = sandbox.get_container(cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( 
ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) })?; ctr.pause() - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -816,14 +814,14 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = s.lock().await; let ctr = sandbox.get_container(cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) })?; ctr.resume() - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -836,7 +834,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "remove_stale_virtiofs_share_mounts", req); is_allowed(&req)?; let mount_infos = parse_mount_table("/proc/self/mountinfo") - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; for m in &mount_infos { if m.mount_point.starts_with(KATA_GUEST_SHARE_DIR) { // stat the mount point, virtiofs daemon will remove the stale cache and release the fds if the mount point doesn't exist any more. 
@@ -859,7 +857,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; self.do_write_stream(req) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn read_stdout( @@ -870,7 +868,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; self.do_read_stream(req, true) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn read_stderr( @@ -881,7 +879,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; self.do_read_stream(req, false) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn close_stdin( @@ -900,7 +898,7 @@ impl agent_ttrpc::AgentService for AgentService { let p = sandbox .find_container_process(cid.as_str(), eid.as_str()) .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, format!("invalid argument: {:?}", e), ) @@ -926,7 +924,7 @@ impl agent_ttrpc::AgentService for AgentService { let p = sandbox .find_container_process(cid.as_str(), eid.as_str()) .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::UNAVAILABLE, format!("invalid argument: {:?}", e), ) @@ -943,11 +941,11 @@ impl agent_ttrpc::AgentService for AgentService { let err = libc::ioctl(fd, TIOCSWINSZ, &win); Errno::result(err).map(drop).map_err(|e| { - ttrpc_error!(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e)) + ttrpc_error(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e)) })?; } } else { - return Err(ttrpc_error!(ttrpc::Code::UNAVAILABLE, "no tty".to_string())); + return Err(ttrpc_error(ttrpc::Code::UNAVAILABLE, "no tty".to_string())); } Ok(Empty::new()) @@ -962,7 +960,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; let interface = req.interface.into_option().ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "empty update 
interface request".to_string(), ) @@ -975,7 +973,7 @@ impl agent_ttrpc::AgentService for AgentService { .update_interface(&interface) .await .map_err(|e| { - ttrpc_error!(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e)) + ttrpc_error(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e)) })?; Ok(interface) @@ -990,7 +988,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; let new_routes = req.routes.into_option().map(|r| r.Routes).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "empty update routes request".to_string(), ) @@ -999,14 +997,14 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = self.sandbox.lock().await; sandbox.rtnl.update_routes(new_routes).await.map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to update routes: {:?}", e), ) })?; let list = sandbox.rtnl.list_routes().await.map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to list routes after update: {:?}", e), ) @@ -1028,7 +1026,7 @@ impl agent_ttrpc::AgentService for AgentService { match update_ephemeral_mounts(sl!(), req.storages.to_vec(), self.sandbox.clone()).await { Ok(_) => Ok(Empty::new()), - Err(e) => Err(ttrpc_error!( + Err(e) => Err(ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to update mounts: {:?}", e), )), @@ -1069,7 +1067,7 @@ impl agent_ttrpc::AgentService for AgentService { }), Err(e) => { warn!(sl!(), "failed to run {}: {:?}", cmd, e.kind()); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } } } @@ -1112,7 +1110,7 @@ impl agent_ttrpc::AgentService for AgentService { Ok(child) => child, Err(e) => { warn!(sl!(), "failure to spawn {}: {:?}", cmd, e.kind()); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; @@ -1120,9 +1118,9 @@ impl agent_ttrpc::AgentService for AgentService { Some(si) => si, None => { 
println!("failed to get stdin from child"); - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, - "failed to take stdin from child".to_string() + "failed to take stdin from child".to_string(), )); } }; @@ -1145,16 +1143,16 @@ impl agent_ttrpc::AgentService for AgentService { .await .is_err() { - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, - "timeout waiting for stdin writer to complete".to_string() + "timeout waiting for stdin writer to complete".to_string(), )); } if handle.await.is_err() { - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, - "stdin writer thread failure".to_string() + "stdin writer thread failure".to_string(), )); } @@ -1167,19 +1165,19 @@ impl agent_ttrpc::AgentService for AgentService { cmd, e.kind() ); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; if !output.status.success() { warn!(sl!(), "{} failed: {:?}", cmd, output.stderr); - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, format!( "{} failed: {:?}", cmd, String::from_utf8_lossy(&output.stderr) - ) + ), )); } @@ -1205,7 +1203,7 @@ impl agent_ttrpc::AgentService for AgentService { .list_interfaces() .await .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to list interfaces: {:?}", e), ) @@ -1232,7 +1230,7 @@ impl agent_ttrpc::AgentService for AgentService { .rtnl .list_routes() .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?; Ok(protocols::agent::Routes { Routes: list, @@ -1272,12 +1270,12 @@ impl agent_ttrpc::AgentService for AgentService { } for m in req.kernel_modules.iter() { - load_kernel_module(m).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + load_kernel_module(m).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; } 
s.setup_shared_namespaces() .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; } match add_storages(sl!(), req.storages.to_vec(), self.sandbox.clone(), None).await { @@ -1286,7 +1284,7 @@ impl agent_ttrpc::AgentService for AgentService { let mut s = sandbox.lock().await; s.mounts = m } - Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), }; match setup_guest_dns(sl!(), req.dns.to_vec()) { @@ -1299,7 +1297,7 @@ impl agent_ttrpc::AgentService for AgentService { .iter() .map(|dns| s.network.set_dns(dns.to_string())); } - Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), }; Ok(Empty::new()) @@ -1320,7 +1318,7 @@ impl agent_ttrpc::AgentService for AgentService { sandbox .destroy() .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; // Close get_oom_event connection, // otherwise it will block the shutdown of ttrpc. sandbox.event_tx.take(); @@ -1329,13 +1327,13 @@ impl agent_ttrpc::AgentService for AgentService { .sender .take() .ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, "failed to get sandbox sender channel".to_string(), ) })? 
.send(1) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1353,7 +1351,7 @@ impl agent_ttrpc::AgentService for AgentService { .into_option() .map(|n| n.ARPNeighbors) .ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "empty add arp neighbours request".to_string(), ) @@ -1366,7 +1364,7 @@ impl agent_ttrpc::AgentService for AgentService { .add_arp_neighbors(neighs) .await .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to add ARP neighbours: {:?}", e), ) @@ -1387,7 +1385,7 @@ impl agent_ttrpc::AgentService for AgentService { sandbox .online_cpu_memory(&req) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1401,7 +1399,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; random::reseed_rng(req.data.as_slice()) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1429,7 +1427,7 @@ impl agent_ttrpc::AgentService for AgentService { } Err(e) => { info!(sl!(), "fail to get memory info!"); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } } @@ -1449,7 +1447,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; do_mem_hotplug_by_probe(&req.memHotplugProbeAddr) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1463,7 +1461,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; do_set_guest_date_time(req.Sec, req.Usec) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1476,7 +1474,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "copy_file", req); 
is_allowed(&req)?; - do_copy_file(&req).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + do_copy_file(&req).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1490,7 +1488,7 @@ impl agent_ttrpc::AgentService for AgentService { is_allowed(&req)?; match get_metrics(&req) { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(s) => { let mut metrics = Metrics::new(); metrics.set_metrics(s); @@ -1521,7 +1519,7 @@ impl agent_ttrpc::AgentService for AgentService { return Ok(resp); } - Err(ttrpc_error!(ttrpc::Code::INTERNAL, "")) + Err(ttrpc_error(ttrpc::Code::INTERNAL, "")) } async fn get_volume_stats( @@ -1544,7 +1542,7 @@ impl agent_ttrpc::AgentService for AgentService { } Err(e) => { info!(sl!(), "failed to open the volume"); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; @@ -1553,12 +1551,12 @@ impl agent_ttrpc::AgentService for AgentService { // to get volume capacity stats get_volume_capacity_stats(&req.volume_guest_path) .map(|u| usage_vec.push(u)) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; // to get volume inode stats get_volume_inode_stats(&req.volume_guest_path) .map(|u| usage_vec.push(u)) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; resp.usage = usage_vec; resp.volume_condition = MessageField::some(condition); @@ -1575,7 +1573,7 @@ impl agent_ttrpc::AgentService for AgentService { do_add_swap(&self.sandbox, &req) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } From 0504bd725420c6a8a0769bea963758d366e0c0f2 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Wed, 28 Jun 2023 13:28:18 -0300 Subject: [PATCH 21/87] agent: convert the `sl` macros to functions There is nothing in 
them that requires them to be macros. Converting them to functions allows for better error messages. Fixes: #7201 Signed-off-by: Wedson Almeida Filho --- src/agent/rustjail/src/cgroups/fs/mod.rs | 46 ++++----- src/agent/rustjail/src/cgroups/notifier.rs | 30 +++--- src/agent/rustjail/src/container.rs | 12 +-- src/agent/src/device.rs | 16 ++- src/agent/src/metrics.rs | 28 +++-- src/agent/src/rpc.rs | 114 ++++++++++----------- src/agent/src/tracer.rs | 2 +- src/agent/src/uevent.rs | 16 ++- 8 files changed, 125 insertions(+), 139 deletions(-) diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs index fc023ac61..6145f5f9c 100644 --- a/src/agent/rustjail/src/cgroups/fs/mod.rs +++ b/src/agent/rustjail/src/cgroups/fs/mod.rs @@ -39,11 +39,9 @@ use std::path::Path; const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online"; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "cgroups")) - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "cgroups")) } macro_rules! get_controller_or_return_singular_none { @@ -82,7 +80,7 @@ impl CgroupManager for Manager { fn set(&self, r: &LinuxResources, update: bool) -> Result<()> { info!( - sl!(), + sl(), "cgroup manager set resources for container. 
Resources input {:?}", r ); @@ -120,7 +118,7 @@ impl CgroupManager for Manager { // set devices resources set_devices_resources(&self.cgroup, &r.devices, res); - info!(sl!(), "resources after processed {:?}", res); + info!(sl(), "resources after processed {:?}", res); // apply resources self.cgroup.apply(res)?; @@ -197,7 +195,7 @@ impl CgroupManager for Manager { if guest_cpuset.is_empty() { return Ok(()); } - info!(sl!(), "update_cpuset_path to: {}", guest_cpuset); + info!(sl(), "update_cpuset_path to: {}", guest_cpuset); let h = cgroups::hierarchies::auto(); let root_cg = h.root_control_group(); @@ -205,12 +203,12 @@ impl CgroupManager for Manager { let root_cpuset_controller: &CpuSetController = root_cg.controller_of().unwrap(); let path = root_cpuset_controller.path(); let root_path = Path::new(path); - info!(sl!(), "root cpuset path: {:?}", &path); + info!(sl(), "root cpuset path: {:?}", &path); let container_cpuset_controller: &CpuSetController = self.cgroup.controller_of().unwrap(); let path = container_cpuset_controller.path(); let container_path = Path::new(path); - info!(sl!(), "container cpuset path: {:?}", &path); + info!(sl(), "container cpuset path: {:?}", &path); let mut paths = vec![]; for ancestor in container_path.ancestors() { @@ -219,7 +217,7 @@ impl CgroupManager for Manager { } paths.push(ancestor); } - info!(sl!(), "parent paths to update cpuset: {:?}", &paths); + info!(sl(), "parent paths to update cpuset: {:?}", &paths); let mut i = paths.len(); loop { @@ -233,7 +231,7 @@ impl CgroupManager for Manager { .to_str() .unwrap() .trim_start_matches(root_path.to_str().unwrap()); - info!(sl!(), "updating cpuset for parent path {:?}", &r_path); + info!(sl(), "updating cpuset for parent path {:?}", &r_path); let cg = new_cgroup(cgroups::hierarchies::auto(), r_path)?; let cpuset_controller: &CpuSetController = cg.controller_of().unwrap(); cpuset_controller.set_cpus(guest_cpuset)?; @@ -241,7 +239,7 @@ impl CgroupManager for Manager { if 
!container_cpuset.is_empty() { info!( - sl!(), + sl(), "updating cpuset for container path: {:?} cpuset: {}", &container_path, container_cpuset @@ -276,7 +274,7 @@ fn set_network_resources( network: &LinuxNetwork, res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set network"); + info!(sl(), "cgroup manager set network"); // set classid // description can be found at https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/net_cls.html @@ -303,7 +301,7 @@ fn set_devices_resources( device_resources: &[LinuxDeviceCgroup], res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set devices"); + info!(sl(), "cgroup manager set devices"); let mut devices = vec![]; for d in device_resources.iter() { @@ -332,7 +330,7 @@ fn set_hugepages_resources( hugepage_limits: &[LinuxHugepageLimit], res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set hugepage"); + info!(sl(), "cgroup manager set hugepage"); let mut limits = vec![]; let hugetlb_controller = cg.controller_of::(); @@ -346,7 +344,7 @@ fn set_hugepages_resources( limits.push(hr); } else { warn!( - sl!(), + sl(), "{} page size support cannot be verified, dropping requested limit", l.page_size ); } @@ -359,7 +357,7 @@ fn set_block_io_resources( blkio: &LinuxBlockIo, res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set block io"); + info!(sl(), "cgroup manager set block io"); res.blkio.weight = blkio.weight; res.blkio.leaf_weight = blkio.leaf_weight; @@ -387,13 +385,13 @@ fn set_block_io_resources( } fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> { - info!(sl!(), "cgroup manager set cpu"); + info!(sl(), "cgroup manager set cpu"); let cpuset_controller: &CpuSetController = cg.controller_of().unwrap(); if !cpu.cpus.is_empty() { if let Err(e) = cpuset_controller.set_cpus(&cpu.cpus) { - warn!(sl!(), "write cpuset failed: {:?}", e); + warn!(sl(), "write cpuset failed: {:?}", e); } } @@ -424,7 +422,7 @@ fn set_cpu_resources(cg: &cgroups::Cgroup, 
cpu: &LinuxCpu) -> Result<()> { } fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool) -> Result<()> { - info!(sl!(), "cgroup manager set memory"); + info!(sl(), "cgroup manager set memory"); let mem_controller: &MemController = cg.controller_of().unwrap(); if !update { @@ -493,7 +491,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool } fn set_pids_resources(cg: &cgroups::Cgroup, pids: &LinuxPids) -> Result<()> { - info!(sl!(), "cgroup manager set pids"); + info!(sl(), "cgroup manager set pids"); let pid_controller: &PidController = cg.controller_of().unwrap(); let v = if pids.limit > 0 { MaxValue::Value(pids.limit) @@ -962,7 +960,7 @@ pub fn get_paths() -> Result> { for l in fs::read_to_string(PATHS)?.lines() { let fl: Vec<&str> = l.split(':').collect(); if fl.len() != 3 { - info!(sl!(), "Corrupted cgroup data!"); + info!(sl(), "Corrupted cgroup data!"); continue; } @@ -983,7 +981,7 @@ pub fn get_mounts(paths: &HashMap) -> Result = p[1].split(' ').collect(); if post.len() != 3 { - warn!(sl!(), "can't parse {} line {:?}", MOUNTS, l); + warn!(sl(), "can't parse {} line {:?}", MOUNTS, l); continue; } diff --git a/src/agent/rustjail/src/cgroups/notifier.rs b/src/agent/rustjail/src/cgroups/notifier.rs index 9f91b3584..5260a3d3f 100644 --- a/src/agent/rustjail/src/cgroups/notifier.rs +++ b/src/agent/rustjail/src/cgroups/notifier.rs @@ -16,11 +16,9 @@ use inotify::{Inotify, WatchMask}; use tokio::io::AsyncReadExt; use tokio::sync::mpsc::{channel, Receiver}; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "cgroups_notifier")) - }; +// Convenience function to obtain the scope logger. 
+fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "cgroups_notifier")) } pub async fn notify_oom(cid: &str, cg_dir: String) -> Result> { @@ -38,7 +36,7 @@ pub async fn notify_oom(cid: &str, cg_dir: String) -> Result> { fn get_value_from_cgroup(path: &Path, key: &str) -> Result { let content = fs::read_to_string(path)?; info!( - sl!(), + sl(), "get_value_from_cgroup file: {:?}, content: {}", &path, &content ); @@ -67,11 +65,11 @@ async fn register_memory_event_v2( let event_control_path = Path::new(&cg_dir).join(memory_event_name); let cgroup_event_control_path = Path::new(&cg_dir).join(cgroup_event_name); info!( - sl!(), + sl(), "register_memory_event_v2 event_control_path: {:?}", &event_control_path ); info!( - sl!(), + sl(), "register_memory_event_v2 cgroup_event_control_path: {:?}", &cgroup_event_control_path ); @@ -82,8 +80,8 @@ async fn register_memory_event_v2( // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?; - info!(sl!(), "ev_wd: {:?}", ev_wd); - info!(sl!(), "cg_wd: {:?}", cg_wd); + info!(sl(), "ev_wd: {:?}", ev_wd); + info!(sl(), "cg_wd: {:?}", cg_wd); let (sender, receiver) = channel(100); let containere_id = containere_id.to_string(); @@ -97,17 +95,17 @@ async fn register_memory_event_v2( while let Some(event_or_error) = stream.next().await { let event = event_or_error.unwrap(); info!( - sl!(), + sl(), "container[{}] get event for container: {:?}", &containere_id, &event ); // info!("is1: {}", event.wd == wd1); - info!(sl!(), "event.wd: {:?}", event.wd); + info!(sl(), "event.wd: {:?}", event.wd); if event.wd == ev_wd { let oom = get_value_from_cgroup(&event_control_path, "oom_kill"); if oom.unwrap_or(0) > 0 { let _ = sender.send(containere_id.clone()).await.map_err(|e| { - error!(sl!(), "send containere_id failed, error: {:?}", e); + error!(sl(), "send containere_id failed, error: {:?}", e); 
}); return; } @@ -171,13 +169,13 @@ async fn register_memory_event( let mut buf = [0u8; 8]; match eventfd_stream.read(&mut buf).await { Err(err) => { - warn!(sl!(), "failed to read from eventfd: {:?}", err); + warn!(sl(), "failed to read from eventfd: {:?}", err); return; } Ok(_) => { let content = fs::read_to_string(path.clone()); info!( - sl!(), + sl(), "cgroup event for container: {}, path: {:?}, content: {:?}", &containere_id, &path, @@ -193,7 +191,7 @@ async fn register_memory_event( } let _ = sender.send(containere_id.clone()).await.map_err(|e| { - error!(sl!(), "send containere_id failed, error: {:?}", e); + error!(sl(), "send containere_id failed, error: {:?}", e); }); } }); diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index b1d7499cd..2964ae377 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -1596,10 +1596,8 @@ mod tests { use tempfile::tempdir; use test_utils::skip_if_not_root; - macro_rules! 
sl { - () => { - slog_scope::logger() - }; + fn sl() -> slog::Logger { + slog_scope::logger() } #[test] @@ -1854,7 +1852,7 @@ mod tests { let _ = new_linux_container_and_then(|mut c: LinuxContainer| { c.processes.insert( 1, - Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap(), + Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap(), ); let p = c.get_process("123"); assert!(p.is_ok(), "Expecting Ok, Got {:?}", p); @@ -1881,7 +1879,7 @@ mod tests { let (c, _dir) = new_linux_container(); let ret = c .unwrap() - .start(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap()) + .start(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap()) .await; assert!(ret.is_err(), "Expecting Err, Got {:?}", ret); } @@ -1891,7 +1889,7 @@ mod tests { let (c, _dir) = new_linux_container(); let ret = c .unwrap() - .run(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap()) + .run(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap()) .await; assert!(ret.is_err(), "Expecting Err, Got {:?}", ret); } diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index 53fe77b1b..0268f8c34 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -26,11 +26,9 @@ use oci::{LinuxDeviceCgroup, LinuxResources, Spec}; use protocols::agent::Device; use tracing::instrument; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "device")) - }; +// Convenience function to obtain the scope logger. 
+fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "device")) } const VM_ROOTFS: &str = "/"; @@ -78,7 +76,7 @@ where { let syspci = Path::new(&syspci); let drv = drv.as_ref(); - info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv); + info!(sl(), "rebind_pci_driver: {} => {:?}", dev, drv); let devpath = syspci.join("devices").join(dev.to_string()); let overridepath = &devpath.join("driver_override"); @@ -606,7 +604,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - let host_minor = specdev.minor; info!( - sl!(), + sl(), "update_spec_devices() updating device"; "container_path" => &specdev.path, "type" => &specdev.r#type, @@ -657,7 +655,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - if let Some(update) = res_updates.get(&(r.r#type.as_str(), host_major, host_minor)) { info!( - sl!(), + sl(), "update_spec_devices() updating resource"; "type" => &r.r#type, "host_major" => host_major, @@ -921,7 +919,7 @@ pub async fn add_devices( #[instrument] async fn add_device(device: &Device, sandbox: &Arc>) -> Result { // log before validation to help with debugging gRPC protocol version differences. - info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}", + info!(sl(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}", device.id, device.type_, device.vm_path, device.container_path, device.options); if device.type_.is_empty() { diff --git a/src/agent/src/metrics.rs b/src/agent/src/metrics.rs index a5522c0eb..d7fc4d12b 100644 --- a/src/agent/src/metrics.rs +++ b/src/agent/src/metrics.rs @@ -15,11 +15,9 @@ use tracing::instrument; const NAMESPACE_KATA_AGENT: &str = "kata_agent"; const NAMESPACE_KATA_GUEST: &str = "kata_guest"; -// Convenience macro to obtain the scope logger -macro_rules! 
sl { - () => { - slog_scope::logger().new(o!("subsystem" => "metrics")) - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "metrics")) } lazy_static! { @@ -139,7 +137,7 @@ fn update_agent_metrics() -> Result<()> { Ok(p) => p, Err(e) => { // FIXME: return Ok for all errors? - warn!(sl!(), "failed to create process instance: {:?}", e); + warn!(sl(), "failed to create process instance: {:?}", e); return Ok(()); } @@ -160,7 +158,7 @@ fn update_agent_metrics() -> Result<()> { // io match me.io() { Err(err) => { - info!(sl!(), "failed to get process io stat: {:?}", err); + info!(sl(), "failed to get process io stat: {:?}", err); } Ok(io) => { set_gauge_vec_proc_io(&AGENT_IO_STAT, &io); @@ -169,7 +167,7 @@ fn update_agent_metrics() -> Result<()> { match me.stat() { Err(err) => { - info!(sl!(), "failed to get process stat: {:?}", err); + info!(sl(), "failed to get process stat: {:?}", err); } Ok(stat) => { set_gauge_vec_proc_stat(&AGENT_PROC_STAT, &stat); @@ -177,7 +175,7 @@ fn update_agent_metrics() -> Result<()> { } match me.status() { - Err(err) => error!(sl!(), "failed to get process status: {:?}", err), + Err(err) => error!(sl(), "failed to get process status: {:?}", err), Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status), } @@ -189,7 +187,7 @@ fn update_guest_metrics() { // try get load and task info match procfs::LoadAverage::new() { Err(err) => { - info!(sl!(), "failed to get guest LoadAverage: {:?}", err); + info!(sl(), "failed to get guest LoadAverage: {:?}", err); } Ok(load) => { GUEST_LOAD @@ -209,7 +207,7 @@ fn update_guest_metrics() { // try to get disk stats match procfs::diskstats() { Err(err) => { - info!(sl!(), "failed to get guest diskstats: {:?}", err); + info!(sl(), "failed to get guest diskstats: {:?}", err); } Ok(diskstats) => { for diskstat in diskstats { @@ -221,7 +219,7 @@ fn update_guest_metrics() { // try to get vm stats match procfs::vmstat() { 
Err(err) => { - info!(sl!(), "failed to get guest vmstat: {:?}", err); + info!(sl(), "failed to get guest vmstat: {:?}", err); } Ok(vmstat) => { for (k, v) in vmstat { @@ -233,7 +231,7 @@ fn update_guest_metrics() { // cpu stat match procfs::KernelStats::new() { Err(err) => { - info!(sl!(), "failed to get guest KernelStats: {:?}", err); + info!(sl(), "failed to get guest KernelStats: {:?}", err); } Ok(kernel_stats) => { set_gauge_vec_cpu_time(&GUEST_CPU_TIME, "total", &kernel_stats.total); @@ -246,7 +244,7 @@ fn update_guest_metrics() { // try to get net device stats match procfs::net::dev_status() { Err(err) => { - info!(sl!(), "failed to get guest net::dev_status: {:?}", err); + info!(sl(), "failed to get guest net::dev_status: {:?}", err); } Ok(devs) => { // netdev: map[string]procfs::net::DeviceStatus @@ -259,7 +257,7 @@ fn update_guest_metrics() { // get statistics about memory from /proc/meminfo match procfs::Meminfo::new() { Err(err) => { - info!(sl!(), "failed to get guest Meminfo: {:?}", err); + info!(sl(), "failed to get guest Meminfo: {:?}", err); } Ok(meminfo) => { set_gauge_vec_meminfo(&GUEST_MEMINFO, &meminfo); diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index ef2c0f9e7..8299ed34b 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -110,11 +110,9 @@ const ERR_NO_SANDBOX_PIDNS: &str = "Sandbox does not have sandbox_pidns"; // not available. const IPTABLES_RESTORE_WAIT_SEC: u64 = 5; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger() - }; +// Convenience function to obtain the scope logger. 
+fn sl() -> slog::Logger { + slog_scope::logger() } // Convenience function to wrap an error and response to ttrpc client @@ -158,14 +156,14 @@ impl AgentService { let mut oci = match oci_spec.as_mut() { Some(spec) => rustjail::grpc_to_oci(spec), None => { - error!(sl!(), "no oci spec in the create container request!"); + error!(sl(), "no oci spec in the create container request!"); return Err(anyhow!(nix::Error::EINVAL)); } }; - info!(sl!(), "receive createcontainer, spec: {:?}", &oci); + info!(sl(), "receive createcontainer, spec: {:?}", &oci); info!( - sl!(), + sl(), "receive createcontainer, storages: {:?}", &req.storages ); @@ -184,7 +182,7 @@ impl AgentService { // here, the agent will rely on rustjail (using the oci.Mounts // list) to bind mount all of them inside the container. let m = add_storages( - sl!(), + sl(), req.storages.to_vec(), self.sandbox.clone(), Some(req.container_id.clone()), @@ -232,33 +230,33 @@ impl AgentService { }; let mut ctr: LinuxContainer = - LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl!())?; + LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl())?; let pipe_size = AGENT_CONFIG.container_pipe_size; let p = if let Some(p) = oci.process { - Process::new(&sl!(), &p, cid.as_str(), true, pipe_size)? + Process::new(&sl(), &p, cid.as_str(), true, pipe_size)? } else { - info!(sl!(), "no process configurations!"); + info!(sl(), "no process configurations!"); return Err(anyhow!(nix::Error::EINVAL)); }; // if starting container failed, we will do some rollback work // to ensure no resources are leaked. 
if let Err(err) = ctr.start(p).await { - error!(sl!(), "failed to start container: {:?}", err); + error!(sl(), "failed to start container: {:?}", err); if let Err(e) = ctr.destroy().await { - error!(sl!(), "failed to destroy container: {:?}", e); + error!(sl(), "failed to destroy container: {:?}", e); } if let Err(e) = remove_container_resources(&mut s, &cid) { - error!(sl!(), "failed to remove container resources: {:?}", e); + error!(sl(), "failed to remove container resources: {:?}", e); } return Err(err); } s.update_shared_pidns(&ctr)?; s.add_container(ctr); - info!(sl!(), "created container!"); + info!(sl(), "created container!"); Ok(()) } @@ -355,7 +353,7 @@ impl AgentService { let cid = req.container_id.clone(); let exec_id = req.exec_id.clone(); - info!(sl!(), "do_exec_process cid: {} eid: {}", cid, exec_id); + info!(sl(), "do_exec_process cid: {} eid: {}", cid, exec_id); let s = self.sandbox.clone(); let mut sandbox = s.lock().await; @@ -370,7 +368,7 @@ impl AgentService { let pipe_size = AGENT_CONFIG.container_pipe_size; let ocip = rustjail::process_grpc_to_oci(&process); - let p = Process::new(&sl!(), &ocip, exec_id.as_str(), false, pipe_size)?; + let p = Process::new(&sl(), &ocip, exec_id.as_str(), false, pipe_size)?; let ctr = sandbox .get_container(&cid) @@ -388,7 +386,7 @@ impl AgentService { let s = self.sandbox.clone(); info!( - sl!(), + sl(), "signal process"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -410,7 +408,7 @@ impl AgentService { match p.signal(sig) { Err(Errno::ESRCH) => { info!( - sl!(), + sl(), "signal encounter ESRCH, continue"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -426,7 +424,7 @@ impl AgentService { if eid.is_empty() { // eid is empty, signal all the remaining processes in the container cgroup info!( - sl!(), + sl(), "signal all the remaining processes"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -434,7 +432,7 @@ impl AgentService { if let Err(err) = self.freeze_cgroup(&cid, 
FreezerState::Frozen).await { warn!( - sl!(), + sl(), "freeze cgroup failed"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -447,7 +445,7 @@ impl AgentService { let res = unsafe { libc::kill(*pid, sig) }; if let Err(err) = Errno::result(res).map(drop) { warn!( - sl!(), + sl(), "signal failed"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -458,7 +456,7 @@ impl AgentService { } if let Err(err) = self.freeze_cgroup(&cid, FreezerState::Thawed).await { warn!( - sl!(), + sl(), "unfreeze cgroup failed"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -503,7 +501,7 @@ impl AgentService { let (exit_send, mut exit_recv) = tokio::sync::mpsc::channel(100); info!( - sl!(), + sl(), "wait process"; "container-id" => cid.clone(), "exec-id" => eid.clone() @@ -520,9 +518,9 @@ impl AgentService { }; if let Some(mut exit_rx) = exit_rx { - info!(sl!(), "cid {} eid {} waiting for exit signal", &cid, &eid); + info!(sl(), "cid {} eid {} waiting for exit signal", &cid, &eid); while exit_rx.changed().await.is_ok() {} - info!(sl!(), "cid {} eid {} received exit signal", &cid, &eid); + info!(sl(), "cid {} eid {} received exit signal", &cid, &eid); } let mut sandbox = s.lock().await; @@ -840,8 +838,8 @@ impl agent_ttrpc::AgentService for AgentService { // stat the mount point, virtiofs daemon will remove the stale cache and release the fds if the mount point doesn't exist any more. 
// More details in https://github.com/kata-containers/kata-containers/issues/6455#issuecomment-1477137277 match stat::stat(Path::new(&m.mount_point)) { - Ok(_) => info!(sl!(), "stat {} success", m.mount_point), - Err(e) => info!(sl!(), "stat {} failed: {}", m.mount_point, e), + Ok(_) => info!(sl(), "stat {} success", m.mount_point), + Err(e) => info!(sl(), "stat {} failed: {}", m.mount_point, e), } } } @@ -1024,7 +1022,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "update_mounts", req); is_allowed(&req)?; - match update_ephemeral_mounts(sl!(), req.storages.to_vec(), self.sandbox.clone()).await { + match update_ephemeral_mounts(sl(), req.storages.to_vec(), self.sandbox.clone()).await { Ok(_) => Ok(Empty::new()), Err(e) => Err(ttrpc_error( ttrpc::Code::INTERNAL, @@ -1041,7 +1039,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "get_iptables", req); is_allowed(&req)?; - info!(sl!(), "get_ip_tables: request received"); + info!(sl(), "get_ip_tables: request received"); // the binary could exists in either /usr/sbin or /sbin // here check both of the places and return the one exists @@ -1066,7 +1064,7 @@ impl agent_ttrpc::AgentService for AgentService { ..Default::default() }), Err(e) => { - warn!(sl!(), "failed to run {}: {:?}", cmd, e.kind()); + warn!(sl(), "failed to run {}: {:?}", cmd, e.kind()); return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } } @@ -1080,7 +1078,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "set_iptables", req); is_allowed(&req)?; - info!(sl!(), "set_ip_tables request received"); + info!(sl(), "set_ip_tables request received"); // the binary could exists in both /usr/sbin and /sbin // here check both of the places and return the one exists @@ -1109,7 +1107,7 @@ impl agent_ttrpc::AgentService for AgentService { { Ok(child) => child, Err(e) => { - warn!(sl!(), "failure to spawn {}: {:?}", cmd, e.kind()); + warn!(sl(), "failure to spawn {}: {:?}", cmd, e.kind()); 
return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; @@ -1130,12 +1128,12 @@ impl agent_ttrpc::AgentService for AgentService { let _ = match stdin.write_all(&req.data) { Ok(o) => o, Err(e) => { - warn!(sl!(), "error writing stdin: {:?}", e.kind()); + warn!(sl(), "error writing stdin: {:?}", e.kind()); return; } }; if tx.send(1).is_err() { - warn!(sl!(), "stdin writer thread receiver dropped"); + warn!(sl(), "stdin writer thread receiver dropped"); }; }); @@ -1160,7 +1158,7 @@ impl agent_ttrpc::AgentService for AgentService { Ok(o) => o, Err(e) => { warn!( - sl!(), + sl(), "failure waiting for spawned {} to complete: {:?}", cmd, e.kind() @@ -1170,7 +1168,7 @@ impl agent_ttrpc::AgentService for AgentService { }; if !output.status.success() { - warn!(sl!(), "{} failed: {:?}", cmd, output.stderr); + warn!(sl(), "{} failed: {:?}", cmd, output.stderr); return Err(ttrpc_error( ttrpc::Code::INTERNAL, format!( @@ -1259,7 +1257,7 @@ impl agent_ttrpc::AgentService for AgentService { if !req.guest_hook_path.is_empty() { let _ = s.add_hooks(&req.guest_hook_path).map_err(|e| { error!( - sl!(), + sl(), "add guest hook {} failed: {:?}", req.guest_hook_path, e ); }); @@ -1278,7 +1276,7 @@ impl agent_ttrpc::AgentService for AgentService { .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; } - match add_storages(sl!(), req.storages.to_vec(), self.sandbox.clone(), None).await { + match add_storages(sl(), req.storages.to_vec(), self.sandbox.clone(), None).await { Ok(m) => { let sandbox = self.sandbox.clone(); let mut s = sandbox.lock().await; @@ -1287,7 +1285,7 @@ impl agent_ttrpc::AgentService for AgentService { Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), }; - match setup_guest_dns(sl!(), req.dns.to_vec()) { + match setup_guest_dns(sl(), req.dns.to_vec()) { Ok(_) => { let sandbox = self.sandbox.clone(); let mut s = sandbox.lock().await; @@ -1412,7 +1410,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "get_guest_details", req); 
is_allowed(&req)?; - info!(sl!(), "get guest details!"); + info!(sl(), "get guest details!"); let mut resp = GuestDetailsResponse::new(); // to get memory block size match get_memory_info( @@ -1426,7 +1424,7 @@ impl agent_ttrpc::AgentService for AgentService { resp.support_mem_hotplug_probe = v; } Err(e) => { - info!(sl!(), "fail to get memory info!"); + info!(sl(), "fail to get memory info!"); return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } } @@ -1511,7 +1509,7 @@ impl agent_ttrpc::AgentService for AgentService { drop(sandbox); if let Some(container_id) = event_rx.recv().await { - info!(sl!(), "get_oom_event return {}", &container_id); + info!(sl(), "get_oom_event return {}", &container_id); let mut resp = OOMEvent::new(); resp.container_id = container_id; @@ -1530,7 +1528,7 @@ impl agent_ttrpc::AgentService for AgentService { trace_rpc_call!(ctx, "get_volume_stats", req); is_allowed(&req)?; - info!(sl!(), "get volume stats!"); + info!(sl(), "get volume stats!"); let mut resp = VolumeStatsResponse::new(); let mut condition = VolumeCondition::new(); @@ -1541,7 +1539,7 @@ impl agent_ttrpc::AgentService for AgentService { condition.message = String::from("OK"); } Err(e) => { - info!(sl!(), "failed to open the volume"); + info!(sl(), "failed to open the volume"); return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; @@ -1600,7 +1598,7 @@ impl health_ttrpc::Health for HealthService { _ctx: &TtrpcContext, req: protocols::health::CheckRequest, ) -> ttrpc::Result { - info!(sl!(), "version {:?}", req); + info!(sl(), "version {:?}", req); let mut rep = protocols::health::VersionCheckResponse::new(); rep.agent_version = AGENT_VERSION.to_string(); rep.grpc_version = API_VERSION.to_string(); @@ -1621,17 +1619,17 @@ fn get_memory_info( match fs::read_to_string(block_size_path) { Ok(v) => { if v.is_empty() { - warn!(sl!(), "file {} is empty", block_size_path); + warn!(sl(), "file {} is empty", block_size_path); return Err(anyhow!(ERR_INVALID_BLOCK_SIZE)); } size = 
u64::from_str_radix(v.trim(), 16).map_err(|_| { - warn!(sl!(), "failed to parse the str {} to hex", size); + warn!(sl(), "failed to parse the str {} to hex", size); anyhow!(ERR_INVALID_BLOCK_SIZE) })?; } Err(e) => { - warn!(sl!(), "memory block size error: {:?}", e.kind()); + warn!(sl(), "memory block size error: {:?}", e.kind()); if e.kind() != std::io::ErrorKind::NotFound { return Err(anyhow!(e)); } @@ -1643,7 +1641,7 @@ fn get_memory_info( match stat::stat(hotplug_probe_path) { Ok(_) => plug = true, Err(e) => { - warn!(sl!(), "hotplug memory error: {:?}", e); + warn!(sl(), "hotplug memory error: {:?}", e); match e { nix::Error::ENOENT => plug = false, _ => return Err(anyhow!(e)), @@ -1739,7 +1737,7 @@ pub fn start(s: Arc>, server_address: &str, init_mode: bool) -> R .register_service(aservice) .register_service(hservice); - info!(sl!(), "ttRPC server started"; "address" => server_address); + info!(sl(), "ttRPC server started"; "address" => server_address); Ok(server) } @@ -1814,7 +1812,7 @@ fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result<()> { for m in cmounts.iter() { if let Err(err) = sandbox.unset_and_remove_sandbox_storage(m) { error!( - sl!(), + sl(), "failed to unset_and_remove_sandbox_storage for container {}, error: {:?}", cid, err @@ -1850,7 +1848,7 @@ fn is_signal_handled(proc_status_file: &str, signum: u32) -> bool { return fs::metadata(proc_status_file).is_ok(); } else if signum > 64 { // Ensure invalid signum won't break bit shift logic - warn!(sl!(), "received invalid signum {}", signum); + warn!(sl(), "received invalid signum {}", signum); return false; } else { (signum - 1).into() @@ -1860,7 +1858,7 @@ fn is_signal_handled(proc_status_file: &str, signum: u32) -> bool { let file = match File::open(proc_status_file) { Ok(f) => f, Err(_) => { - warn!(sl!(), "failed to open file {}", proc_status_file); + warn!(sl(), "failed to open file {}", proc_status_file); return false; } }; @@ -2013,7 +2011,7 @@ pub fn setup_bundle(cid: 
&str, spec: &mut Spec) -> Result { "bind", MsFlags::MS_BIND, "", - &sl!(), + &sl(), )?; let rootfs_path_name = rootfs_path @@ -2048,7 +2046,7 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> { } info!( - sl!(), + sl(), "load_kernel_module {}: {:?}", module.name, module.parameters ); @@ -2834,7 +2832,7 @@ OtherField:other for cmd in iptables_cmd_list { if !check_command(cmd) { warn!( - sl!(), + sl(), "one or more commands for ip tables test are missing, skip it" ); return; diff --git a/src/agent/src/tracer.rs b/src/agent/src/tracer.rs index bad4a6f50..1199b601c 100644 --- a/src/agent/src/tracer.rs +++ b/src/agent/src/tracer.rs @@ -69,7 +69,7 @@ macro_rules! trace_rpc_call { propagator.extract(&extract_carrier_from_ttrpc($ctx)) }); - info!(sl!(), "rpc call from shim to agent: {:?}", $name); + info!(sl(), "rpc call from shim to agent: {:?}", $name); // generate tracing span let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req); diff --git a/src/agent/src/uevent.rs b/src/agent/src/uevent.rs index cabdd6235..53b7c103d 100644 --- a/src/agent/src/uevent.rs +++ b/src/agent/src/uevent.rs @@ -19,11 +19,9 @@ use tokio::sync::watch::Receiver; use tokio::sync::Mutex; use tracing::instrument; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "uevent")) - }; +// Convenience function to obtain the scope logger. 
+fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "uevent")) } #[derive(Debug, Default, Clone, PartialEq, Eq)] @@ -120,11 +118,11 @@ pub async fn wait_for_uevent( ) -> Result { let logprefix = format!("Waiting for {:?}", &matcher); - info!(sl!(), "{}", logprefix); + info!(sl(), "{}", logprefix); let mut sb = sandbox.lock().await; for uev in sb.uevent_map.values() { if matcher.is_match(uev) { - info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev); + info!(sl(), "{}: found {:?} in uevent map", logprefix, &uev); return Ok(uev.clone()); } } @@ -139,7 +137,7 @@ pub async fn wait_for_uevent( sb.uevent_watchers.push(Some((Box::new(matcher), tx))); drop(sb); // unlock - info!(sl!(), "{}: waiting on channel", logprefix); + info!(sl(), "{}: waiting on channel", logprefix); let hotplug_timeout = AGENT_CONFIG.hotplug_timeout; @@ -157,7 +155,7 @@ pub async fn wait_for_uevent( } }; - info!(sl!(), "{}: found {:?} on channel", logprefix, &uev); + info!(sl(), "{}: found {:?} on channel", logprefix, &uev); Ok(uev) } From 6bb2ea81952ec66e65da3557572814d50ba5b323 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Thu, 29 Jun 2023 15:46:54 +0000 Subject: [PATCH 22/87] packaging: Fix indentation of build.sh script at ovmf This PR fixes the indentation of build.sh script at ovmf. 
Fixes #7208 Signed-off-by: Gabriela Cervantes --- tools/packaging/static-build/ovmf/build.sh | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/packaging/static-build/ovmf/build.sh b/tools/packaging/static-build/ovmf/build.sh index 8a2f20698..9066196d2 100755 --- a/tools/packaging/static-build/ovmf/build.sh +++ b/tools/packaging/static-build/ovmf/build.sh @@ -30,17 +30,17 @@ fi [ -n "$ovmf_repo" ] || die "failed to get ovmf repo" if [ "${ovmf_build}" == "x86_64" ]; then - [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.x86_64.version") - [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.x86_64.package") - [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.x86_64.package_output_dir") + [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.x86_64.version") + [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.x86_64.package") + [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.x86_64.package_output_dir") elif [ "${ovmf_build}" == "sev" ]; then - [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.sev.version") - [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.sev.package") - [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.sev.package_output_dir") + [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.sev.version") + [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.sev.package") + [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.sev.package_output_dir") elif [ "${ovmf_build}" == "tdx" ]; then - [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.tdx.version") - [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.tdx.package") 
- [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.tdx.package_output_dir") + [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.tdx.version") + [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.tdx.package") + [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.tdx.package_output_dir") fi [ -n "$ovmf_version" ] || die "failed to get ovmf version or commit" @@ -49,8 +49,8 @@ fi sudo docker pull ${container_image} || \ (sudo docker build -t "${container_image}" "${script_dir}" && \ - # No-op unless PUSH_TO_REGISTRY is exported as "yes" - push_to_registry "${container_image}") + # No-op unless PUSH_TO_REGISTRY is exported as "yes" + push_to_registry "${container_image}") sudo docker run --rm -i -v "${repo_root_dir}:${repo_root_dir}" \ -w "${PWD}" \ From 0f454d0c04a9bcc84134f18f3bee48a37fc913da Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Fri, 30 Jun 2023 08:42:55 +0000 Subject: [PATCH 23/87] gpu: Fixing typos for PCIe topology changes Some comments and functions had typos and wrong capitalization. 
Signed-off-by: Zvonko Kaiser --- src/runtime/pkg/device/config/config.go | 4 ++-- src/runtime/pkg/device/drivers/utils.go | 2 +- src/runtime/pkg/oci/utils.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index c65e86123..32f2251ff 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -188,7 +188,7 @@ var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{ BridgePort: PCIBridgePortPrefix, } -func (p PCIePort) InValid() bool { +func (p PCIePort) Invalid() bool { switch p { case RootPort: fallthrough @@ -374,7 +374,7 @@ const ( VFIOAPDeviceMediatedType ) -// VFIOPCIDev represents a VFIO PCI device used for hotplugging +// VFIODev represents a VFIO PCI device used for hotplugging type VFIODev struct { // ID is used to identify this drive in the hypervisor options. ID string diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index ddb1a0ded..8c9055ae2 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -47,7 +47,7 @@ func deviceLogger() *logrus.Entry { return api.DeviceLogger() } -// IsPCIeDevice Identifies PCIe device by reading the size of the PCI config space +// IsPCIeDevice identifies PCIe device by reading the size of the PCI config space // Plain PCI device have 256 bytes of config space where PCIe devices have 4K func IsPCIeDevice(bdf string) bool { if len(strings.Split(bdf, ":")) == 2 { diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 21e027160..d782211dd 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -579,7 +579,7 @@ func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) { return config.NoPort, nil } port := config.PCIePort(value) - if port.InValid() { + if port.Invalid() { return config.InvalidPort, fmt.Errorf("Invalid PCIe port 
\"%v\" specified in annotation", value) } return port, nil From d035955ef516c8d84aa454b6688f47e725fd5004 Mon Sep 17 00:00:00 2001 From: Zvonko Kaiser Date: Mon, 17 Oct 2022 07:59:38 -0700 Subject: [PATCH 24/87] doc: Add documentation for the virtualization reference architecture Fixes: #4041 Signed-off-by: Zvonko Kaiser --- docs/design/README.md | 1 + docs/design/kata-vra.md | 434 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 435 insertions(+) create mode 100644 docs/design/kata-vra.md diff --git a/docs/design/README.md b/docs/design/README.md index adcffd019..a5d9dc712 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -13,6 +13,7 @@ Kata Containers design documents: - [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md) - [Design for direct-assigned volume](direct-blk-device-assignment.md) - [Design for core-scheduling](core-scheduling.md) +- [Virtualization Reference Architecture](kata-vra.md) --- - [Design proposals](proposals) diff --git a/docs/design/kata-vra.md b/docs/design/kata-vra.md new file mode 100644 index 000000000..ba53c3371 --- /dev/null +++ b/docs/design/kata-vra.md @@ -0,0 +1,434 @@ +# Virtualization Reference Architecture + +## Subject to Change | © 2022 by NVIDIA Corporation. All rights reserved. | For test and development only_ + +Before digging deeper into the virtualization reference architecture, let's +first look at the various GPUDirect use cases in the following table. We’re +distinguishing between two top-tier use cases where the devices are (1) +passthrough and (2) virtualized, where a VM gets assigned a virtual function +(VF) and not the physical function (PF). A combination of PF and VF would also +be possible. 
+
+| Device #1  (passthrough) | Device #2 (passthrough) | P2P Compatibility and Mode |
+| ------------------------- | ----------------------- | -------------------------------------------- |
+| GPU PF | GPU PF | GPUDirect P2P  |
+| GPU PF | NIC PF | GPUDirect RDMA |
+| MIG-slice | MIG-slice | _No GPUDirect P2P_ |
+| MIG-slice | NIC PF | GPUDirect RDMA |
+| **Device #1  (virtualized)** | **Device #2 (virtualized)** | **P2P Compatibility and Mode** |
+| Time-slice vGPU VF | Time-slice vGPU VF | _No GPUDirect P2P  but NVLINK P2P available_ |
+| Time-slice vGPU VF | NIC VF | GPUDirect RDMA |
+| MIG-slice vGPU | MIG-slice vGPU | _No GPUDirect P2P_ |
+| MIG-slice vGPU | NIC VF | GPUDirect RDMA |
+
+In a virtualized environment we have several distinct features that may prevent
+Peer-to-peer (P2P) communication of two endpoints in a PCI Express topology. The
+IOMMU translates IO virtual addresses (IOVA) to physical addresses (PA). Each
+device behind an IOMMU has its own IOVA memory space; usually, no two devices
+share the same IOVA memory space but it’s up to the hypervisor or OS how it
+chooses to map devices to IOVA spaces.  Any PCI Express DMA transactions will
+use IOVAs, which the IOMMU must translate. By default, all the traffic is routed
+to the root complex and not issued directly to the peer device.
+
+An IOMMU can be used to isolate and protect devices even if virtualization is
+not used; since devices can only access memory regions that are mapped for it, a
+DMA from one device to another is not possible. DPDK uses the IOMMU to have
+better isolation between devices; another benefit is that IOVA space can be
+represented as a contiguous memory even if the PA space is heavily scattered.
+
+In the case of virtualization, the IOMMU is responsible for isolating the device
+and memory between VMs for safe device assignment without compromising the host
+and other guest OSes. 
Without an IOMMU, any device can access the entire system
+and perform DMA transactions _anywhere_.
+
+The second feature is ACS (Access Control Services), which controls which
+devices are allowed to communicate with one another and thus avoids improper
+routing of packets irrespective of whether IOMMU is enabled or not.
+
+When IOMMU is enabled, ACS is normally configured to force all PCI Express DMA
+to go through the root complex so IOMMU can translate it, impacting performance
+between peers with higher latency and reduced bandwidth.
+
+A way to avoid the performance hit is to enable Address Translation Services
+(ATS). ATS-capable endpoints can prefetch IOVA -> PA translations from the IOMMU
+and then perform DMA transactions directly to another endpoint. Hypervisors
+enable this by enabling ATS in such endpoints, configuring ACS to enable Direct
+Translated P2P, and configuring the IOMMU to allow Address Translation requests.
+
+Another important factor is that the NVIDIA driver stack will use the PCI
+Express topology of the system it is running on to determine whether the
+hardware is capable of supporting P2P. The driver stack qualifies specific
+chipsets and PCI Express switches for use with GPUDirect P2P. In virtual
+environments, the PCI Express topology is flattened and obfuscated to present a
+uniform environment to the software inside the VM, which breaks the GPUDirect
+P2P use case.
+
+On a bare metal machine, the driver stack groups GPUs into cliques that can
+perform GPUDirect P2P communication, excluding peer mappings where P2P
+communication is not possible, prominently if GPUs are attached to multiple CPU
+sockets.  
+
+CPUs and local memory banks are referred to as NUMA nodes. In a two-socket
+server, each of the CPUs has a local memory bank for a total of two NUMA nodes.
+Some servers provide the ability to configure additional NUMA nodes per CPU, +which means a CPU socket can have two NUMA nodes  (some servers support four +NUMA nodes per socket) with local memory banks and L3 NUMA domains for improved +performance. + +One of the current solutions is that the hypervisor provides additional topology +information that the driver stack can pick up and enable GPUDirect P2P between +GPUs, even if the virtualized environment does not directly expose it. The PCI +Express virtual P2P approval capability structure in the PCI configuration space +is entirely emulated by the hypervisor of passthrough GPU devices. + +A clique ID is provided where GPUs with the same clique ID belong to a group of +GPUs capable of P2P communication + +On vSphere, Azure, and other CPSs,  the hypervisor lays down a `topologies.xml` +which NCCL can pick up and deduce the right P2P level[^1]. NCCL is leveraging +Infiniband (IB) and/or Unified Communication X (UCX) for communication, and +GPUDirect P2P and GPUDirect RDMA should just work in this case. The only culprit +is that software or applications that do not use the XML file to deduce the +topology will fail and not enable GPUDirect ( [`nccl-p2p-level`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-p2p-level) ) + +## Hypervisor PCI Express Topology + +To enable every part of the accelerator stack, we propose a virtualized +reference architecture to enable GPUDirect P2P and GPUDirect RDMA for any +hypervisor. The idea is split into two parts to enable the right PCI Express +topology. The first part builds upon extending the PCI Express virtual P2P +approval capability structure to every device that wants to do P2P in some way +and groups devices by clique ID. The other part involves replicating a subset of +the host topology so that applications running in the VM do not need to read +additional information and enable the P2P capability like in the bare-metal use +case described above. 
The driver stack can then deduce automatically if the +topology presented in the VM is capable of P2P communication. + +We will work with the following host topology for the following sections. It is +a system with two converged DPUs, each having an `A100X` GPU and two `ConnectX-6` +network ports connected to the downstream ports of a PCI Express switch. + +```sh ++-00.0-[d8-df]----00.0-[d9-df]--+-00.0-[da-db]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network + | +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network + | \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface + \-01.0-[dc-df]----00.0-[dd-df]----08.0-[de-df]----00.0 NVIDIA Corporation GA100 [A100X] + ++-00.0-[3b-42]----00.0-[3c-42]--+-00.0-[3d-3e]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network + | +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network + | \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface + \-01.0-[3f-42]----00.0-[40-42]----08.0-[41-42]----00.0 NVIDIA Corporation GA100 [A100X] +``` + +The green path highlighted above is the optimal and preferred path for +efficient P2P communication. + +## PCI Express Virtual P2P Approval Capability + +Most of the time, the PCI Express topology is flattened and obfuscated to ensure +easy migration of the VM image between different physical hardware topologies. +In Kata, we can configure the hypervisor to use PCI Express root ports to +hotplug the VFIO  devices one is passing through. A user can select how many PCI +Express root ports to allocate depending on how many devices are passed through. +A recent addition to Kata will detect the right amount of PCI Express devices +that need hotplugging and bail out if the number of root ports is insufficient. +In Kata, we do not automatically increase the number of root ports, we want the +user to be in full control of the topology. 
+ +```toml +# /etc/kata-containers/configuration.toml + +# VFIO devices are hotplugged on a bridge by default. +# Enable hot-plugging on the root bus. This may be required for devices with +# a large PCI bar, as this is a current limitation with hot-plugging on +# a bridge. +# Default “bridge-port” +hotplug_vfio = "root-port" + +# Before hot plugging a PCIe device, you need to add a pcie_root_port device. +# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU +# The value means the number of pcie_root_port +# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35" +# Default 0 +pcie_root_port = 8 +``` + +VFIO devices are hotplugged on a PCIe-PCI bridge by default. Hotplug of PCI +Express devices is only supported on PCI Express root or downstream ports. With +this configuration set, if we start up a Kata container, we can inspect our +topology and see the allocated PCI Express root ports and the hotplugged +devices. + +```sh +$ lspci -tv + -[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller + +-01.0 Red Hat, Inc. Virtio console + +-02.0 Red Hat, Inc. Virtio SCSI + +-03.0 Red Hat, Inc. Virtio RNG + +-04.0-[01]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 + +-05.0-[02]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6 + +-06.0-[03]----00.0 NVIDIA Corporation Device 20b8 + +-07.0-[04]----00.0 NVIDIA Corporation Device 20b8 + +-08.0-[05]-- + +-09.0-[06]-- + +-0a.0-[07]-- + +-0b.0-[08]-- + +-0c.0 Red Hat, Inc. Virtio socket + +-0d.0 Red Hat, Inc. 
Virtio file system + +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller + +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller + \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller +``` + +For devices with huge BARs (Base Address Registers) like the GPU (we need to +configure the PCI Express root port properly and allocate enough memory for +mapping), we have added a heuristic to Kata to deduce the right settings. Hence, +the BARs can be mapped correctly. This functionality is added to +[`nvidia/go-nvlib1](https://gitlab.com/nvidia/cloud-native/go-nvlib) which is part +of Kata now. + +```sh +$ sudo dmesg | grep BAR +[ 0.179960] pci 0000:00:04.0: BAR 7: assigned [io 0x1000-0x1fff] +[ 0.179962] pci 0000:00:05.0: BAR 7: assigned [io 0x2000-0x2fff] +[ 0.179963] pci 0000:00:06.0: BAR 7: assigned [io 0x3000-0x3fff] +[ 0.179964] pci 0000:00:07.0: BAR 7: assigned [io 0x4000-0x4fff] +[ 0.179966] pci 0000:00:08.0: BAR 7: assigned [io 0x5000-0x5fff] +[ 0.179967] pci 0000:00:09.0: BAR 7: assigned [io 0x6000-0x6fff] +[ 0.179968] pci 0000:00:0a.0: BAR 7: assigned [io 0x7000-0x7fff] +[ 0.179969] pci 0000:00:0b.0: BAR 7: assigned [io 0x8000-0x8fff] +[ 2.115912] pci 0000:01:00.0: BAR 0: assigned [mem 0x13000000000-0x13001ffffff 64bit pref] +[ 2.116203] pci 0000:01:00.0: BAR 2: assigned [mem 0x13002000000-0x130027fffff 64bit pref] +[ 2.683132] pci 0000:02:00.0: BAR 0: assigned [mem 0x12000000000-0x12001ffffff 64bit pref] +[ 2.683419] pci 0000:02:00.0: BAR 2: assigned [mem 0x12002000000-0x120027fffff 64bit pref] +[ 2.959155] pci 0000:03:00.0: BAR 1: assigned [mem 0x11000000000-0x117ffffffff 64bit pref] +[ 2.959345] pci 0000:03:00.0: BAR 3: assigned [mem 0x11800000000-0x11801ffffff 64bit pref] +[ 2.959523] pci 0000:03:00.0: BAR 0: assigned [mem 0xf9000000-0xf9ffffff] +[ 2.966119] pci 0000:04:00.0: BAR 1: assigned [mem 0x10000000000-0x107ffffffff 64bit pref] +[ 2.966295] pci 0000:04:00.0: BAR 3: assigned [mem 0x10800000000-0x10801ffffff 
64bit pref] +[ 2.966472] pci 0000:04:00.0: BAR 0: assigned [mem 0xf7000000-0xf7ffffff] +``` + +The NVIDIA driver stack in this case would refuse to do P2P communication since +(1) the topology is not what it expects, (2)  we do not have a qualified +chipset. Since our P2P devices are not connected to a PCI Express switch port, +we need to provide additional information to support the P2P functionality. One +way of providing such meta information would be to annotate the container; most +of the settings in Kata's configuration file can be overridden via annotations, +but this limits the flexibility, and a user would need to update all the +containers that he wants to run with Kata. The goal is to make such things as +transparent as possible, so we also introduced +[CDI](https://github.com/container-orchestrated-devices/container-device-interface) +(Container Device Interface) to Kata. CDI is a[ +specification](https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md) +for container runtimes to support third-party devices. + +As written before, we can provide a clique ID for the devices that belong +together and are capable of doing P2P. This information is provided to the +hypervisor, which will set up things in the VM accordingly. Let's suppose the +user wanted to do GPUDirect RDMA with the first GPU and the NIC that reside on +the same DPU, one could provide the specification telling the hypervisor that +they belong to the same clique. 
+ +```yaml +# /etc/cdi/nvidia.yaml +cdiVersion: 0.4.0 +kind: nvidia.com/gpu +devices: +- name: gpu0 + annotations: + bdf: “41:00.0” + clique-id: “0” + containerEdits: + deviceNodes: + - path: “/dev/vfio/71" + +# /etc/cdi/mellanox.yaml +cdiVersion: 0.4.0 +kind: mellanox.com/nic +devices: +- name: nic0 + annotations: + bdf: “3d:00.0” + clique-id: “0” + attach-pci: “true” + containerEdits: + deviceNodes: + - path: "/dev/vfio/66" +``` + +Since this setting is bound to the device and not the container we do not need +to alter the container just allocate the right resource and GPUDirect RDMA would +be set up correctly. Rather than exposing them separately, an idea would be to +expose a GPUDirect RDMA device via NFD (Node Feature Discovery) that combines +both of them; this way, we could make sure that the right pair is allocated and +used more on  Kubernetes deployment in the next section. + +The GPU driver stack is leveraging the PCI Express virtual P2P approval +capability, but the NIC stack does not use this now. One of the action items is +to enable MOFED to read the P2P approval capability and enable ATS and ACS +settings as described above. + +This way, we could enable GPUDirect P2P and GPUDirect RDMA on any topology +presented to the VM application. It is the responsibility of the administrator +or infrastructure engineer to provide the right information either via +annotations or a CDI specification. + +## Host Topology Replication + +The other way to represent the PCI Express topology in the VM is to replicate a +subset of the topology needed to support the P2P use case inside the VM. Similar +to the configuration for the root ports, we can easily configure the usage of +PCI Express switch ports to hotplug the devices. + +```toml +# /etc/kata-containers/configuration.toml + +# VFIO devices are hotplugged on a bridge by default. +# Enable hot plugging on the root bus. 
This may be required for devices with +# a large PCI bar, as this is a current limitation with hot plugging on +# a bridge. +# Default “bridge-port” +hotplug_vfio = "switch-port" + +# Before hot plugging a PCIe device, you need to add a pcie_root_port device. +# Use this parameter when using some large PCI bar devices, such as Nvidia GPU +# The value means the number of pcie_root_port +# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35" +# Default 0 +pcie_switch_port = 8 +``` + +Each device that is passed through is attached to a PCI Express downstream port +as illustrated below. We can even replicate the host’s two DPUs topologies with +added metadata through the CDI. Most of the time, a container only needs one +pair of GPU and NIC for GPUDirect RDMA. This is more of a showcase of what we +can do with the power of Kata and CDI. One could even think of adding groups of +devices that support P2P, even from different CPU sockets or NUMA nodes, into +one container; indeed, the first group is NUMA node 0 (red), and the second +group is NUMA node 1 (green). Since they are grouped correctly, P2P would be +enabled naturally inside a group, aka clique ID. + +```sh +$ lspci -tv + -[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller + +-01.0 Red Hat, Inc. Virtio console + +-02.0 Red Hat, Inc. Virtio SCSI + +-03.0 Red Hat, Inc. Virtio RNG + +-04.0-[01-04]----00.0-[02-04]--+-00.0-[03]----00.0 NVIDIA Corporation Device 20b8 + | \-01.0-[04]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx + +-05.0-[05-08]----00.0-[06-08]--+-00.0-[07]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx + | \-01.0-[08]----00.0 NVIDIA Corporation Device 20b8 + +-06.0 Red Hat, Inc. Virtio socket + +-07.0 Red Hat, Inc. 
Virtio file system + +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller + +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode] + \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller + \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller +``` + +The configuration of using either the root port or switch port can be applied on +a per Container or Pod basis, meaning we can switch PCI Express topologies on +each run of an application. + +## Hypervisor Resource Limits + +Every hypervisor will have resource limits in terms of how many PCI Express root +ports, switch ports, or bridge ports can be created, especially with devices +that need to reserve a 4K IO range per PCI specification. Each instance of root +or switch port will consume 4K IO of very limited capacity, 64k is the maximum. + +Simple math brings us to the conclusion that we can have a maximum of 16 PCI +Express root ports or 16 PCI Express switch ports in QEMU if devices with IO +BARs are used in the PCI Express hierarchy. + +Additionally, one can have 32 slots on the PCI root bus and a maximum of 256 +slots for the complete PCI(e) topology. + +Per default, QEMU will attach a multi-function device in the last slot on the +PCI root bus, + +```sh + +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller + +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode] + \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller +``` + +Kata will additionally add `virtio-xxx-pci` devices consuming (5 slots) plus a +PCIe-PCI-bridge (1 slot) and a DRAM controller (1 slot), meaning per default, we +have already eight slots used. This leaves us 24 slots for adding other devices +to the root bus. + +The problem that arises here is one use-case from a customer that uses recent +RTX GPUs with Kata. The user wanted to pass through eight of these GPUs into one +container and ran into issues. 
The problem is that those cards often consist of
+four individual device nodes: GPU, Audio, and two USB controller devices (some
+cards have a USB-C output).
+
+These devices are grouped into one IOMMU group. Since one needs to pass through
+the complete IOMMU group into the VM, we need to allocate 32 PCI Express root
+ports or 32 PCI Express switch ports, which is technically impossible due to the
+resource limits outlined above. Since all the devices appear as PCI Express
+devices, we need to hotplug those into a root or switch port.
+
+The solution to this problem is leveraging CDI. For each device, add the
+information if it is going to be hotplugged as a PCI Express or PCI device,
+which results in either using a PCI Express root/switch port or an ordinary PCI
+bridge. PCI bridges are not affected by the limited IO range. This way, the GPU
+is attached as a PCI Express device to a root/switch port and the other three
+PCI devices to a PCI bridge, leaving enough resources to create the needed PCI
+Express root/switch ports.  For example, we’re going to attach the GPUs to a PCI
+Express root port and the NICs to a PCI bridge.
+
+```yaml
+# /etc/cdi/mellanox.yaml
+cdiVersion: 0.4.0
+kind: mellanox.com/nic
+devices:
+- name: nic0
+  annotations:
+    bdf: "3d:00.0"
+    clique-id: "0"
+    attach-pci: "true"
+  containerEdits:
+    deviceNodes:
+    - path: "/dev/vfio/66"
+- name: nic1
+  annotations:
+    bdf: "3d:00.1"
+    clique-id: "1"
+    attach-pci: "true"
+  containerEdits:
+    deviceNodes:
+    - path: "/dev/vfio/67"
+```
+
+The configuration is set to use eight root ports for the GPUs and attach the
+NICs to a PCI bridge which is connected to a PCI Express-PCI bridge which is the
+preferred way of introducing a PCI topology in a PCI Express machine.
+
+```sh
+$ lspci -tv
+-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+           +-01.0 Red Hat, Inc. Virtio console
+           +-02.0 Red Hat, Inc. Virtio SCSI
+           +-03.0 Red Hat, Inc. 
Virtio RNG + +-04.0-[01]----00.0 NVIDIA Corporation Device 20b8 + +-05.0-[02]----00.0 NVIDIA Corporation Device 20b8 + +-06.0-[03]-- + +-07.0-[04]-- + +-08.0-[05]-- + +-09.0-[06]-- + +-0a.0-[07]-- + +-0b.0-[08]-- + +-0c.0-[09-0a]----00.0-[0a]--+-00.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6 + | \-01.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6 + +-0d.0 Red Hat, Inc. Virtio socket + +-0e.0 Red Hat, Inc. Virtio file system + +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller + +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller + \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller +``` + +The PCI devices will consume a slot of which we have 256 in the PCI(e) topology +and leave scarce resources for the needed PCI Express devices. From b2ce8b4d61cbca8fc082403f34c37ef48f77af8f Mon Sep 17 00:00:00 2001 From: David Esparza Date: Thu, 22 Jun 2023 09:23:13 -0600 Subject: [PATCH 25/87] metrics: Add memory footprint tests to the CI This PR adds memory foot print metrics to tests/metrics/density folder. Intentionally, each test exits w/ zero in all test cases to ensure that tests would be green when added, and will be enabled in a subsequent PR. A workflow matrix was added to define hypervisor variation on each job, in order to run them sequentially. The launch-times test was updated to make use of the matrix environment variables. 
Fixes: #7066 Signed-off-by: David Esparza --- .github/workflows/run-metrics.yaml | 18 +- tests/metrics/README.md | 2 + tests/metrics/density/README.md | 53 +++ tests/metrics/density/fast_footprint.sh | 433 ++++++++++++++++++ tests/metrics/density/footprint_data.md | 87 ++++ tests/metrics/density/footprint_data.sh | 360 +++++++++++++++ tests/metrics/density/memory_usage.sh | 381 +++++++++++++++ .../density/memory_usage_inside_container.sh | 134 ++++++ tests/metrics/gha-run.sh | 40 +- 9 files changed, 1494 insertions(+), 14 deletions(-) create mode 100644 tests/metrics/density/README.md create mode 100755 tests/metrics/density/fast_footprint.sh create mode 100644 tests/metrics/density/footprint_data.md create mode 100755 tests/metrics/density/footprint_data.sh create mode 100755 tests/metrics/density/memory_usage.sh create mode 100755 tests/metrics/density/memory_usage_inside_container.sh diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml index 5e06c3074..4ef44cb4d 100644 --- a/.github/workflows/run-metrics.yaml +++ b/.github/workflows/run-metrics.yaml @@ -8,9 +8,15 @@ on: jobs: run-metrics: + strategy: + fail-fast: true + matrix: + vmm: ['clh', 'qemu'] + max-parallel: 1 runs-on: metrics env: GOPATH: ${{ github.workspace }} + KATA_HYPERVISOR: ${{ matrix.vmm }} steps: - uses: actions/checkout@v3 with: @@ -25,8 +31,12 @@ jobs: - name: Install kata run: bash tests/metrics/gha-run.sh install-kata kata-artifacts - - name: run launch times on qemu - run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu + - name: run launch times test + run: bash tests/metrics/gha-run.sh run-test-launchtimes + + - name: run memory foot print test + run: bash tests/metrics/gha-run.sh run-test-memory-usage + + - name: run memory usage inside container test + run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container - - name: run launch times on clh - run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh diff --git 
a/tests/metrics/README.md b/tests/metrics/README.md index 1dd996046..d017ed3fc 100644 --- a/tests/metrics/README.md +++ b/tests/metrics/README.md @@ -55,6 +55,8 @@ For further details see the [time tests documentation](time). Tests that measure the size and overheads of the runtime. Generally this is looking at memory footprint sizes, but could also cover disk space or even CPU consumption. +For further details see the [density tests documentation](density). + ### Networking Tests relating to networking. General items could include: diff --git a/tests/metrics/density/README.md b/tests/metrics/density/README.md new file mode 100644 index 000000000..e07ee18b3 --- /dev/null +++ b/tests/metrics/density/README.md @@ -0,0 +1,53 @@ +# Kata Containers density metrics tests + +This directory contains a number of tests to help measure container +memory footprint. Some measures are based around the +[PSS](https://en.wikipedia.org/wiki/Proportional_set_size) of the runtime +components, and others look at the system level (`free` and `/proc/meminfo` +for instance) impact. + +## `memory_usage` + +This test measures the PSS footprint of the runtime components whilst +launching a number of small ([BusyBox](https://hub.docker.com/_/busybox/)) containers +using ctr. + +## `fast_footprint` + +This test takes system level resource measurements after launching a number of +containers in parallel and optionally waiting for KSM to settle its memory +compaction cycles. + +The script is quite configurable via environment variables, including: + +* Which container workload to run. +* How many containers to launch. +* How many containers are launched in parallel. +* How long to wait until taking the measures. + +See the script itself for more details. + +This test shares many config options with the `footprint_data` test. Thus, referring +to the [footprint test documentation](footprint_data.md) may be useful. 
+ +> *Note:* If this test finds KSM is enabled on the host, it will wait for KSM +> to "settle" before taking the final measurement. If your KSM is not configured +> to process all the allocated VM memory fast enough, the test will hit a timeout +> and proceed to take the final measurement anyway. + +## `footprint_data` + +Similar to the `fast_footprint` test, but this test launches the containers +sequentially and takes a system level measurement between each launch. Thus, +this test provides finer grained information on system scaling, but takes +significantly longer to run than the `fast_footprint` test. If you are only +interested in the final figure or the average impact, you may be better running +the `fast_footprint` test. + +For more details see the [footprint test documentation](footprint_data.md). + +## `memory_usage_inside_container` + +Measures the memory statistics *inside* the container. This allows evaluation of +the overhead the VM kernel and rootfs are having on the memory that was requested +by the container co-ordination system, and thus supplied to the VM. 
diff --git a/tests/metrics/density/fast_footprint.sh b/tests/metrics/density/fast_footprint.sh new file mode 100755 index 000000000..9c84f57fc --- /dev/null +++ b/tests/metrics/density/fast_footprint.sh @@ -0,0 +1,433 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# A script to gather memory 'footprint' information as we launch more +# and more containers +# +# The script gathers information about both user and kernel space consumption +# Output is into a .json file, named using some of the config component names +# (such as footprint-busybox.json) + +# Pull in some common, useful, items +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +# Note that all vars that can be set from outside the script (that is, +# passed in the ENV), use the ':-' setting to allow being over-ridden + +# Default sleep, in seconds, to let containers come up and finish their +# initialisation before we take the measures. Some of the larger +# containers can take a number of seconds to get running. +PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}" + +# How long, in seconds, do we wait for KSM to 'settle down', before we +# timeout and just continue anyway. +KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}" + +# How long, in seconds, do we poll for ctr to complete launching all the +# containers? +CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}" + +# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP +# nap +PARALLELISM="${PARALLELISM:-10}" + +### The default config - run a small busybox image +# Define what we will be running (app under test) +# Default is we run busybox, as a 'small' workload +PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}" +PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}" + +### +# which RUNTIME we use is picked up from the env in +# common.bash. 
You can over-ride by setting RUNTIME in your env + +### +# Define the cutoff checks for when we stop running the test + # Run up to this many containers +NUM_CONTAINERS="${NUM_CONTAINERS:-100}" + # Run until we have consumed this much memory (from MemFree) +MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}" + # Run until we have this much MemFree left +MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}" + +# Tools we need to have installed in order to operate +REQUIRED_COMMANDS="smem awk" + +# If we 'dump' the system caches before we measure then we get less +# noise in the results - they show more what our un-reclaimable footprint is +DUMP_CACHES="${DUMP_CACHES:-1}" + +# Affects the name of the file to store the results in +TEST_NAME="${TEST_NAME:-fast-footprint-busybox}" + +############# end of configurable items ################### + +# vars to remember where we started so we can calc diffs +base_mem_avail=0 +base_mem_free=0 + +# dump the kernel caches, so we get a more precise (or just different) +# view of what our footprint really is. 
+function dump_caches() { + sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" +} + +function init() { + restart_containerd_service + + check_cmds $REQUIRED_COMMANDS + sudo -E "${CTR_EXE}" image pull "$PAYLOAD" + + # Modify the test name if running with KSM enabled + check_for_ksm + + # Use the common init func to get to a known state + init_env + + # Prepare to start storing results + metrics_json_init + + # Store up baseline measures + base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}') + base_mem_free=$(get_memfree) + + # Store our configuration for this run + save_config +} + +save_config(){ + metrics_json_start_array + + local json="$(cat << EOF + { + "testname": "${TEST_NAME}", + "payload": "${PAYLOAD}", + "payload_args": "${PAYLOAD_ARGS}", + "payload_sleep": ${PAYLOAD_SLEEP}, + "ksm_settle_time": ${KSM_WAIT_TIME}, + "num_containers": ${NUM_CONTAINERS}, + "parallelism": ${PARALLELISM}, + "max_memory_consumed": "${MAX_MEMORY_CONSUMED}", + "min_memory_free": "${MIN_MEMORY_FREE}", + "dump_caches": "${DUMP_CACHES}" + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Config" +} + +function cleanup() { + # Finish storing the results + metrics_json_save + + clean_env_ctr +} + +# helper function to get USS of process in arg1 +function get_proc_uss() { + item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}') + ((item*=1024)) + echo $item +} + +# helper function to get PSS of process in arg1 +function get_proc_pss() { + item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}') + ((item*=1024)) + echo $item +} + +# Get the PSS for the whole of userspace (all processes) +# This allows us to see if we had any impact on the rest of the system, for instance +# dockerd grows as we launch containers, so we should account for that in our total +# memory breakdown +function grab_all_pss() { + item=$(sudo smem -t | tail -1 | awk '{print $5}') + ((item*=1024)) + + local json="$(cat << EOF + "all_pss": { + "pss": $item, + "Units": "KB" + 
} +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function grab_user_smem() { + # userspace + item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}') + ((item*=1024)) + + local json="$(cat << EOF + "user_smem": { + "userspace": $item, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function grab_slab() { + # Grabbing slab total from meminfo is easier than doing the math + # on slabinfo + item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}') + ((item*=1024)) + + local json="$(cat << EOF + "slab": { + "slab": $item, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function get_memfree() { + mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}') + ((mem_free*=1024)) + echo $mem_free +} + +function grab_system() { + + # avail memory, from 'free' + local avail=$(free -b | head -2 | tail -1 | awk '{print $7}') + local avail_decr=$((base_mem_avail-avail)) + + # cached memory, from 'free' + local cached=$(free -b | head -2 | tail -1 | awk '{print $6}') + + # free memory from smem + local smem_free=$(get_memfree) + local free_decr=$((base_mem_free-item)) + + # Anon pages + local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}') + ((anon*=1024)) + + # Mapped pages + local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}') + ((mapped*=1024)) + + # Cached + local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}') + ((meminfo_cached*=1024)) + + local json="$(cat << EOF + "system": { + "avail": $avail, + "avail_decr": $avail_decr, + "cached": $cached, + "smem_free": $smem_free, + "free_decr": $free_decr, + "anon": $anon, + "mapped": $mapped, + "meminfo_cached": $meminfo_cached, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function grab_stats() { + # If configured, dump the caches so we get a more stable + # view of what our static footprint really is + if [[ "$DUMP_CACHES" ]] ; then + dump_caches + fi + + # user space data + # 
PSS taken all userspace + grab_all_pss + # user as reported by smem + grab_user_smem + + # System overview data + # System free and cached + grab_system + + # kernel data + # The 'total kernel space taken' we can work out as: + # ktotal = ((free-avail)-user) + # So, we don't grab that number from smem, as that is what it does + # internally anyhow. + # Still try to grab any finer kernel details that we can though + + # totals from slabinfo + grab_slab + + metrics_json_close_array_element +} + +function check_limits() { + mem_free=$(get_memfree) + if ((mem_free <= MIN_MEMORY_FREE)); then + echo 1 + return + fi + + mem_consumed=$((base_mem_avail-mem_free)) + if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then + echo 1 + return + fi + + echo 0 +} + +launch_containers() { + local parloops leftovers + + (( parloops=${NUM_CONTAINERS}/${PARALLELISM} )) + (( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) )) + + echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} etras" + + containers=() + + local iter n + for iter in $(seq 1 $parloops); do + echo "Launch iteration ${iter}" + for n in $(seq 1 $PARALLELISM); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS & + done + + if [[ $PAYLOAD_SLEEP ]]; then + sleep $PAYLOAD_SLEEP + fi + + # check if we have hit one of our limits and need to wrap up the tests + if (($(check_limits))); then + echo "Ran out of resources, check_limits failed" + return + fi + done + + for n in $(seq 1 $leftovers); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS & + done +} + +wait_containers() { + local t numcontainers + # nap 3s between checks + local step=3 + + for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do + + numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l) + + if (( numcontainers >= ${NUM_CONTAINERS} )); then + echo "All containers now launched 
(${t}s)" + return + else + echo "Waiting for containers to launch (${numcontainers} at ${t}s)" + fi + sleep ${step} + done + + echo "Timed out waiting for containers to launch (${t}s)" + cleanup + die "Timed out waiting for containers to launch (${t}s)" +} + +function go() { + # Init the json cycle for this save + metrics_json_start_array + + # Grab the first set of stats before we run any containers. + grab_stats + + launch_containers + wait_containers + + if [ $ksm_on == "1" ]; then + echo "Wating for KSM to settle..." + wait_ksm_settle ${KSM_WAIT_TIME} + fi + + grab_stats + + # Wrap up the results array + metrics_json_end_array "Results" +} + +function show_vars() +{ + echo -e "\nEvironment variables:" + echo -e "\tName (default)" + echo -e "\t\tDescription" + echo -e "\tPAYLOAD (${PAYLOAD})" + echo -e "\t\tThe ctr image to run" + echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})" + echo -e "\t\tAny extra arguments passed into the docker 'run' command" + echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})" + echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling" + echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})" + echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure" + echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})" + echo -e "\t\tSeconds to poll for ctr to finish launching containers" + echo -e "\tPARALLELISM (${PARALLELISM})" + echo -e "\t\tNumber of containers we launch in parallel" + echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})" + echo -e "\t\tThe total number of containers to run" + echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})" + echo -e "\t\tThe maximum amount of memory to be consumed before terminating" + echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})" + echo -e "\t\tThe minimum amount of memory allowed to be free before terminating" + echo -e "\tDUMP_CACHES (${DUMP_CACHES})" + echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats" + echo -e "\tTEST_NAME (${TEST_NAME})" + 
echo -e "\t\tCan be set to over-ride the default JSON results filename" + +} + +function help() +{ + usage=$(cat << EOF +Usage: $0 [-h] [options] + Description: + Launch a series of workloads and take memory metric measurements after + each launch. + Options: + -h, Help page. +EOF +) + echo "$usage" + show_vars +} + +function main() { + + local OPTIND + while getopts "h" opt;do + case ${opt} in + h) + help + exit 0; + ;; + esac + done + shift $((OPTIND-1)) + + init + go + cleanup +} + +main "$@" diff --git a/tests/metrics/density/footprint_data.md b/tests/metrics/density/footprint_data.md new file mode 100644 index 000000000..b9ba27fe0 --- /dev/null +++ b/tests/metrics/density/footprint_data.md @@ -0,0 +1,87 @@ +# Footprint data script details + +The `footprint_data.sh` script runs a number of identical containers sequentially +via ctr and takes a number of memory related measurements after each +launch. The script is generally not used in a CI type environment, but is intended +to be run and analyzed manually. + +You can configure the script by setting a number of environment variables. + +The following sections list details of the configurable variables, along with a +small example invocation script. + +## Variables +Environment variables can take effect in two ways. + +Some variables affect how the payload is executed. The `RUNTIME` and `PAYLOAD` +arguments directly affect the payload execution with the following line in +the script: + +`$ ctr run --memory-limit $PAYLOAD_RUNTIME_ARGS --rm --runtime=$CONTAINERD_RUNTIME $PAYLOAD $NAME sh -c $PAYLOAD_ARGS` + +Other settings affect how memory footprint is measured and the test termination +conditions. 
+ +| Variable | Function +| -------- | -------- +| `PAYLOAD` | The ctr image to run +| `PAYLOAD_ARGS` | Any arguments passed into the ctr image +| `PAYLOAD_RUNTIME_ARGS` | Any extra arguments passed into the ctr `run` command +| `PAYLOAD_SLEEP` | Seconds to sleep between launch and measurement, to allow settling +| `MAX_NUM_CONTAINERS` | The maximum number of containers to run before terminating +| `MAX_MEMORY_CONSUMED` | The maximum amount of memory to be consumed before terminating +| `MIN_MEMORY_FREE` | The minimum amount of memory allowed to be free before terminating +| `DUMP_CACHES` | A flag to note if the system caches should be dumped before capturing stats +| `DATAFILE` | Can be set to over-ride the default JSON results filename + +## Output files +The names of the JSON files generated by the test are dictated by some of the parameters +the test is utilising. The default filename is generated in the form of: +`footprint-${PAYLOAD}[-ksm].json` + +## Measurements +The test measures, calculates, and stores a number of data items: + +| Item | Description +| ---- | ----------- +| `uss` | USS for all the VM runtime components +| `pss` | PSS for all the VM runtime components +| `all_pss` | PSS of all of userspace - to monitor if we had other impact on the system +| `user_smem` | `smem` "userspace" consumption value +| `avail` | "available" memory from `free` +| `avail_decr` | "available" memory decrease since start of test +| `cached` | "Cached" memory from `/proc/meminfo` +| `smem_free` | Free memory as reported by `smem` +| `free_decr` | Decrease in Free memory reported by `smem` since start of test +| `anon` | `AnonPages` as reported from `/proc/meminfo` +| `mapped` | Mapped pages as reported from `/proc/meminfo` +| `cached` | Cached pages as reported from `/proc/meminfo` +| `slab` | Slab as reported from `/proc/meminfo` + +## Example script +The following script is an example of how to configure the environment variables and +invoke the test script to run a 
number of different container tests. + +``` +#!/bin/bash + +set -e +set -x + +export MAX_NUM_CONTAINERS=10 +export MAX_MEMORY_CONSUMED=6*1024*1024*1024 + +function run() { + ### + # Define what we will be running (app under test) + # Default is we run busybox, as a 'small' workload + export PAYLOAD="quay.io/prometheus/busybox:latest" + export PAYLOAD_ARGS="tail -f /dev/null" + export PAYLOAD_SLEEP=10 + export PAYLOAD_RUNTIME_ARGS="5120" + sudo -E bash $(pwd)/density/footprint_data.sh +} + +export CONTAINERD_RUNTIME=io.containerd.kata.v2 +run +``` diff --git a/tests/metrics/density/footprint_data.sh b/tests/metrics/density/footprint_data.sh new file mode 100755 index 000000000..f5fc11341 --- /dev/null +++ b/tests/metrics/density/footprint_data.sh @@ -0,0 +1,360 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# A script to gather memory 'footprint' information as we launch more +# and more containers +# +# The script gathers information about both user and kernel space consumption +# Output is into a .json file, named using some of the config component names +# (such as footprint-busybox.json) + +# Pull in some common, useful, items +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run" + +# Note that all vars that can be set from outside the script (that is, +# passed in the ENV), use the ':-' setting to allow being over-ridden + +# Default sleep for 10s to let containers come up and finish their +# initialisation before we take the measures. Some of the larger +# containers can take a number of seconds to get running. 
+PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}" + +### The default config - run a small busybox image +# Define what we will be running (app under test) +# Default is we run busybox, as a 'small' workload +PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}" +PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}" + +### +# Define the cutoff checks for when we stop running the test + # Run up to this many containers +MAX_NUM_CONTAINERS="${MAX_NUM_CONTAINERS:-10}" + # Run until we have consumed this much memory (from MemFree) +MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-6*1024*1024*1024}" + # Run until we have this much MemFree left +MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}" + +# Tools we need to have installed in order to operate +REQUIRED_COMMANDS="smem awk" + +# If we 'dump' the system caches before we measure then we get less +# noise in the results - they show more what our un-reclaimable footprint is +DUMP_CACHES="${DUMP_CACHES:-1}" + +# Affects the name of the file to store the results in +TEST_NAME="${TEST_NAME:-footprint-busybox}" + +############# end of configurable items ################### + +# vars to remember where we started so we can calc diffs +base_mem_avail=0 +base_mem_free=0 + +# dump the kernel caches, so we get a more precise (or just different) +# view of what our footprint really is. 
+function dump_caches() { + sudo bash -c "echo 3 > /proc/sys/vm/drop_caches" +} + +function init() { + restart_containerd_service + + check_cmds $REQUIRED_COMMANDS + sudo -E "${CTR_EXE}" image pull "$PAYLOAD" + + # Modify the test name if running with KSM enabled + check_for_ksm + + # Use the common init func to get to a known state + init_env + + # Prepare to start storing results + metrics_json_init + + # Store up baseline measures + base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}') + base_mem_free=$(get_memfree) + + # Store our configuration for this run + save_config +} + +save_config(){ + metrics_json_start_array + + local json="$(cat << EOF + { + "testname": "${TEST_NAME}", + "payload": "${PAYLOAD}", + "payload_args": "${PAYLOAD_ARGS}", + "payload_sleep": ${PAYLOAD_SLEEP}, + "max_containers": ${MAX_NUM_CONTAINERS}, + "max_memory_consumed": "${MAX_MEMORY_CONSUMED}", + "min_memory_free": "${MIN_MEMORY_FREE}", + "dump_caches": "${DUMP_CACHES}" + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Config" +} + +function cleanup() { + # Finish storing the results + metrics_json_save + + clean_env_ctr +} + +# helper function to get USS of process in arg1 +function get_proc_uss() { + item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}') + ((item*=1024)) + echo $item +} + +# helper function to get PSS of process in arg1 +function get_proc_pss() { + item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}') + ((item*=1024)) + echo $item +} + +# Get the PSS for the whole of userspace (all processes) +# This allows us to see if we had any impact on the rest of the system, for instance +# containerd grows as we launch containers, so we should account for that in our total +# memory breakdown +function grab_all_pss() { + item=$(sudo smem -t | tail -1 | awk '{print $5}') + ((item*=1024)) + + local json="$(cat << EOF + "all_pss": { + "pss": $item, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + 
+function grab_user_smem() { + # userspace + item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}') + ((item*=1024)) + + local json="$(cat << EOF + "user_smem": { + "userspace": $item, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function grab_slab() { + # Grabbing slab total from meminfo is easier than doing the math + # on slabinfo + item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}') + ((item*=1024)) + + local json="$(cat << EOF + "slab": { + "slab": $item, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function get_memfree() { + mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}') + ((mem_free*=1024)) + echo $mem_free +} + +function grab_system() { + # avail memory, from 'free' + local avail=$(free -b | head -2 | tail -1 | awk '{print $7}') + local avail_decr=$((base_mem_avail-avail)) + + # cached memory, from 'free' + local cached=$(free -b | head -2 | tail -1 | awk '{print $6}') + + # free memory from smem + local smem_free=$(get_memfree) + local free_decr=$((base_mem_free-item)) + + # Anon pages + local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}') + ((anon*=1024)) + + # Mapped pages + local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}') + ((mapped*=1024)) + + # Cached + local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}') + ((meminfo_cached*=1024)) + + local json="$(cat << EOF + "system": { + "avail": $avail, + "avail_decr": $avail_decr, + "cached": $cached, + "smem_free": $smem_free, + "free_decr": $free_decr, + "anon": $anon, + "mapped": $mapped, + "meminfo_cached": $meminfo_cached, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function grab_stats() { + # If configured, dump the caches so we get a more stable + # view of what our static footprint really is + if [[ "$DUMP_CACHES" ]] ; then + dump_caches + fi + + # user space data + # PSS taken all userspace + grab_all_pss + # user as reported 
by smem + grab_user_smem + + # System overview data + # System free and cached + grab_system + + # kernel data + # The 'total kernel space taken' we can work out as: + # ktotal = ((free-avail)-user) + # So, we don't grab that number from smem, as that is what it does + # internally anyhow. + # Still try to grab any finer kernel details that we can though + + # totals from slabinfo + grab_slab + + metrics_json_close_array_element +} + +function check_limits() { + mem_free=$(get_memfree) + if ((mem_free <= MIN_MEMORY_FREE)); then + echo 1 + return + fi + + mem_consumed=$((base_mem_avail-mem_free)) + if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then + echo 1 + return + fi + + echo 0 +} + +function go() { + # Init the json cycle for this save + metrics_json_start_array + + containers=() + + for i in $(seq 1 $MAX_NUM_CONTAINERS); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run --rm --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS + + if [[ $PAYLOAD_SLEEP ]]; then + sleep $PAYLOAD_SLEEP + fi + + grab_stats + + # check if we have hit one of our limits and need to wrap up the tests + if (($(check_limits))); then + # Wrap up the results array + metrics_json_end_array "Results" + return + fi + done + + # Wrap up the results array + metrics_json_end_array "Results" +} + + +function show_vars() +{ + echo -e "\nEvironment variables:" + echo -e "\tName (default)" + echo -e "\t\tDescription" + echo -e "\tPAYLOAD (${PAYLOAD})" + echo -e "\t\tThe ctr image to run" + echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})" + echo -e "\t\tAny extra arguments passed into the ctr 'run' command" + echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})" + echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling" + echo -e "\tMAX_NUM_CONTAINERS (${MAX_NUM_CONTAINERS})" + echo -e "\t\tThe maximum number of containers to run before terminating" + echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})" + echo -e "\t\tThe maximum amount of memory to be consumed 
before terminating" + echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})" + echo -e "\t\tThe path to the ctr binary (for 'smem' measurements)" + echo -e "\tDUMP_CACHES (${DUMP_CACHES})" + echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats" + echo -e "\tTEST_NAME (${TEST_NAME})" + echo -e "\t\tCan be set to over-ride the default JSON results filename" + +} + +function help() +{ + usage=$(cat << EOF +Usage: $0 [-h] [options] + Description: + Launch a series of workloads and take memory metric measurements after + each launch. + Options: + -h, Help page. +EOF +) + echo "$usage" + show_vars +} + +function main() { + + local OPTIND + while getopts "h" opt;do + case ${opt} in + h) + help + exit 0; + ;; + esac + done + shift $((OPTIND-1)) + + init + go + cleanup +} + +main "$@" diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh new file mode 100755 index 000000000..455148845 --- /dev/null +++ b/tests/metrics/density/memory_usage.sh @@ -0,0 +1,381 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description of the test: +# This test launches a number of containers in idle mode, +# It will then sleep for a configurable period of time to allow +# any memory optimisations to 'settle, and then checks the +# amount of memory used by all the containers to come up with +# an average (using the PSS measurements) +# This test uses smem tool to get the memory used. + +set -e + +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +# Busybox image: Choose a small workload image, this is +# in order to measure the runtime footprint, not the workload +# footprint. 
+IMAGE='quay.io/prometheus/busybox:latest' + +CMD='tail -f /dev/null' +NUM_CONTAINERS="$1" +WAIT_TIME="$2" +AUTO_MODE="$3" +TEST_NAME="memory footprint" +SMEM_BIN="smem" +KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run" +MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX) +PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX) + +function remove_tmp_file() { + rm -rf $MEM_TMP_FILE $PS_TMP_FILE +} + +trap remove_tmp_file EXIT + +# Show help about this script +help(){ +cat << EOF +Usage: $0 [auto] + Description: + : Number of containers to run. + : Time in seconds to wait before taking + metrics. + [auto] : Optional 'auto KSM settle' mode + waits for ksm pages_shared to settle down +EOF +} + + +get_runc_pss_memory(){ + ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2" + get_pss_memory "$ctr_runc_shim_path" +} + +get_runc_individual_memory() { + runc_process_result=$(cat $MEM_TMP_FILE | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g') + + # Verify runc process result + if [ -z "$runc_process_result" ];then + die "Runc process not found" + fi + + read -r -a runc_values <<< "${runc_process_result}" + + metrics_json_start_array + + local json="$(cat << EOF + { + "runc individual results": [ + $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + printf '%s\n\t\t\t' "${runc_values[i]}" + done) + ] + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Raw results" +} + +# This function measures the PSS average +# memory of a process. 
+get_pss_memory(){ + ps="$1" + mem_amount=0 + count=0 + avg=0 + + if [ -z "$ps" ]; then + die "No argument to get_pss_memory()" + fi + + # Save all the processes names + # This will be help us to retrieve raw information + echo $ps >> $PS_TMP_FILE + + data=$(sudo "$SMEM_BIN" --no-header -P "^$ps" -c "pss" | sed 's/[[:space:]]//g') + + # Save all the smem results + # This will help us to retrieve raw information + echo $data >> $MEM_TMP_FILE + + for i in $data;do + if (( i > 0 ));then + mem_amount=$(( i + mem_amount )) + (( count++ )) + fi + done + + if (( $count > 0 ));then + avg=$(bc -l <<< "scale=2; $mem_amount / $count") + fi + + echo "$avg" +} + +ppid() { + local pid + pid=$(ps -p "${1:-nopid}" -o ppid=) + echo "${pid//[[:blank:]]/}" +} + +# This function measures the PSS average +# memory of virtiofsd. +# It is a special case of get_pss_memory, +# virtiofsd forks itself so, smem sees the process +# two times, this function sum both pss values: +# pss_virtiofsd=pss_fork + pss_parent +get_pss_memory_virtiofsd() { + mem_amount=0 + count=0 + avg=0 + + virtiofsd_path=${1:-} + if [ -z "${virtiofsd_path}" ]; then + die "virtiofsd_path not provided" + fi + + echo "${virtiofsd_path}" >> $PS_TMP_FILE + + virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}') + data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss") + + for p in ${virtiofsd_pids}; do + parent_pid=$(ppid ${p}) + cmd="$(cat /proc/${p}/cmdline | tr -d '\0')" + cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')" + if [ "${cmd}" != "${cmd_parent}" ]; then + pss_parent=$(printf "%s" "${data}" | grep "\s^${p}" | awk '{print $2}') + + fork=$(pgrep -P ${p}) + + pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}') + pss_process=$((pss_fork + pss_parent)) + + # Save all the smem results + # This will help us to retrieve raw information + echo "${pss_process}" >>$MEM_TMP_FILE + + if ((pss_process > 0)); then + mem_amount=$((pss_process + mem_amount)) + 
((count++)) + fi + fi + done + + if (( $count > 0 ));then + avg=$(bc -l <<< "scale=2; $mem_amount / $count") + fi + echo "${avg}" +} + +get_individual_memory(){ + # Getting all the individual container information + first_process_name=$(cat $PS_TMP_FILE | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + first_process_result=$(cat $MEM_TMP_FILE | awk 'NR==1' | sed 's/ /, /g') + + second_process_name=$(cat $PS_TMP_FILE | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + second_process_result=$(cat $MEM_TMP_FILE | awk 'NR==2' | sed 's/ /, /g') + + third_process_name=$(cat $PS_TMP_FILE | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + third_process_result=$(cat $MEM_TMP_FILE | awk 'NR==3' | sed 's/ /, /g') + + read -r -a first_values <<< "${first_process_result}" + read -r -a second_values <<< "${second_process_result}" + read -r -a third_values <<< "${third_process_result}" + + metrics_json_start_array + + local json="$(cat << EOF + { + "$first_process_name memory": [ + $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + [ -n "${first_values[i]}" ] && + printf '%s\n\t\t\t' "${first_values[i]}" + done) + ], + "$second_process_name memory": [ + $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + [ -n "${second_values[i]}" ] && + printf '%s\n\t\t\t' "${second_values[i]}" + done) + ], + "$third_process_name memory": [ + $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + [ -n "${third_values[i]}" ] && + printf '%s\n\t\t\t' "${third_values[i]}" + done) + ] + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Raw results" +} + +# Try to work out the 'average memory footprint' of a container. 
+get_docker_memory_usage(){ + hypervisor_mem=0 + virtiofsd_mem=0 + shim_mem=0 + memory_usage=0 + + containers=() + + for ((i=1; i<= NUM_CONTAINERS; i++)); do + containers+=($(random_name)) + ${CTR_EXE} run --runtime "${CTR_RUNTIME}" -d ${IMAGE} ${containers[-1]} ${CMD} + done + + if [ "$AUTO_MODE" == "auto" ]; then + if (( ksm_on != 1 )); then + die "KSM not enabled, cannot use auto mode" + fi + + echo "Entering KSM settle auto detect mode..." + wait_ksm_settle $WAIT_TIME + else + # If KSM is enabled, then you normally want to sleep long enough to + # let it do its work and for the numbers to 'settle'. + echo "napping $WAIT_TIME s" + sleep "$WAIT_TIME" + fi + + metrics_json_start_array + # Check the runtime in order in order to determine which process will + # be measured about PSS + if [ "$RUNTIME" == "runc" ]; then + runc_workload_mem="$(get_runc_pss_memory)" + memory_usage="$runc_workload_mem" + + local json="$(cat << EOF + { + "average": { + "Result": $memory_usage, + "Units" : "KB" + }, + "runc": { + "Result": $runc_workload_mem, + "Units" : "KB" + } + } +EOF +)" + + else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ] + # Get PSS memory of VM runtime components. + # And check that the smem search has found the process - we get a "0" + # back if that procedure fails (such as if a process has changed its name + # or is not running when expected to be so) + # As an added bonus - this script must be run as root. + # Now if you do not have enough rights + # the smem failure to read the stats will also be trapped. 
+ + hypervisor_mem="$(get_pss_memory "$HYPERVISOR_PATH")" + if [ "$hypervisor_mem" == "0" ]; then + die "Failed to find PSS for $HYPERVISOR_PATH" + fi + + virtiofsd_mem="$(get_pss_memory_virtiofsd "$VIRTIOFSD_PATH")" + if [ "$virtiofsd_mem" == "0" ]; then + echo >&2 "WARNING: Failed to find PSS for $VIRTIOFSD_PATH" + fi + shim_mem="$(get_pss_memory "$SHIM_PATH")" + if [ "$shim_mem" == "0" ]; then + die "Failed to find PSS for $SHIM_PATH" + fi + + mem_usage="$(bc -l <<< "scale=2; $hypervisor_mem +$virtiofsd_mem + $shim_mem")" + memory_usage="$mem_usage" + + local json="$(cat << EOF + { + "average": { + "Result": $mem_usage, + "Units" : "KB" + }, + "qemus": { + "Result": $hypervisor_mem, + "Units" : "KB" + }, + "virtiofsds": { + "Result": $virtiofsd_mem, + "Units" : "KB" + }, + "shims": { + "Result": $shim_mem, + "Units" : "KB" + } + } +EOF +)" + fi + + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" + + clean_env_ctr +} + +save_config(){ + metrics_json_start_array + + local json="$(cat << EOF + { + "containers": $NUM_CONTAINERS, + "ksm": $ksm_on, + "auto": "$AUTO_MODE", + "waittime": $WAIT_TIME, + "image": "$IMAGE", + "command": "$CMD" + } +EOF + +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Config" +} + +main(){ + # Verify enough arguments + if [ $# != 2 ] && [ $# != 3 ];then + echo >&2 "error: Not enough arguments [$@]" + help + exit 1 + fi + + #Check for KSM before reporting test name, as it can modify it + check_for_ksm + + init_env + + check_cmds "${SMEM_BIN}" bc + check_images "$IMAGE" + + if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then + export RUNTIME="kata-runtime" + elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then + export RUNTIME="runc" + else + die "Unknown runtime ${CTR_RUNTIME}" + fi + + metrics_json_init + save_config + get_docker_memory_usage + + if [ "$RUNTIME" == "runc" ]; then + get_runc_individual_memory + elif [ "$RUNTIME" == "kata-runtime" ]; then + get_individual_memory + fi 
+ + metrics_json_save +} + +main "$@" diff --git a/tests/metrics/density/memory_usage_inside_container.sh b/tests/metrics/density/memory_usage_inside_container.sh new file mode 100755 index 000000000..071ded175 --- /dev/null +++ b/tests/metrics/density/memory_usage_inside_container.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description of the test: +# This test launches a busybox container and inside +# memory free, memory available and total memory +# is measured by using /proc/meminfo. + +set -e + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +TEST_NAME="memory footprint inside container" +VERSIONS_FILE="${SCRIPT_PATH}/../../versions.yaml" +IMAGE='quay.io/prometheus/busybox:latest' +CMD="sleep 10; cat /proc/meminfo" +# We specify here in 'k', as that then matches the results we get from the meminfo, +# which makes later direct comparison easier. +MEMSIZE=${MEMSIZE:-$((2048*1024))} + +# this variable determines the number of attempts when a test +# result is considered not valid (a zero value or a negative value) +MAX_FAILED_ATTEMPTS=3 +memtotalAvg=0 +units_memtotal="" +memfreeAvg=0 +units_memfree="" +memavailableAvg=0 +units_memavailable="" + +# count_iters: is the index of the current iteration +count_iters=0 + +# valid_result: if value stored is '1' the result is valid, '0' otherwise +valid_result=0 + +parse_results() { + local raw_results="${1}" + + # Variables used for sum cummulative values in the case of two or more reps. + # and used to compute average results for 'json' output format. 
+ local memtotal_acu="${2:-0}" + local memfree_acu="${3:-0}" + local memavailable_acu="${4:-0}" + + local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}') + units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}') + + local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}') + units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}') + + local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}') + units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}') + + # check results: if any result is zero or negative, it is considered as invalid, and the test will be repeated. + if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then + MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1)) + valid_result=0 + info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable" + return 0 + fi + + memtotalAvg=$((memtotal+memtotal_acu)) + memfreeAvg=$((memfree+memfree_acu)) + memavailableAvg=$((memavailable+memavailable_acu)) + valid_result=1 + info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable" +} + +store_results_json() { + metrics_json_start_array + memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc) + memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc) + memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc) + + local json="$(cat << EOF + { + "memrequest": { + "Result" : ${MEMSIZE}, + "Units" : "Kb" + }, + "memtotal": { + "Result" : ${memtotalAvg}, + "Units" : "${units_memtotal}" + }, + "memfree": { + "Result" : ${memfreeAvg}, + "Units" : "${units_memfree}" + }, + "memavailable": { + "Result" : ${memavailableAvg}, + "Units" : "${units_memavailable}" + }, + "repetitions": { + "Result" : ${count_iters} + } + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" + metrics_json_save +} + 
+function main() { + # switch to select output format + local num_iterations=${1:-1} + info "Iterations: $num_iterations" + + # Check tools/commands dependencies + cmds=("awk" "ctr") + init_env + check_cmds "${cmds[@]}" + check_images "${IMAGE}" + metrics_json_init + while [ $count_iters -lt $num_iterations ]; do + local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1) + parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}" + + # quit if number of attempts exceeds the allowed value. + [ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded." + [ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1)) + done + store_results_json + clean_env_ctr +} + +# Parameters +# @1: num_iterations {integer} +main "$@" diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index e92a8c698..867161673 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -14,12 +14,11 @@ metrics_dir="$(dirname "$(readlink -f "$0")")" source "${metrics_dir}/../common.bash" function create_symbolic_links() { - hypervisor="${1:-qemu}" local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml" - local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml" + local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml" - if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then - die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name." + if [ ${KATA_HYPERVISOR} != 'qemu' ] && [ ${KATA_HYPERVISOR} != 'clh' ]; then + die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name." 
fi sudo ln -sf "${source_configuration_file}" "${link_configuration_file}" @@ -45,6 +44,8 @@ EOF } function install_kata() { + # ToDo: remove the exit once the metrics workflow is stable + exit 0 local kata_tarball="kata-static.tar.xz" declare -r katadir="/opt/kata" declare -r destdir="/" @@ -83,20 +84,39 @@ function check_containerd_config_for_kata() { } function run_test_launchtimes() { - hypervisor="${1}" + info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor" - info "Running Launch Time test using ${hypervisor} hypervisor" - - create_symbolic_links "${hypervisor}" + # ToDo: remove the exit once the metrics workflow is stable + exit 0 + create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 } +function run_test_memory_usage() { + info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor" + + # ToDo: remove the exit once the metrics workflow is stable + exit 0 + create_symbolic_links + bash tests/metrics/density/memory_usage.sh 20 5 +} + +function run_test_memory_usage_inside_container() { + info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor" + + # ToDo: remove the exit once the metrics workflow is stable + exit 0 + create_symbolic_links + bash tests/metrics/density/memory_usage_inside_container.sh 5 +} + function main() { action="${1:-}" case "${action}" in install-kata) install_kata ;; - run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;; - run-test-launchtimes-clh) run_test_launchtimes "clh" ;; + run-test-launchtimes) run_test_launchtimes ;; + run-test-memory-usage) run_test_memory_usage ;; + run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; *) >&2 die "Invalid argument" ;; esac } From 5681caad5c5281b0cadb33f16fda867f5c4ee4bf Mon Sep 17 00:00:00 2001 From: Bo Chen Date: Fri, 30 Jun 2023 09:34:07 -0700 Subject: [PATCH 26/87] versions: Upgrade to Cloud Hypervisor v33.0 Details of this release can be found in 
our roadmap project as iteration
v33.0: https://github.com/orgs/cloud-hypervisor/projects/6.

Fixes: #7216

Signed-off-by: Bo Chen 
---
 versions.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/versions.yaml b/versions.yaml
index ec9f8df7a..c26a68134 100644
--- a/versions.yaml
+++ b/versions.yaml
@@ -75,7 +75,7 @@ assets:
     url: "https://github.com/cloud-hypervisor/cloud-hypervisor"
     uscan-url: >-
       https://github.com/cloud-hypervisor/cloud-hypervisor/tags.*/v?(\d\S+)\.tar\.gz
-    version: "v32.0"
+    version: "v33.0"

   firecracker:
     description: "Firecracker micro-VMM"

From 6a21e20c6397f84bdfd393a28a4339c92f0812bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= 
Date: Wed, 28 Jun 2023 16:43:08 +0200
Subject: [PATCH 27/87] runtime: Add "none" as a shared_fs option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, even when using devmapper, if the VMM supports virtio-fs /
virtio-9p, that's used to share a few files between the host and the
guest.

This is *needed*, as we need to share with the guest contents like
secrets, certificates, and configurations, via Kubernetes objects like
configMaps or secrets, and those are rotated and must be updated into
the guest whenever the rotation happens.

However, there are still use-cases where users can live with just copying
those files into the guest at the pod creation time, and for those
there's absolutely no need to have a shared filesystem process running
with no extra obvious benefit, consuming memory and even increasing the
attack surface used by Kata Containers.

For the case mentioned above, we should allow users, making it very
clear which limitations it'll bring, to run Kata Containers with
devmapper without actually having to use a shared file system, which is
already the approach taken when using Firecracker as the VMM.
Fixes: #7207 Signed-off-by: Fabiano Fidêncio --- src/runtime/config/configuration-clh.toml.in | 5 ++++ .../configuration-qemu-nvidia-gpu.toml.in | 5 ++++ .../config/configuration-qemu-sev.toml.in | 7 ++++- .../config/configuration-qemu-snp.toml.in | 5 ++++ .../config/configuration-qemu-tdx.toml.in | 5 ++++ src/runtime/config/configuration-qemu.toml.in | 5 ++++ src/runtime/pkg/device/config/config.go | 11 ++++++++ src/runtime/pkg/katautils/config.go | 9 ++++--- src/runtime/virtcontainers/acrn.go | 2 +- src/runtime/virtcontainers/acrn_arch_base.go | 4 +-- .../virtcontainers/acrn_arch_base_test.go | 3 ++- src/runtime/virtcontainers/clh.go | 8 +++++- src/runtime/virtcontainers/clh_test.go | 27 +++++++++++++++++++ src/runtime/virtcontainers/qemu.go | 2 +- src/runtime/virtcontainers/qemu_amd64.go | 7 +++-- src/runtime/virtcontainers/qemu_amd64_test.go | 5 ++-- src/runtime/virtcontainers/qemu_arch_base.go | 8 +++--- .../virtcontainers/qemu_arch_base_test.go | 9 ++++++- src/runtime/virtcontainers/qemu_ppc64le.go | 7 +++-- 19 files changed, 113 insertions(+), 21 deletions(-) diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index 0ac6e1434..fd906ff0a 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -131,6 +131,11 @@ default_maxmemory = @DEFMAXMEMSZ@ # Shared file system type: # - virtio-fs (default) # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@" # Path to vhost-user-fs daemon. 
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 33574b17d..4861cb1ed 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@" # Path to vhost-user-fs daemon. diff --git a/src/runtime/config/configuration-qemu-sev.toml.in b/src/runtime/config/configuration-qemu-sev.toml.in index a108b726c..413ae1950 100644 --- a/src/runtime/config/configuration-qemu-sev.toml.in +++ b/src/runtime/config/configuration-qemu-sev.toml.in @@ -164,6 +164,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_SEV_VIRTIOFS@" # Path to vhost-user-fs daemon. @@ -647,4 +652,4 @@ service_offload = @DEFSERVICEOFFLOAD@ # # Keys can be remotely provisioned. The Kata agent fetches them from e.g. 
# a HTTPS URL: -#provision=https://my-key-broker.foo/tenant/ \ No newline at end of file +#provision=https://my-key-broker.foo/tenant/ diff --git a/src/runtime/config/configuration-qemu-snp.toml.in b/src/runtime/config/configuration-qemu-snp.toml.in index 6a608a133..54551cba5 100644 --- a/src/runtime/config/configuration-qemu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-snp.toml.in @@ -176,6 +176,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_SNP_VIRTIOFS@" # Path to vhost-user-fs daemon. diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in index 52f415f2f..fcf1dbe3d 100644 --- a/src/runtime/config/configuration-qemu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-tdx.toml.in @@ -172,6 +172,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@" # Path to vhost-user-fs daemon. 
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 37ee3a752..558d55265 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@" # Path to vhost-user-fs daemon. diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index dee6291ed..ed679da18 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -81,6 +81,17 @@ const ( // VirtioFSNydus means use nydus for the shared file system VirtioFSNydus = "virtio-fs-nydus" + + // NoSharedFS means *no* shared file system solution will be used + // and files will be copied into the guest system. + // + // WARNING: This should be carefully used, and only used in very few + // specific cases, as any update to the mount will *NOT* be reflected + // during the lifecycle of the pod, causing issues with rotation of + // secrets, certs, or configurations via kubernetes objects like + // configMaps or secrets, as those will be copied into the guest at + // *pod* *creation* *time*. 
+ NoSharedFS = "none" ) const ( diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 6b08f4afe..a0d232215 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -512,7 +512,7 @@ func (h hypervisor) blockDeviceAIO() (string, error) { } func (h hypervisor) sharedFS() (string, error) { - supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus} + supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS} if h.SharedFS == "" { return config.VirtioFS, nil @@ -1009,11 +1009,12 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { return vc.HypervisorConfig{}, err } - if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus { - return vc.HypervisorConfig{}, errors.New("clh only support virtio-fs or virtio-fs-nydus") + if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS { + return vc.HypervisorConfig{}, + fmt.Errorf("Cloud Hypervisor does not support %s shared filesystem option", sharedFS) } - if h.VirtioFSDaemon == "" { + if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" { return vc.HypervisorConfig{}, fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS) } diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index 35c71a6d6..b652ff71c 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -90,7 +90,7 @@ func (a *Acrn) Capabilities(ctx context.Context) types.Capabilities { span, _ := katatrace.Trace(ctx, a.Logger(), "Capabilities", acrnTracingTags, map[string]string{"sandbox_id": a.id}) defer span.End() - return a.arch.capabilities() + return a.arch.capabilities(a.config) } func (a *Acrn) HypervisorConfig() HypervisorConfig { diff --git a/src/runtime/virtcontainers/acrn_arch_base.go 
b/src/runtime/virtcontainers/acrn_arch_base.go index 77fb8e9e5..0b9ee53cc 100644 --- a/src/runtime/virtcontainers/acrn_arch_base.go +++ b/src/runtime/virtcontainers/acrn_arch_base.go @@ -33,7 +33,7 @@ type acrnArch interface { kernelParameters(debug bool) []Param //capabilities returns the capabilities supported by acrn - capabilities() types.Capabilities + capabilities(config HypervisorConfig) types.Capabilities // memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb memoryTopology(memMb uint64) Memory @@ -361,7 +361,7 @@ func (a *acrnArchBase) memoryTopology(memoryMb uint64) Memory { return memory } -func (a *acrnArchBase) capabilities() types.Capabilities { +func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities { var caps types.Capabilities caps.SetBlockDeviceSupport() diff --git a/src/runtime/virtcontainers/acrn_arch_base_test.go b/src/runtime/virtcontainers/acrn_arch_base_test.go index 61db47416..c34974e69 100644 --- a/src/runtime/virtcontainers/acrn_arch_base_test.go +++ b/src/runtime/virtcontainers/acrn_arch_base_test.go @@ -83,8 +83,9 @@ func TestAcrnArchBaseKernelParameters(t *testing.T) { func TestAcrnArchBaseCapabilities(t *testing.T) { assert := assert.New(t) acrnArchBase := newAcrnArchBase() + config := HypervisorConfig{} - c := acrnArchBase.capabilities() + c := acrnArchBase.capabilities(config) assert.True(c.IsBlockDeviceSupported()) assert.True(c.IsBlockDeviceHotplugSupported()) assert.False(c.IsFsSharingSupported()) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 6ae99d673..9c8d1faea 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -349,6 +349,10 @@ func (clh *cloudHypervisor) createVirtiofsDaemon(sharedPath string) (VirtiofsDae } func (clh *cloudHypervisor) setupVirtiofsDaemon(ctx context.Context) error { + if clh.config.SharedFS == config.NoSharedFS { + return nil + } + if clh.config.SharedFS == 
config.Virtio9P { return errors.New("cloud-hypervisor only supports virtio based file sharing") } @@ -1205,7 +1209,9 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities clh.Logger().WithField("function", "Capabilities").Info("get Capabilities") var caps types.Capabilities - caps.SetFsSharingSupport() + if clh.config.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } caps.SetBlockDeviceHotplugSupport() return caps } diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index b5c800e95..a5eacabca 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -726,3 +726,30 @@ func TestClhSetConfig(t *testing.T) { assert.Equal(clh.config, config) } + +func TestClhCapabilities(t *testing.T) { + assert := assert.New(t) + + hConfig, err := newClhConfig() + assert.NoError(err) + + clh := &cloudHypervisor{} + assert.Equal(clh.config, HypervisorConfig{}) + + hConfig.SharedFS = config.VirtioFS + + err = clh.setConfig(&hConfig) + assert.NoError(err) + + var ctx context.Context + c := clh.Capabilities(ctx) + assert.True(c.IsFsSharingSupported()) + + hConfig.SharedFS = config.NoSharedFS + + err = clh.setConfig(&hConfig) + assert.NoError(err) + + c = clh.Capabilities(ctx) + assert.False(c.IsFsSharingSupported()) +} diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index d856490e5..5ad55121c 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -211,7 +211,7 @@ func (q *qemu) Capabilities(ctx context.Context) types.Capabilities { span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() - return q.arch.capabilities() + return q.arch.capabilities(q.config) } func (q *qemu) HypervisorConfig() HypervisorConfig { diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index 
4ecf0804a..dc47f17cc 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -16,6 +16,7 @@ import ( "github.com/sirupsen/logrus" "github.com/intel-go/cpuid" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" ) @@ -155,7 +156,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { return q, nil } -func (q *qemuAmd64) capabilities() types.Capabilities { +func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities { var caps types.Capabilities if q.qemuMachine.Type == QemuQ35 || @@ -164,7 +165,9 @@ func (q *qemuAmd64) capabilities() types.Capabilities { } caps.SetMultiQueueSupport() - caps.SetFsSharingSupport() + if hConfig.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } return caps } diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index cb9a7bb38..82005b745 100644 --- a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -42,13 +42,14 @@ func TestQemuAmd64BadMachineType(t *testing.T) { func TestQemuAmd64Capabilities(t *testing.T) { assert := assert.New(t) + config := HypervisorConfig{} amd64 := newTestQemu(assert, QemuQ35) - caps := amd64.capabilities() + caps := amd64.capabilities(config) assert.True(caps.IsBlockDeviceHotplugSupported()) amd64 = newTestQemu(assert, QemuMicrovm) - caps = amd64.capabilities() + caps = amd64.capabilities(config) assert.False(caps.IsBlockDeviceHotplugSupported()) } diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 9aff1e76c..72c9853ce 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -51,7 +51,7 @@ type qemuArch interface { kernelParameters(debug bool) []Param //capabilities returns the capabilities supported 
by QEMU - capabilities() types.Capabilities + capabilities(config HypervisorConfig) types.Capabilities // bridges sets the number bridges for the machine type bridges(number uint32) @@ -280,11 +280,13 @@ func (q *qemuArchBase) kernelParameters(debug bool) []Param { return params } -func (q *qemuArchBase) capabilities() types.Capabilities { +func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities { var caps types.Capabilities caps.SetBlockDeviceHotplugSupport() caps.SetMultiQueueSupport() - caps.SetFsSharingSupport() + if hConfig.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } return caps } diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 51c11bd91..b0d08ddc9 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -117,9 +117,16 @@ func TestQemuArchBaseKernelParameters(t *testing.T) { func TestQemuArchBaseCapabilities(t *testing.T) { assert := assert.New(t) qemuArchBase := newQemuArchBase() + hConfig := HypervisorConfig{} + hConfig.SharedFS = config.VirtioFS - c := qemuArchBase.capabilities() + c := qemuArchBase.capabilities(hConfig) assert.True(c.IsBlockDeviceHotplugSupported()) + assert.True(c.IsFsSharingSupported()) + + hConfig.SharedFS = config.NoSharedFS + c = qemuArchBase.capabilities(hConfig) + assert.False(c.IsFsSharingSupported()) } func TestQemuArchBaseBridges(t *testing.T) { diff --git a/src/runtime/virtcontainers/qemu_ppc64le.go b/src/runtime/virtcontainers/qemu_ppc64le.go index 2d4010fbc..7d71d72ba 100644 --- a/src/runtime/virtcontainers/qemu_ppc64le.go +++ b/src/runtime/virtcontainers/qemu_ppc64le.go @@ -11,6 +11,7 @@ import ( "fmt" "time" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" 
"github.com/sirupsen/logrus" @@ -97,7 +98,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { return q, nil } -func (q *qemuPPC64le) capabilities() types.Capabilities { +func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities { var caps types.Capabilities // pseries machine type supports hotplugging drives @@ -106,7 +107,9 @@ func (q *qemuPPC64le) capabilities() types.Capabilities { } caps.SetMultiQueueSupport() - caps.SetFsSharingSupport() + if hConfig.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } return caps } From 532755ce31d069dafaa1427c55757f0973381647 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Mon, 17 Apr 2023 16:59:22 -0700 Subject: [PATCH 28/87] tests: Build Mariner rootfs initrd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adds a new `rootfs-initrd-mariner` build target. * Sets the custom initrd path via annotation in `setup.sh` at test time. * Adapts versions.yaml to specify a `cbl-mariner` initrd variant. * Introduces env variable `HOST_OS` at deploy time to enable using a custom initrd. * Refactors the image builder so that its caller specifies the desired guest OS. 
Signed-off-by: Aurélien Bombo --- tests/integration/gha-run.sh | 7 ++- tests/integration/kubernetes/setup.sh | 10 +++ tools/packaging/guest-image/build_image.sh | 63 +++++++++---------- .../local-build/kata-deploy-binaries.sh | 56 ++++++++++++++--- .../kata-deploy/scripts/kata-deploy.sh | 5 ++ versions.yaml | 26 +++++--- 6 files changed, 113 insertions(+), 54 deletions(-) diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 103ce2cda..b2493e3b9 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -9,7 +9,8 @@ set -o nounset set -o pipefail integration_dir="$(dirname "$(readlink -f "$0")")" -tools_dir="${integration_dir}/../../tools" +repo_root_dir="$(cd "${integration_dir}/../../" && pwd)" +tools_dir="${repo_root_dir}/tools" function _print_cluster_name() { short_sha="$(git rev-parse --short=12 HEAD)" @@ -56,9 +57,13 @@ function get_cluster_credentials() { } function run_tests() { + INSTALL_IN_GOPATH=false bash "${repo_root_dir}/ci/install_yq.sh" + platform="${1}" sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS" + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image" diff --git a/tests/integration/kubernetes/setup.sh b/tests/integration/kubernetes/setup.sh index 0c3baf2dc..63d9fb682 100755 --- a/tests/integration/kubernetes/setup.sh +++ b/tests/integration/kubernetes/setup.sh @@ -8,13 +8,23 @@ set -o nounset 
set -o pipefail kubernetes_dir=$(dirname "$(readlink -f "$0")") +repo_root_dir="$(cd "${kubernetes_dir}/../../../" && pwd)" set_runtime_class() { sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml } +set_initrd_path() { + if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then + initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img" + find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \; + fi +} + main() { + INSTALL_IN_GOPATH=false bash "${repo_root_dir}/ci/install_yq.sh" set_runtime_class + set_initrd_path } main "$@" diff --git a/tools/packaging/guest-image/build_image.sh b/tools/packaging/guest-image/build_image.sh index 230538d1c..fad664651 100755 --- a/tools/packaging/guest-image/build_image.sh +++ b/tools/packaging/guest-image/build_image.sh @@ -22,45 +22,44 @@ readonly osbuilder_dir="$(cd "${repo_root_dir}/tools/osbuilder" && pwd)" export GOPATH=${GOPATH:-${HOME}/go} arch_target="$(uname -m)" -final_image_name="kata-containers" -final_initrd_name="kata-containers-initrd" +final_artifact_name="kata-containers" image_initrd_extension=".img" build_initrd() { info "Build initrd" - info "initrd os: $initrd_distro" - info "initrd os version: $initrd_os_version" + info "initrd os: $os_name" + info "initrd os version: $os_version" sudo -E PATH="$PATH" make initrd \ - DISTRO="$initrd_distro" \ + DISTRO="$os_name" \ DEBUG="${DEBUG:-}" \ - OS_VERSION="${initrd_os_version}" \ + OS_VERSION="${os_version}" \ ROOTFS_BUILD_DEST="${builddir}/initrd-image" \ USE_DOCKER=1 \ AGENT_INIT="yes" - mv "kata-containers-initrd.img" "${install_dir}/${initrd_name}" + mv "kata-containers-initrd.img" "${install_dir}/${artifact_name}" ( cd "${install_dir}" - ln -sf "${initrd_name}" "${final_initrd_name}${image_initrd_extension}" + ln -sf "${artifact_name}" 
"${final_artifact_name}${image_initrd_extension}" ) } build_image() { info "Build image" - info "image os: $img_distro" - info "image os version: $img_os_version" + info "image os: $os_name" + info "image os version: $os_version" sudo -E PATH="${PATH}" make image \ - DISTRO="${img_distro}" \ + DISTRO="${os_name}" \ DEBUG="${DEBUG:-}" \ USE_DOCKER="1" \ - IMG_OS_VERSION="${img_os_version}" \ + IMG_OS_VERSION="${os_version}" \ ROOTFS_BUILD_DEST="${builddir}/rootfs-image" - mv -f "kata-containers.img" "${install_dir}/${image_name}" + mv -f "kata-containers.img" "${install_dir}/${artifact_name}" if [ -e "root_hash.txt" ]; then cp root_hash.txt "${install_dir}/" fi ( cd "${install_dir}" - ln -sf "${image_name}" "${final_image_name}${image_initrd_extension}" + ln -sf "${artifact_name}" "${final_artifact_name}${image_initrd_extension}" ) } @@ -74,6 +73,8 @@ Usage: ${script_name} [options] Options: + --osname=${os_name} + --osversion=${os_version} --imagetype=${image_type} --prefix=${prefix} --destdir=${destdir} @@ -94,33 +95,20 @@ main() { case "$opt" in -) case "${OPTARG}" in + osname=*) + os_name=${OPTARG#*=} + ;; + osversion=*) + os_version=${OPTARG#*=} + ;; imagetype=image) image_type=image - #image information - img_distro=$(get_from_kata_deps "assets.image.architecture.${arch_target}.name") - img_os_version=$(get_from_kata_deps "assets.image.architecture.${arch_target}.version") - image_name="kata-${img_distro}-${img_os_version}.${image_type}" ;; imagetype=initrd) image_type=initrd - #initrd information - initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.name") - initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.version") - initrd_name="kata-${initrd_distro}-${initrd_os_version}.${image_type}" ;; image_initrd_suffix=*) image_initrd_suffix=${OPTARG#*=} - if [ "${image_initrd_suffix}" == "sev" ]; then - initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.sev.name") - 
initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.sev.version") - initrd_name="kata-${initrd_distro}-${initrd_os_version}-${image_initrd_suffix}.${image_type}" - final_initrd_name="${final_initrd_name}-${image_initrd_suffix}" - elif [ "${image_initrd_suffix}" == "tdx" ]; then - img_distro=$(get_from_kata_deps "assets.image.architecture.${arch_target}.name") - img_os_version=$(get_from_kata_deps "assets.image.architecture.${arch_target}.version") - image_name="kata-${img_distro}-${img_os_version}-${image_initrd_suffix}.${image_type}" - final_image_name="${final_image_name}-${image_initrd_suffix}" - fi ;; prefix=*) prefix=${OPTARG#*=} @@ -149,7 +137,16 @@ main() { echo "build ${image_type}" + if [ "${image_type}" = "initrd" ]; then + final_artifact_name+="-initrd" + fi + if [ -n "${image_initrd_suffix}" ]; then + artifact_name="kata-${os_name}-${os_version}-${image_initrd_suffix}.${image_type}" + final_artifact_name+="-${image_initrd_suffix}" + else + artifact_name="kata-${os_name}-${os_version}.${image_type}" + fi install_dir="${destdir}/${prefix}/share/kata-containers/" readonly install_dir diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index a552aed12..58e62bb48 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -97,6 +97,7 @@ options: rootfs-image rootfs-image-tdx rootfs-initrd + rootfs-initrd-mariner rootfs-initrd-sev shim-v2 tdvf @@ -136,8 +137,13 @@ install_cached_tarball_component() { #Install guest image install_image() { - local image_type="${1:-"image"}" - local initrd_suffix="${2:-""}" + local variant="${1:-}" + + image_type="image" + if [ -n "${variant}" ]; then + image_type+="-${variant}" + fi + local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${image_type}-$(uname -m)/${cached_artifacts_path}" local 
component="rootfs-${image_type}" @@ -152,25 +158,39 @@ install_image() { install_cached_tarball_component \ "${component}" \ "${jenkins}" \ - "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-image" \ + "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${image_type}" \ "" \ "${final_tarball_name}" \ "${final_tarball_path}" \ && return 0 info "Create image" - "${rootfs_builder}" --imagetype=image --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${initrd_suffix}" + + if [ -n "${variant}" ]; then + os_name="$(get_from_kata_deps "assets.image.architecture.${ARCH}.${variant}.name")" + os_version="$(get_from_kata_deps "assets.image.architecture.${ARCH}.${variant}.version")" + else + os_name="$(get_from_kata_deps "assets.image.architecture.${ARCH}.name")" + os_version="$(get_from_kata_deps "assets.image.architecture.${ARCH}.version")" + fi + + "${rootfs_builder}" --osname="${os_name}" --osversion="${os_version}" --imagetype=image --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${variant}" } #Install guest image for tdx install_image_tdx() { - install_image "image-tdx" "tdx" + install_image "tdx" } #Install guest initrd install_initrd() { - local initrd_type="${1:-"initrd"}" - local initrd_suffix="${2:-""}" + local variant="${1:-}" + + initrd_type="initrd" + if [ -n "${variant}" ]; then + initrd_type+="-${variant}" + fi + local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${initrd_type}-$(uname -m)/${cached_artifacts_path}" local component="rootfs-${initrd_type}" @@ -192,12 +212,26 @@ install_initrd() { && return 0 info "Create initrd" - "${rootfs_builder}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${initrd_suffix}" + + if [ -n "${variant}" ]; then + os_name="$(get_from_kata_deps 
"assets.initrd.architecture.${ARCH}.${variant}.name")" + os_version="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.${variant}.version")" + else + os_name="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.name")" + os_version="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.version")" + fi + + "${rootfs_builder}" --osname="${os_name}" --osversion="${os_version}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${variant}" +} + +#Install Mariner guest initrd +install_initrd_mariner() { + install_initrd "cbl-mariner" } #Install guest initrd for sev install_initrd_sev() { - install_initrd "initrd-sev" "sev" + install_initrd "sev" } #Install kernel component helper @@ -561,6 +595,7 @@ handle_build() { install_firecracker install_image install_initrd + install_initrd_mariner install_initrd_sev install_kernel install_kernel_dragonball_experimental @@ -616,7 +651,7 @@ handle_build() { rootfs-initrd) install_initrd ;; - rootfs-initrd-mariner) ;; + rootfs-initrd-mariner) install_initrd_mariner ;; rootfs-initrd-sev) install_initrd_sev ;; @@ -662,6 +697,7 @@ main() { qemu rootfs-image rootfs-initrd + rootfs-initrd-mariner shim-v2 virtiofsd ) diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index 6bb660198..155bdf1ff 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -64,6 +64,11 @@ function install_artifacts() { chmod +x /opt/kata/bin/* [ -d /opt/kata/runtime-rs/bin ] && \ chmod +x /opt/kata/runtime-rs/bin/* + + # Allow Mariner to specify a Mariner guest initrd. 
+ if [ "${HOST_OS:-}" == "cbl-mariner" ]; then + sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd"]|' /opt/kata/share/defaults/kata-containers/configuration-clh.toml + fi } function wait_till_node_is_ready() { diff --git a/versions.yaml b/versions.yaml index ec9f8df7a..2a3623878 100644 --- a/versions.yaml +++ b/versions.yaml @@ -122,17 +122,20 @@ assets: url: "https://github.com/kata-containers/kata-containers/tools/osbuilder" architecture: aarch64: - name: "ubuntu" - version: "latest" - ppc64le: - name: "ubuntu" - version: "latest" - s390x: - name: "ubuntu" - version: "latest" - x86_64: name: &default-image-name "ubuntu" - version: "latest" + version: &default-image-version "latest" + ppc64le: + name: *default-image-name + version: *default-image-version + s390x: + name: *default-image-name + version: *default-image-version + x86_64: + name: *default-image-name + version: *default-image-version + tdx: + name: *default-image-name + version: *default-image-version meta: image-type: *default-image-name @@ -156,6 +159,9 @@ assets: x86_64: name: *default-initrd-name version: *default-initrd-version + cbl-mariner: + name: "cbl-mariner" + version: "2.0" sev: name: *glibc-initrd-name version: *glibc-initrd-version From 80c78eadcea44018687e63a84b975386258fdc59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Tue, 18 Apr 2023 13:29:38 -0700 Subject: [PATCH 29/87] tests: Use baked-in kernel with Mariner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mariner ships a bleeding-edge kernel that might be ahead of upstream, so we use that to guarantee compatibility with the host. 
Signed-off-by: Aurélien Bombo --- tests/integration/kubernetes/setup.sh | 8 ++++++++ tools/packaging/kata-deploy/scripts/kata-deploy.sh | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/integration/kubernetes/setup.sh b/tests/integration/kubernetes/setup.sh index 63d9fb682..639826bc8 100755 --- a/tests/integration/kubernetes/setup.sh +++ b/tests/integration/kubernetes/setup.sh @@ -14,6 +14,13 @@ set_runtime_class() { sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml } +set_kernel_path() { + if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then + mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin" + find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \; + fi +} + set_initrd_path() { if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img" @@ -24,6 +31,7 @@ set_initrd_path() { main() { INSTALL_IN_GOPATH=false bash "${repo_root_dir}/ci/install_yq.sh" set_runtime_class + set_kernel_path set_initrd_path } diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index 155bdf1ff..368492bd7 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -65,9 +65,9 @@ function install_artifacts() { [ -d /opt/kata/runtime-rs/bin ] && \ chmod +x /opt/kata/runtime-rs/bin/* - # Allow Mariner to specify a Mariner guest initrd. + # Allow Mariner to use custom configuration. 
if [ "${HOST_OS:-}" == "cbl-mariner" ]; then - sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd"]|' /opt/kata/share/defaults/kata-containers/configuration-clh.toml + sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd", "kernel"]|' /opt/kata/share/defaults/kata-containers/configuration-clh.toml fi } From 2b59756894e6e50ce85cf1ae4fd46d9566a049b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Fri, 2 Jun 2023 13:17:34 -0700 Subject: [PATCH 30/87] tests: Build CLH with glibc for Mariner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enables building CLH with glibc and the mshv feature as required for Mariner. At test time, it also configures Kata to use that CLH flavor when running Mariner. Signed-off-by: Aurélien Bombo --- .../local-build/kata-deploy-binaries.sh | 44 ++++++++++++++----- .../kata-deploy/scripts/kata-deploy.sh | 6 ++- .../cloud-hypervisor/build-static-clh.sh | 6 +-- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index 58e62bb48..14910f43f 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -79,6 +79,7 @@ options: --build= : all cloud-hypervisor + cloud-hypervisor-glibc firecracker kernel kernel-dragonball-experimental @@ -447,26 +448,47 @@ install_firecracker() { sudo install -D --owner root --group root --mode 0744 release-${firecracker_version}-${ARCH}/jailer-${firecracker_version}-${ARCH} "${destdir}/opt/kata/bin/jailer" } -# Install static cloud-hypervisor asset -install_clh() { +install_clh_helper() { + libc="${1}" + features="${2}" + suffix="${3:-""}" + install_cached_tarball_component \ - "cloud-hypervisor" \ - "${jenkins_url}/job/kata-containers-main-clh-$(uname -m)/${cached_artifacts_path}" \ + 
"cloud-hypervisor${suffix}" \ + "${jenkins_url}/job/kata-containers-main-clh-$(uname -m)${suffix}/${cached_artifacts_path}" \ "$(get_from_kata_deps "assets.hypervisor.cloud_hypervisor.version")" \ "" \ "${final_tarball_name}" \ "${final_tarball_path}" \ && return 0 - if [[ "${ARCH}" == "x86_64" ]]; then - export features="tdx" - fi - info "build static cloud-hypervisor" - "${clh_builder}" + libc="${libc}" features="${features}" "${clh_builder}" info "Install static cloud-hypervisor" mkdir -p "${destdir}/opt/kata/bin/" - sudo install -D --owner root --group root --mode 0744 cloud-hypervisor/cloud-hypervisor "${destdir}/opt/kata/bin/cloud-hypervisor" + sudo install -D --owner root --group root --mode 0744 cloud-hypervisor/cloud-hypervisor "${destdir}/opt/kata/bin/cloud-hypervisor${suffix}" +} + +# Install static cloud-hypervisor asset +install_clh() { + if [[ "${ARCH}" == "x86_64" ]]; then + features="mshv,tdx" + else + features="" + fi + + install_clh_helper "musl" "${features}" +} + +# Install static cloud-hypervisor-glibc asset +install_clh_glibc() { + if [[ "${ARCH}" == "x86_64" ]]; then + features="mshv" + else + features="" + fi + + install_clh_helper "gnu" "${features}" "-glibc" } # Install static virtiofsd asset @@ -613,7 +635,7 @@ handle_build() { cloud-hypervisor) install_clh ;; - cloud-hypervisor-glibc) ;; + cloud-hypervisor-glibc) install_clh_glibc ;; firecracker) install_firecracker ;; diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index 368492bd7..09d27cc65 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -67,7 +67,11 @@ function install_artifacts() { # Allow Mariner to use custom configuration. 
if [ "${HOST_OS:-}" == "cbl-mariner" ]; then - sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd", "kernel"]|' /opt/kata/share/defaults/kata-containers/configuration-clh.toml + config_path="/opt/kata/share/defaults/kata-containers/configuration-clh.toml" + clh_path="/opt/kata/bin/cloud-hypervisor-glibc" + sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd", "kernel"]|' "${config_path}" + sed -i -E "s|(valid_hypervisor_paths) = .+|\1 = [\"${clh_path}\"]|" "${config_path}" + sed -i -E "s|(path) = \".+/cloud-hypervisor\"|\1 = \"${clh_path}\"|" "${config_path}" fi } diff --git a/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh b/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh index 975a517a1..f381897bc 100755 --- a/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh +++ b/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh @@ -76,12 +76,12 @@ build_clh_from_source() { if [ -n "${features}" ]; then info "Build cloud-hypervisor enabling the following features: ${features}" - ./scripts/dev_cli.sh build --release --libc musl --features "${features}" + ./scripts/dev_cli.sh build --release --libc "${libc}" --features "${features}" else - ./scripts/dev_cli.sh build --release --libc musl + ./scripts/dev_cli.sh build --release --libc "${libc}" fi rm -f cloud-hypervisor - cp build/cargo_target/$(uname -m)-unknown-linux-musl/release/cloud-hypervisor . + cp build/cargo_target/$(uname -m)-unknown-linux-${libc}/release/cloud-hypervisor . popd } From 0152c9aba5c8134111acb671d05d4722f986d3b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Tue, 13 Jun 2023 11:37:18 -0700 Subject: [PATCH 31/87] tools: Introduce `USE_CACHE` environment variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows setting `USE_CACHE=no` to test building e2e during developmet without having to comment code blocks and so forth. 
Signed-off-by: Aurélien Bombo --- .../local-build/kata-deploy-binaries-in-docker.sh | 1 + .../kata-deploy/local-build/kata-deploy-binaries.sh | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh index 11589c88a..fc82082c4 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh @@ -65,6 +65,7 @@ docker run \ --env TDSHIM_CONTAINER_BUILDER="${TDSHIM_CONTAINER_BUILDER:-}" \ --env VIRTIOFSD_CONTAINER_BUILDER="${VIRTIOFSD_CONTAINER_BUILDER:-}" \ --env MEASURED_ROOTFS="${MEASURED_ROOTFS:-}" \ + --env USE_CACHE="${USE_CACHE:-}" \ --rm \ -w ${script_dir} \ build-kata-deploy "${kata_deploy_create}" $@ diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index 14910f43f..01a2d848f 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -40,6 +40,7 @@ readonly cached_artifacts_path="lastSuccessfulBuild/artifact/artifacts" ARCH=$(uname -m) MEASURED_ROOTFS=${MEASURED_ROOTFS:-no} +USE_CACHE="${USE_CACHE:-"yes"}" workdir="${WORKDIR:-$PWD}" @@ -115,6 +116,10 @@ cleanup_and_fail() { } install_cached_tarball_component() { + if [ "${USE_CACHE}" != "yes" ]; then + return 1 + fi + local component="${1}" local jenkins_build_url="${2}" local current_version="${3}" From 1c211cd730e51008deb9cd1344e551f104cfb6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Tue, 13 Jun 2023 12:07:49 -0700 Subject: [PATCH 32/87] gha: Swap asset/release in build matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This simply displays the asset name first in GH's UI, so that the release name (always 
"test") is truncated rather than the asset name. Makes things slightly easier to read. e.g. build-asset (cloud-hypervisor-glibc, te... instead of build-asset (test, cloud-hypervisor-gli... Signed-off-by: Aurélien Bombo --- .github/workflows/build-kata-static-tarball-amd64.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml index b5c7584fe..9a0cd755c 100644 --- a/.github/workflows/build-kata-static-tarball-amd64.yaml +++ b/.github/workflows/build-kata-static-tarball-amd64.yaml @@ -19,8 +19,6 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - stage: - - ${{ inputs.stage }} asset: - cloud-hypervisor - cloud-hypervisor-glibc @@ -46,9 +44,11 @@ jobs: - shim-v2 - tdvf - virtiofsd + stage: + - ${{ inputs.stage }} exclude: - - stage: release - asset: cloud-hypervisor-glibc + - asset: cloud-hypervisor-glibc + stage: release steps: - name: Login to Kata Containers quay.io if: ${{ inputs.push-to-registry == 'yes' }} From 477856c1e369c54c717063554b30e44b5396c89c Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 3 Jul 2023 17:04:27 +0200 Subject: [PATCH 33/87] gha: dragonball: Correctly propagate PATH update cargo/rust is installed in one step, we need to write the PATH update to GITHUBENV so that it becomes visible in the next steps. 
Fixes: #7221 Signed-off-by: Jeremi Piotrowski --- .github/workflows/static-checks-dragonball.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/static-checks-dragonball.yaml b/.github/workflows/static-checks-dragonball.yaml index 9c8ae694b..61e3fe2c4 100644 --- a/.github/workflows/static-checks-dragonball.yaml +++ b/.github/workflows/static-checks-dragonball.yaml @@ -23,7 +23,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }} run: | ./ci/install_rust.sh - PATH=$PATH:"$HOME/.cargo/bin" + echo PATH="$HOME/.cargo/bin:$PATH" >> $GITHUB_ENV - name: Run Unit Test if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }} run: | From cc8f0a24e493c84e68c87d45d37a35b02c39b2c1 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 28 Jun 2023 16:39:10 +0000 Subject: [PATCH 34/87] metrics: Add checkmetrics to gha-run.sh for metrics CI This PR adds checkmetrics installation for gha-run.sh in order to compare results limits as part of the metrics CI. 
Fixes #7198 Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 867161673..d7108eaf7 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -13,6 +13,10 @@ kata_tarball_dir=${2:-kata-artifacts} metrics_dir="$(dirname "$(readlink -f "$0")")" source "${metrics_dir}/../common.bash" +declare -r results_dir="${metrics_dir}/results" +declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics" +declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker" + function create_symbolic_links() { local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml" local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml" @@ -83,6 +87,22 @@ function check_containerd_config_for_kata() { fi } +function check_metrics() { + # Ensure we have the latest checkemtrics + pushd "${checkmetrics_dir}" + make + sudo make install + popd + + local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${hypervisor}-$(uname -n).toml" + checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}" + cm_result=$? + if [ "${cm_result}" != 0 ]; then + info "run-metrics-ci: checkmetrics FAILED (${cm_result})" + exit "${cm_result}" + fi +} + function run_test_launchtimes() { info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor" @@ -119,6 +139,8 @@ function main() { run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; *) >&2 die "Invalid argument" ;; esac + + check_metrics } main "$@" From 917576e6fb5e67c2325c9158593eed57558a4a6e Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 28 Jun 2023 16:46:01 +0000 Subject: [PATCH 35/87] metrics: Add double quotes in all variables This PR adds double quotes in all variables to have uniformity across all the gha-run.sh script. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index d7108eaf7..fca5c3ff6 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -9,7 +9,7 @@ set -o errexit set -o nounset set -o pipefail -kata_tarball_dir=${2:-kata-artifacts} +kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" source "${metrics_dir}/../common.bash" @@ -58,7 +58,7 @@ function install_kata() { # Removing previous kata installation sudo rm -rf "${katadir}" - pushd ${kata_tarball_dir} + pushd "${kata_tarball_dir}" sudo tar -xvf "${kata_tarball}" -C "${destdir}" popd @@ -77,9 +77,9 @@ function check_containerd_config_for_kata() { declare -r line2="runtime_type = \"io.containerd.kata.v2\"" declare -r num_lines_containerd=2 declare -r containerd_path="/etc/containerd/config.toml" - local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path}) + local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}") - if [ $count_matches = $num_lines_containerd ]; then + if [ "${count_matches}" = "${num_lines_containerd}" ]; then info "containerd ok" else info "overwriting containerd configuration w/ a valid one" From e94aaed3c7d009e2fd8ea4b64cf558c2730e430c Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 28 Jun 2023 17:13:20 +0000 Subject: [PATCH 36/87] ci_worker: Add checkmetrics ci worker for cloud hypervisor This PR adds the checkmetrics ci worker file for cloud hypervisor in order to check the boot times limit. 
Signed-off-by: Gabriela Cervantes --- ...cs-json-cloud-hypervisor-kata-metric8.toml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml new file mode 100644 index 000000000..af5c8c634 --- /dev/null +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml @@ -0,0 +1,21 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# This file contains baseline expectations +# for checked results by checkmetrics tool. +# +# values set specifically for packet.com c1.small worker. + +[[metric]] +name = "boot-times" +type = "json" +description = "measure container lifecycle timings" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" +checktype = "mean" +midval = 0.54 +minpercent = 10.0 +maxpercent = 10.0 From b481ef18833aed9ccfae019ae4e65b957d1dec77 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 28 Jun 2023 19:03:37 +0000 Subject: [PATCH 37/87] makefile: Add -buildvcs=false flag to go build This PR adds the -buildvcs=false flag to the go build of checkmetrics. 
Signed-off-by: Gabriela Cervantes --- ...ric8.toml => checkmetrics-json-clh-kata-metric8.toml} | 6 +++--- tests/metrics/gha-run.sh | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) rename tests/metrics/cmd/checkmetrics/ci_worker/{checkmetrics-json-cloud-hypervisor-kata-metric8.toml => checkmetrics-json-clh-kata-metric8.toml} (91%) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml similarity index 91% rename from tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml rename to tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index af5c8c634..7966b69eb 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-cloud-hypervisor-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -16,6 +16,6 @@ description = "measure container lifecycle timings" # within (inclusive) checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" checktype = "mean" -midval = 0.54 -minpercent = 10.0 -maxpercent = 10.0 +midval = 0.42 +minpercent = 15.0 +maxpercent = 15.0 diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index fca5c3ff6..05b426fbc 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -8,6 +8,7 @@ set -o errexit set -o nounset set -o pipefail +set -x kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" @@ -94,7 +95,7 @@ function check_metrics() { sudo make install popd - local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${hypervisor}-$(uname -n).toml" + local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-clh-kata-metric8.toml" checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}" cm_result=$? 
if [ "${cm_result}" != 0 ]; then @@ -110,6 +111,10 @@ function run_test_launchtimes() { exit 0 create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 + + if [ "${hypervisor}" = "clh" ]; then + check_metrics + fi } function run_test_memory_usage() { @@ -139,8 +144,6 @@ function main() { run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; *) >&2 die "Invalid argument" ;; esac - - check_metrics } main "$@" From 0502354b42bfd3ca22877d93814a596c01024849 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Fri, 30 Jun 2023 20:26:33 +0000 Subject: [PATCH 38/87] checkmetrics: Add checkmetrics json for qemu This PR adds checkmetrics json file for qemu metrics. Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-qemu-kata-metric8.toml | 21 +++++++++++++++++++ tests/metrics/gha-run.sh | 9 ++++---- 2 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml new file mode 100644 index 000000000..c2b3379a4 --- /dev/null +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -0,0 +1,21 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# This file contains baseline expectations +# for checked results by checkmetrics tool. +# +# values set specifically for packet.com c1.small worker. 
+ +[[metric]] +name = "boot-times" +type = "json" +description = "measure container lifecycle timings" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" +checktype = "mean" +midval = 0.61 +minpercent = 15.0 +maxpercent = 15.0 diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 05b426fbc..e30a8a7d0 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -22,7 +22,7 @@ function create_symbolic_links() { local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml" local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml" - if [ ${KATA_HYPERVISOR} != 'qemu' ] && [ ${KATA_HYPERVISOR} != 'clh' ]; then + if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name." fi @@ -89,13 +89,14 @@ function check_containerd_config_for_kata() { } function check_metrics() { + KATA_HYPERVISOR="${1}" # Ensure we have the latest checkemtrics pushd "${checkmetrics_dir}" make sudo make install popd - local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-clh-kata-metric8.toml" + local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml" checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}" cm_result=$? 
if [ "${cm_result}" != 0 ]; then @@ -112,9 +113,7 @@ function run_test_launchtimes() { create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 - if [ "${hypervisor}" = "clh" ]; then - check_metrics - fi + check_metrics "${KATA_HYPERVISOR}" } function run_test_memory_usage() { From 72fd562bd61e80d42650e4603fdaab9a2569b17c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 3 Jul 2023 22:00:55 +0200 Subject: [PATCH 39/87] gha: release: Use a specific release of hub MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ideally we should never ever use hub again, and switch to a supported / release tool instead. However, in order to get v3.1.3 released, let's just stick to the last released version of hub, as trying to get its release is leading to: ``` curl -s "https://api.github.com/repos/github/hub/releases/latest" { "message": "Moved Permanently", "url": "https://api.github.com/repositories/401025/releases/latest", "documentation_url": "https://docs.github.com/v3/#http-redirects" } ``` And that breaks the release process. 
:-/ Fixes: #7223 Signed-off-by: Fabiano Fidêncio --- .github/workflows/release.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c553ca514..a50313fd0 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -72,8 +72,7 @@ jobs: - uses: actions/checkout@v3 - name: install hub run: | - HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//') - wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \ + wget -q -O- https://github.com/mislav/hub/releases/download/v2.14.2/hub-linux-amd64-2.14.2.tgz | \ tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub - name: download-artifacts-amd64 From d8b8f7e94d679bc76e64272ba2a50c6a0454d4c0 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Mon, 3 Jul 2023 16:48:41 +0000 Subject: [PATCH 40/87] metrics: Enable launch tests time metrics This PR enables the launch tests metrics for kata CI. Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-qemu-kata-metric8.toml | 2 +- tests/metrics/gha-run.sh | 23 ++++++++----------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index c2b3379a4..c243ed2c5 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -5,7 +5,7 @@ # This file contains baseline expectations # for checked results by checkmetrics tool. # -# values set specifically for packet.com c1.small worker. +# values set specifically for Equinix m3.small.x86. 
[[metric]] name = "boot-times" diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index e30a8a7d0..a684c4f60 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -8,7 +8,6 @@ set -o errexit set -o nounset set -o pipefail -set -x kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" @@ -49,8 +48,6 @@ EOF } function install_kata() { - # ToDo: remove the exit once the metrics workflow is stable - exit 0 local kata_tarball="kata-static.tar.xz" declare -r katadir="/opt/kata" declare -r destdir="/" @@ -70,6 +67,15 @@ function install_kata() { check_containerd_config_for_kata restart_containerd_service + install_checkmetrics +} + +function install_checkmetrics() { + # Ensure we have the latest checkmetrics + pushd "${checkmetrics_dir}" + make + sudo make install + popd } function check_containerd_config_for_kata() { @@ -90,26 +96,17 @@ function check_containerd_config_for_kata() { function check_metrics() { KATA_HYPERVISOR="${1}" - # Ensure we have the latest checkemtrics - pushd "${checkmetrics_dir}" - make - sudo make install - popd - local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml" checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}" cm_result=$? 
if [ "${cm_result}" != 0 ]; then - info "run-metrics-ci: checkmetrics FAILED (${cm_result})" - exit "${cm_result}" + die "run-metrics-ci: checkmetrics FAILED (${cm_result})" fi } function run_test_launchtimes() { info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor" - # ToDo: remove the exit once the metrics workflow is stable - exit 0 create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 From 40c46c75eda27e0db5fd1802f26e659c1242620b Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 3 Jul 2023 16:43:55 +0200 Subject: [PATCH 41/87] tests/integration: Perform yq install in run_tests() We only need to install in run_tests() so that the yq install is picked up by kubernets/setup.sh as well. We also need to either use (sudo && INSTALL_IN_GOPATH=false) || (INSTALL_IN_GOPATH=true). Signed-off-by: Jeremi Piotrowski --- tests/integration/gha-run.sh | 10 ++++++++-- tests/integration/kubernetes/setup.sh | 1 - 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index b2493e3b9..8fb6c14fa 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -56,10 +56,16 @@ function get_cluster_credentials() { -n "$(_print_cluster_name)" } -function run_tests() { - INSTALL_IN_GOPATH=false bash "${repo_root_dir}/ci/install_yq.sh" +function ensure_yq() { + : "${GOPATH:=${GITHUB_WORKSPACE}}" + export GOPATH + export PATH="${GOPATH}/bin:${PATH}" + INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh" +} +function run_tests() { platform="${1}" + ensure_yq sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS" diff --git a/tests/integration/kubernetes/setup.sh 
b/tests/integration/kubernetes/setup.sh index 639826bc8..6984ad286 100755 --- a/tests/integration/kubernetes/setup.sh +++ b/tests/integration/kubernetes/setup.sh @@ -29,7 +29,6 @@ set_initrd_path() { } main() { - INSTALL_IN_GOPATH=false bash "${repo_root_dir}/ci/install_yq.sh" set_runtime_class set_kernel_path set_initrd_path From d6e96ea06dfa7746fd36cc666f555c71b9204a96 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 4 Jul 2023 11:27:47 +0200 Subject: [PATCH 42/87] tests/integration: Use AzureLinux instead of Mariner as OSSKU value, to get rid of this warning when creating the AKS cluster: WARNING: The osSKU "AzureLinux" should be used going forward instead of "CBLMariner" or "Mariner". The osSKUs "CBLMariner" and "Mariner" will eventually be deprecated. Signed-off-by: Jeremi Piotrowski --- tests/integration/gha-run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 8fb6c14fa..6abdafae2 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -38,7 +38,7 @@ function create_cluster() { -s "Standard_D4s_v5" \ --node-count 1 \ --generate-ssh-keys \ - $([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku mariner --workload-runtime KataMshvVmIsolation") + $([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku AzureLinux --workload-runtime KataMshvVmIsolation") } function install_bats() { From b568c7f7d839509c40b94df38d9abc5ad41fd80d Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 4 Jul 2023 14:28:29 +0200 Subject: [PATCH 43/87] tests/integration: Provide default value for KATA_HOST_OS Non AKS k8s tests (SEV/SNP/TDX) don't currently set KATA_HOST_OS, so provide a default empty value for the variable so that those tests can run. 
Signed-off-by: Jeremi Piotrowski --- tests/integration/gha-run.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 6abdafae2..cdbce7211 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -68,8 +68,10 @@ function run_tests() { ensure_yq sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" - yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS" - yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}" + if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS" + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}" + fi cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image" @@ -131,6 +133,8 @@ function delete_cluster() { } function main() { + export KATA_HOST_OS="${KATA_HOST_OS:-}" + action="${1:-}" case "${action}" in From 476a11194a8227e1a06ecdc805a8fa46601156bc Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 4 Jul 2023 16:11:54 +0000 Subject: [PATCH 44/87] tests: Enable memory usage metrics tests This PR enables the memory usage metrics tests for kata CI. 
Fixes #7229 Signed-off-by: Gabriela Cervantes --- tests/metrics/gha-run.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index a684c4f60..a2c531728 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -116,8 +116,6 @@ function run_test_launchtimes() { function run_test_memory_usage() { info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor" - # ToDo: remove the exit once the metrics workflow is stable - exit 0 create_symbolic_links bash tests/metrics/density/memory_usage.sh 20 5 } From b9d66e0d530d042051f4089d35052d755e460924 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 4 Jul 2023 17:24:55 +0000 Subject: [PATCH 45/87] metrics: Fix double quotes variables in memory usage script This PR usses double quotes in all the variables as well as general fixes to the memory usage script in order to have uniformity. Signed-off-by: Gabriela Cervantes --- tests/metrics/density/memory_usage.sh | 133 +++++++++++++------------- 1 file changed, 68 insertions(+), 65 deletions(-) diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index 455148845..a35d01e09 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -12,6 +12,7 @@ # This test uses smem tool to get the memory used. 
set -e +set -x SCRIPT_PATH=$(dirname "$(readlink -f "$0")") source "${SCRIPT_PATH}/../lib/common.bash" @@ -32,7 +33,7 @@ MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX) PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX) function remove_tmp_file() { - rm -rf $MEM_TMP_FILE $PS_TMP_FILE + rm -rf "${MEM_TMP_FILE $PS_TMP_FILE}" } trap remove_tmp_file EXIT @@ -53,14 +54,14 @@ EOF get_runc_pss_memory(){ ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2" - get_pss_memory "$ctr_runc_shim_path" + get_pss_memory "${ctr_runc_shim_path}" } get_runc_individual_memory() { - runc_process_result=$(cat $MEM_TMP_FILE | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g') + runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g') # Verify runc process result - if [ -z "$runc_process_result" ];then + if [ -z "${runc_process_result}" ];then die "Runc process not found" fi @@ -71,7 +72,7 @@ get_runc_individual_memory() { local json="$(cat << EOF { "runc individual results": [ - $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do printf '%s\n\t\t\t' "${runc_values[i]}" done) ] @@ -90,32 +91,32 @@ get_pss_memory(){ count=0 avg=0 - if [ -z "$ps" ]; then + if [ -z "${ps}" ]; then die "No argument to get_pss_memory()" fi # Save all the processes names # This will be help us to retrieve raw information - echo $ps >> $PS_TMP_FILE + echo "${ps}" >> "${PS_TMP_FILE}" - data=$(sudo "$SMEM_BIN" --no-header -P "^$ps" -c "pss" | sed 's/[[:space:]]//g') + data=$(sudo "${SMEM_BIN}" --no-header -P "^$ps" -c "pss" | sed 's/[[:space:]]//g') # Save all the smem results # This will help us to retrieve raw information - echo $data >> $MEM_TMP_FILE + echo "${data}" >> "${MEM_TMP_FILE}" - for i in $data;do + for i in "${data}"; do if (( i > 0 ));then mem_amount=$(( i + mem_amount )) (( count++ )) fi done - if (( $count > 0 ));then - avg=$(bc -l <<< "scale=2; $mem_amount / $count") + if (( "${count}" > 0 ));then + avg=$(bc -l <<< "scale=2; 
${mem_amount} / ${count}") fi - echo "$avg" + echo "${avg}" } ppid() { @@ -140,26 +141,26 @@ get_pss_memory_virtiofsd() { die "virtiofsd_path not provided" fi - echo "${virtiofsd_path}" >> $PS_TMP_FILE + echo "${virtiofsd_path}" >> "${PS_TMP_FILE}" - virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}') + virtiofsd_pids=$(ps aux | grep virtiofsd | awk '{print $2}') data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss") - for p in ${virtiofsd_pids}; do - parent_pid=$(ppid ${p}) + for p in "${virtiofsd_pids}"; do + parent_pid=$(ppid "${p}") cmd="$(cat /proc/${p}/cmdline | tr -d '\0')" cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')" if [ "${cmd}" != "${cmd_parent}" ]; then pss_parent=$(printf "%s" "${data}" | grep "\s^${p}" | awk '{print $2}') - fork=$(pgrep -P ${p}) + fork=$(pgrep -P "${p}") pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}') pss_process=$((pss_fork + pss_parent)) # Save all the smem results # This will help us to retrieve raw information - echo "${pss_process}" >>$MEM_TMP_FILE + echo "${pss_process}" >>"${MEM_TMP_FILE}" if ((pss_process > 0)); then mem_amount=$((pss_process + mem_amount)) @@ -168,22 +169,22 @@ get_pss_memory_virtiofsd() { fi done - if (( $count > 0 ));then - avg=$(bc -l <<< "scale=2; $mem_amount / $count") + if (( "${count}" > 0 ));then + avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}") fi echo "${avg}" } get_individual_memory(){ # Getting all the individual container information - first_process_name=$(cat $PS_TMP_FILE | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') - first_process_result=$(cat $MEM_TMP_FILE | awk 'NR==1' | sed 's/ /, /g') + first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g') - second_process_name=$(cat $PS_TMP_FILE | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') - 
second_process_result=$(cat $MEM_TMP_FILE | awk 'NR==2' | sed 's/ /, /g') + second_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + second_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==2' | sed 's/ /, /g') - third_process_name=$(cat $PS_TMP_FILE | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') - third_process_result=$(cat $MEM_TMP_FILE | awk 'NR==3' | sed 's/ /, /g') + third_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + third_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==3' | sed 's/ /, /g') read -r -a first_values <<< "${first_process_result}" read -r -a second_values <<< "${second_process_result}" @@ -193,20 +194,20 @@ get_individual_memory(){ local json="$(cat << EOF { - "$first_process_name memory": [ - $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + "${first_process_name memory}": [ + $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do [ -n "${first_values[i]}" ] && printf '%s\n\t\t\t' "${first_values[i]}" done) ], - "$second_process_name memory": [ - $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + "${second_process_name memory}": [ + $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do [ -n "${second_values[i]}" ] && printf '%s\n\t\t\t' "${second_values[i]}" done) ], - "$third_process_name memory": [ - $(for ((i=0;i<${NUM_CONTAINERS[@]};++i)); do + "${third_process_name memory}": [ + $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do [ -n "${third_values[i]}" ] && printf '%s\n\t\t\t' "${third_values[i]}" done) @@ -227,40 +228,40 @@ get_docker_memory_usage(){ containers=() - for ((i=1; i<= NUM_CONTAINERS; i++)); do + for ((i=1; i<"${NUM_CONTAINERS[@]}"; i++)); do containers+=($(random_name)) - ${CTR_EXE} run --runtime "${CTR_RUNTIME}" -d ${IMAGE} ${containers[-1]} ${CMD} + sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" "${CMD}" done - if [ "$AUTO_MODE" == "auto" ]; then + if [ "${AUTO_MODE}" == "auto" 
]; then if (( ksm_on != 1 )); then die "KSM not enabled, cannot use auto mode" fi echo "Entering KSM settle auto detect mode..." - wait_ksm_settle $WAIT_TIME + wait_ksm_settle "${WAIT_TIME}" else # If KSM is enabled, then you normally want to sleep long enough to # let it do its work and for the numbers to 'settle'. - echo "napping $WAIT_TIME s" - sleep "$WAIT_TIME" + echo "napping ${WAIT_TIME} s" + sleep "${WAIT_TIME}" fi metrics_json_start_array # Check the runtime in order in order to determine which process will # be measured about PSS - if [ "$RUNTIME" == "runc" ]; then + if [ "${RUNTIME}" == "runc" ]; then runc_workload_mem="$(get_runc_pss_memory)" - memory_usage="$runc_workload_mem" + memory_usage="${runc_workload_mem}" local json="$(cat << EOF { "average": { - "Result": $memory_usage, + "Result": ${memory_usage}, "Units" : "KB" }, "runc": { - "Result": $runc_workload_mem, + "Result": ${runc_workload_mem}, "Units" : "KB" } } @@ -276,39 +277,39 @@ EOF # Now if you do not have enough rights # the smem failure to read the stats will also be trapped. 
- hypervisor_mem="$(get_pss_memory "$HYPERVISOR_PATH")" - if [ "$hypervisor_mem" == "0" ]; then - die "Failed to find PSS for $HYPERVISOR_PATH" + hypervisor_mem="$(get_pss_memory "${HYPERVISOR_PATH}")" + if [ "${hypervisor_mem}" == "0" ]; then + die "Failed to find PSS for ${HYPERVISOR_PATH}" fi - virtiofsd_mem="$(get_pss_memory_virtiofsd "$VIRTIOFSD_PATH")" - if [ "$virtiofsd_mem" == "0" ]; then - echo >&2 "WARNING: Failed to find PSS for $VIRTIOFSD_PATH" + virtiofsd_mem="$(get_pss_memory_virtiofsd "${VIRTIOFSD_PATH}")" + if [ "${virtiofsd_mem}" == "0" ]; then + echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}" fi - shim_mem="$(get_pss_memory "$SHIM_PATH")" - if [ "$shim_mem" == "0" ]; then - die "Failed to find PSS for $SHIM_PATH" + shim_mem="$(get_pss_memory "${SHIM_PATH}")" + if [ "${shim_mem}" == "0" ]; then + die "Failed to find PSS for ${SHIM_PATH}" fi - mem_usage="$(bc -l <<< "scale=2; $hypervisor_mem +$virtiofsd_mem + $shim_mem")" - memory_usage="$mem_usage" + mem_usage="$(bc -l <<< "scale=2; ${hypervisor_mem} +${virtiofsd_mem} + ${shim_mem}")" + memory_usage="${mem_usage}" local json="$(cat << EOF { "average": { - "Result": $mem_usage, + "Result": ${mem_usage}, "Units" : "KB" }, "qemus": { - "Result": $hypervisor_mem, + "Result": ${hypervisor_mem}, "Units" : "KB" }, "virtiofsds": { - "Result": $virtiofsd_mem, + "Result": ${virtiofsd_mem}, "Units" : "KB" }, "shims": { - "Result": $shim_mem, + "Result": ${shim_mem}, "Units" : "KB" } } @@ -327,12 +328,12 @@ save_config(){ local json="$(cat << EOF { - "containers": $NUM_CONTAINERS, - "ksm": $ksm_on, - "auto": "$AUTO_MODE", - "waittime": $WAIT_TIME, - "image": "$IMAGE", - "command": "$CMD" + "containers": "${NUM_CONTAINERS}", + "ksm": "${ksm_on}", + "auto": "${AUTO_MODE}", + "waittime": "${WAIT_TIME}", + "image": "${IMAGE}", + "command": "${CMD}" } EOF @@ -352,10 +353,12 @@ main(){ #Check for KSM before reporting test name, as it can modify it check_for_ksm + sudo systemctl restart containerd + 
init_env check_cmds "${SMEM_BIN}" bc - check_images "$IMAGE" + check_images "${IMAGE}" if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then export RUNTIME="kata-runtime" From d8f90e89d5e0075b11c1a7c8f8cef1b231e5e06c Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 4 Jul 2023 19:53:17 +0000 Subject: [PATCH 46/87] metrics: Rename function at memory usage script This PR renames the function name for the memory usage script. Signed-off-by: Gabriela Cervantes --- tests/metrics/density/memory_usage.sh | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index a35d01e09..3a7f89d5f 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -33,7 +33,7 @@ MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX) PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX) function remove_tmp_file() { - rm -rf "${MEM_TMP_FILE $PS_TMP_FILE}" + rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}" } trap remove_tmp_file EXIT @@ -99,7 +99,7 @@ get_pss_memory(){ # This will be help us to retrieve raw information echo "${ps}" >> "${PS_TMP_FILE}" - data=$(sudo "${SMEM_BIN}" --no-header -P "^$ps" -c "pss" | sed 's/[[:space:]]//g') + data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g') # Save all the smem results # This will help us to retrieve raw information @@ -220,7 +220,7 @@ EOF } # Try to work out the 'average memory footprint' of a container. -get_docker_memory_usage(){ +get_memory_usage(){ hypervisor_mem=0 virtiofsd_mem=0 shim_mem=0 @@ -277,16 +277,16 @@ EOF # Now if you do not have enough rights # the smem failure to read the stats will also be trapped. 
- hypervisor_mem="$(get_pss_memory "${HYPERVISOR_PATH}")" + hypervisor_mem="$(get_pss_memory ${HYPERVISOR_PATH})" if [ "${hypervisor_mem}" == "0" ]; then die "Failed to find PSS for ${HYPERVISOR_PATH}" fi - virtiofsd_mem="$(get_pss_memory_virtiofsd "${VIRTIOFSD_PATH}")" + virtiofsd_mem="$(get_pss_memory_virtiofsd ${VIRTIOFSD_PATH})" if [ "${virtiofsd_mem}" == "0" ]; then echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}" fi - shim_mem="$(get_pss_memory "${SHIM_PATH}")" + shim_mem="$(get_pss_memory ${SHIM_PATH})" if [ "${shim_mem}" == "0" ]; then die "Failed to find PSS for ${SHIM_PATH}" fi @@ -353,8 +353,6 @@ main(){ #Check for KSM before reporting test name, as it can modify it check_for_ksm - sudo systemctl restart containerd - init_env check_cmds "${SMEM_BIN}" bc @@ -370,7 +368,7 @@ main(){ metrics_json_init save_config - get_docker_memory_usage + get_memory_usage if [ "$RUNTIME" == "runc" ]; then get_runc_individual_memory From 35d096b607573cc1854b6ba083a551d91ae03799 Mon Sep 17 00:00:00 2001 From: David Esparza Date: Mon, 26 Jun 2023 16:53:23 -0600 Subject: [PATCH 47/87] metrics: Adds blogbench and webtool metrics tests This PR adds blogbench and webtooling metrics checks to this repo. The function running the test intentionally returns zero, so the test will be enabled in another PR once the workflow is green. 
Fixes: #7069 Signed-off-by: David Esparza --- .github/workflows/run-metrics.yaml | 2 + tests/metrics/gha-run.sh | 10 + tests/metrics/storage/blogbench.sh | 124 ++++++++ .../storage/blogbench_dockerfile/Dockerfile | 32 ++ .../web-tooling-dockerfile/Dockerfile.in | 28 ++ tests/metrics/storage/webtooling.sh | 283 ++++++++++++++++++ 6 files changed, 479 insertions(+) create mode 100755 tests/metrics/storage/blogbench.sh create mode 100644 tests/metrics/storage/blogbench_dockerfile/Dockerfile create mode 100755 tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in create mode 100755 tests/metrics/storage/webtooling.sh diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml index 4ef44cb4d..d8b8c3976 100644 --- a/.github/workflows/run-metrics.yaml +++ b/.github/workflows/run-metrics.yaml @@ -40,3 +40,5 @@ jobs: - name: run memory usage inside container test run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container + - name: run blogbench test + run: bash tests/metrics/gha-run.sh run-test-blogbench diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index a684c4f60..1671c5624 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -131,6 +131,15 @@ function run_test_memory_usage_inside_container() { bash tests/metrics/density/memory_usage_inside_container.sh 5 } +function run_test_blogbench() { + info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor" + + # ToDo: remove the exit once the metrics workflow is stable + exit 0 + create_symbolic_links + bash tests/metrics/storage/blogbench.sh +} + function main() { action="${1:-}" case "${action}" in @@ -138,6 +147,7 @@ function main() { run-test-launchtimes) run_test_launchtimes ;; run-test-memory-usage) run_test_memory_usage ;; run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; + run-test-blogbench) run_test_blogbench ;; *) >&2 die "Invalid argument" ;; esac } diff --git a/tests/metrics/storage/blogbench.sh 
b/tests/metrics/storage/blogbench.sh new file mode 100755 index 000000000..19a960103 --- /dev/null +++ b/tests/metrics/storage/blogbench.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# +# Copyright (c) 2018-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Description of the test: +# This test runs the 'blogbench', and extracts the 'scores' for reads +# and writes +# Note - the scores are *not* normalised for the number of iterations run, +# they are total scores for all iterations (this is the blogbench default output) + +set -e + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +TEST_NAME="blogbench" +IMAGE="docker.io/library/local-blogbench:latest" +DOCKERFILE="${SCRIPT_PATH}/blogbench_dockerfile/Dockerfile" + +# Number of iterations for blogbench to run - note, results are not +# scaled to iterations - more iterations results in bigger results +ITERATIONS="${ITERATIONS:-30}" + +# Directory to run the test on +# This is run inside of the container +TESTDIR="${TESTDIR:-/tmp}" +CMD="blogbench -i ${ITERATIONS} -d ${TESTDIR}" + +function main() { + # Check tools/commands dependencies + cmds=("awk" "docker") + + init_env + check_cmds "${cmds[@]}" + check_ctr_images "${IMAGE}" "${DOCKERFILE}" + metrics_json_init + + local output=$(sudo -E ${CTR_EXE} run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test ${CMD}) + + # Save configuration + metrics_json_start_array + + local frequency=$(echo "${output}" | grep "Frequency" | cut -d "=" -f2 | cut -d ' ' -f2) + local iterations=$(echo "${output}" | grep -w "iterations" | cut -d ' ' -f3) + local spawing_writers=$(echo "${output}" | grep -w "writers" | cut -d ' ' -f2) + local spawing_rewriters=$(echo "${output}" | grep -w "rewriters" | cut -d ' ' -f2) + local spawing_commenters=$(echo "${output}" | grep -w "commenters" | cut -d ' ' -f2) + local spawing_readers=$(echo "${output}" | grep -w "readers" | cut -d ' ' -f2) + + local json="$(cat << EOF + { + "Frequency" : 
${frequency}, + "Iterations" : ${iterations}, + "Number of spawing writers" : ${spawing_writers}, + "Number of spawing rewriters" : ${spawing_rewriters}, + "Number of spawing commenters" : ${spawing_commenters}, + "Number of spawing readers" : ${spawing_readers} + } +EOF +)" + metrics_json_add_array_element "${json}" + metrics_json_end_array "Config" + + # Save results + metrics_json_start_array + + local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}') + local reads=$(tail -1 <<< "${output}" | awk '{print $6}') + + # Obtaining other Blogbench results + local -r data=$(echo "${output}" | tail -n +12 | head -n -3) + local nb_blogs=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $1} ' | tr '\t' ',' | sed '$ s/.$//') + local r_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $2} ' | tr '\t' ',' | sed '$ s/.$//') + local w_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $3} ' | tr '\t' ',' | sed '$ s/.$//') + local r_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $4} ' | tr '\t' ',' | sed '$ s/.$//') + local w_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $5} ' | tr '\t' ',' | sed '$ s/.$//') + local r_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $6} ' | tr '\t' ',' | sed '$ s/.$//') + local w_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $7} ' | tr '\t' ',' | sed '$ s/.$//') + + local json="$(cat << EOF + { + "write": { + "Result" : "${writes}", + "Units" : "items" + }, + "read": { + "Result" : "${reads}", + "Units" : "items" + }, + "Nb blogs": { + "Result" : "${nb_blogs}" + }, + "R articles": { + "Result" : "${r_articles}" + }, + "W articles": { + "Result" : "${w_articles}" + }, + "R pictures": { + "Result" : "${r_pictures}" + }, + "W pictures": { + "Result" : "${w_pictures}" + }, + "R comments": { + "Result" : "${r_comments}" + }, + "W comments": { + "Result" : "${w_comments}" + } + } +EOF +)" + + metrics_json_add_array_element "${json}" + metrics_json_end_array "Results" + 
metrics_json_save + clean_env_ctr +} + +main "$@" diff --git a/tests/metrics/storage/blogbench_dockerfile/Dockerfile b/tests/metrics/storage/blogbench_dockerfile/Dockerfile new file mode 100644 index 000000000..593063798 --- /dev/null +++ b/tests/metrics/storage/blogbench_dockerfile/Dockerfile @@ -0,0 +1,32 @@ +# Copyright (c) 2018-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Set up an Ubuntu image with 'blogbench' installed + +# Usage: FROM [image name] +# hadolint ignore=DL3007 +FROM docker.io/library/ubuntu:latest + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +# URL for blogbench test and blogbench version +ENV BLOGBENCH_URL "https://download.pureftpd.org/pub/blogbench" +ENV BLOGBENCH_VERSION 1.1 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential curl && \ + apt-get remove -y unattended-upgrades && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/ && \ + curl -OkL "${BLOGBENCH_URL}/blogbench-${BLOGBENCH_VERSION}.tar.gz" && \ + tar xzf "blogbench-${BLOGBENCH_VERSION}.tar.gz" -C / +WORKDIR "/blogbench-${BLOGBENCH_VERSION}" +RUN arch="$(uname -m)" && \ + export arch && \ + ./configure --build="${arch}" && \ + make && \ + make install-strip + +CMD ["/bin/bash"] diff --git a/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in b/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in new file mode 100755 index 000000000..8190d8560 --- /dev/null +++ b/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in @@ -0,0 +1,28 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Set up an Ubuntu image with 'web tooling' installed + +# Usage: FROM [image name] +# hadolint ignore=DL3007 +FROM @UBUNTU_REGISTRY@/ubuntu:latest + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +# URL for web tooling test +ENV WEB_TOOLING_URL "https://github.com/v8/web-tooling-benchmark" +ENV NODEJS_VERSION "setup_14.x" + +RUN apt-get update && \ + 
apt-get install -y --no-install-recommends build-essential git curl sudo && \ + apt-get remove -y unattended-upgrades && \ + curl -OkL https://deb.nodesource.com/${NODEJS_VERSION} && chmod +x ${NODEJS_VERSION} && ./${NODEJS_VERSION} && \ + apt-get install -y --no-install-recommends nodejs && \ + apt-get clean && rm -rf /var/lib/apt/lists && \ + git clone ${WEB_TOOLING_URL} /web-tooling-benchmark +WORKDIR /web-tooling-benchmark/ +RUN npm install --unsafe-perm + +CMD ["/bin/bash"] diff --git a/tests/metrics/storage/webtooling.sh b/tests/metrics/storage/webtooling.sh new file mode 100755 index 000000000..f82849618 --- /dev/null +++ b/tests/metrics/storage/webtooling.sh @@ -0,0 +1,283 @@ +#!/bin/bash +# +# Copyright (c) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Description of the test: +# This test runs the 'web tooling benchmark' +# https://github.com/v8/web-tooling-benchmark + +set -o pipefail + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" +NUM_CONTAINERS="${1:-}" + +IMAGE="docker.io/library/local-web-tooling:latest" +DOCKERFILE="${SCRIPT_PATH}/web-tooling-dockerfile/Dockerfile" + +# Directory to run the test inside of the container +TESTDIR="${TESTDIR:-/testdir}" +file_path="/web-tooling-benchmark" +file_name="output" + +# Directory where the webtooling results are stored +TMP_DIR=$(mktemp --tmpdir -d webtool.XXXXXXXXXX) + +# Options to control the start of the workload using a trigger-file +dst_dir="/host" +src_dir=$(mktemp --tmpdir -d webtool.XXXXXXXXXX) +trigger_file="$RANDOM.txt" +guest_trigger_file="$dst_dir/$trigger_file" +host_trigger_file="$src_dir/$trigger_file" +start_script="webtooling_start.sh" + +# CMD points to the script that starts the workload +CMD="$dst_dir/$start_script" +MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro" +PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}" + +# This timeout is related with the amount of time that +# 
webtool benchmark needs to run inside the container +timeout=600 +INITIAL_NUM_PIDS=1 + +cpu_period="100000" +cpu_quota="200000" + +function remove_tmp_dir() { + rm -rf "$TMP_DIR" + rm -rf "$src_dir" +} + +trap remove_tmp_dir EXIT + +# Show help about this script +function help(){ +cat << EOF +Usage: $0 + Description: + : Number of containers to run. +EOF +} + +# script used to launch the workload +function create_start_script() { + local script="${src_dir/$start_script}" + rm -rf "${script}" + +cat <>"${script}" +#!/bin/bash +mkdir -p "${TESTDIR}" + +until [ -f ${guest_trigger_file} ]; do + sleep 1 +done +pushd "${file_path}" +node dist/cli.js > "${file_name}" +EOF + chmod +x "${script}" +} + +function verify_task_is_completed_on_all_containers() { + local containers=( $(sudo -E "${CTR_EXE}" c list -q) ) + local sleep_secs=10 + local max=$(bc <<<"${timeout} / ${sleep_secs}") + local wip_list=() + local count=1 + local sum=0 + local i="" + + while (( ${sum} < ${NUM_CONTAINERS} )); do + + for i in "${containers[@]}"; do + # Only check containers that have not completed the workload at this step + num_pids=$(sudo -E "${CTR_EXE}" t metrics "${i}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) + + if [ "${num_pids}" -lt "${INITIAL_NUM_PIDS}" ]; then + ((sum++)) + else + wip_list+=("${i}") + fi + done + + # hold the list of containers that are still running the workload + containers=(${wip_list[*]}) + wip_list=() + + info "loop ${count} of ${max}: sleeping for ${sleep_secs} seconds" + sleep "${sleep_secs}" + ((count++)) + done +} + +function check_containers_are_up() { + info "Verify that the containers are running" + local containers_launched=0 + while (( $containers_launched < ${NUM_CONTAINERS} )); do + containers_launched="$(sudo -E ${CTR_EXE} t list | grep -c "RUNNING")" + sleep 1 + done +} + +function save_config() { + metrics_json_start_array + + local json="$(cat << EOF + { + "containers": "${NUM_CONTAINERS}", + "image": "${IMAGE}", + "units": 
"runs/s" + } +EOF +)" + metrics_json_add_array_element "${json}" + metrics_json_end_array "Config" +} + +function main() { + # Verify enough arguments + if [ $# != 1 ]; then + echo >&2 "error: Not enough arguments [$@]" + help + exit 1 + fi + + local i=0 + local containers=() + local cmds=("docker") + local not_started_count=$NUM_CONTAINERS + + restart_containerd_service + # Check tools/commands dependencies + init_env + check_cmds "${cmds[@]}" + check_ctr_images "$IMAGE" "$DOCKERFILE" + metrics_json_init + save_config + create_start_script + rm -rf "${host_trigger_file}" + + info "Creating ${NUM_CONTAINERS} containers" + + for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do + containers+=($(random_name)) + # Web tool benchmark needs 2 cpus to run completely in its cpu utilization + sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --cpu-quota "${cpu_quota}" --cpu-period "${cpu_period}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" + ((not_started_count--)) + info "${not_started_count} remaining containers" + done + + # Check that the requested number of containers are running + local timeout_launch="10" + check_containers_are_up & pid=$! + (sleep "${timeout_launch}" && kill -HUP ${pid}) 2>/dev/null & pid_tout=$! + + if wait $pid 2>/dev/null; then + pkill -HUP -P "${pid_tout}" + wait "${pid_tout}" + else + warn "Time out exceeded" + return 1 + fi + + # Get the initial number of pids in a single container before the workload starts + INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) + ((INITIAL_NUM_PIDS++)) + + # Launch webtooling benchmark + local pids=() + local j=0 + for i in "${containers[@]}"; do + $(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD}") & + pids[${j}]=$! 
+ ((j++)) + done + + # wait for all pids + for pid in ${pids[*]}; do + wait "${pid}" + done + + touch "${host_trigger_file}" + info "All containers are running the workload..." + + # Verify that all containers have completed the assigned task + verify_task_is_completed_on_all_containers & pid=$! + (sleep "$timeout" && kill -HUP $pid) 2>/dev/null & pid_tout=$! + if wait ${pid} 2>/dev/null; then + pkill -HUP -P "${pid_tout}" + wait "${pid_tout}" + else + warn "Time out exceeded" + return 1 + fi + + RESULTS_CMD="cat ${file_path}/${file_name}" + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec --exec-id "${RANDOM}" "${i}" sh -c "${RESULTS_CMD}" >> "${TMP_DIR}/results" + done + + # Save configuration + metrics_json_start_array + + local output=$(cat "${TMP_DIR}/results") + local cut_results="cut -d':' -f2 | sed -e 's/^[ \t]*//'| cut -d ' ' -f1 | tr '\n' ',' | sed 's/.$//'" + + local acorn=$(echo "${output}" | grep -w "acorn" | eval "${cut_results}") + local babel=$(echo "${output}" | grep -w "babel" | sed '/babel-minify/d' | eval "${cut_results}") + local babel_minify=$(echo "${output}" | grep -w "babel-minify" | eval "${cut_results}") + local babylon=$(echo "${output}" | grep -w "babylon" | eval "${cut_results}") + local buble=$(echo "${output}" | grep -w "buble" | eval "${cut_results}") + local chai=$(echo "${output}" | grep -w "chai" | eval "${cut_results}") + local coffeescript=$(echo "${output}" | grep -w "coffeescript" | eval "${cut_results}") + local espree=$(echo "${output}" | grep -w "espree" | eval "${cut_results}") + local esprima=$(echo "${output}" | grep -w "esprima" | eval "${cut_results}") + local jshint=$(echo "${output}" | grep -w "jshint" | eval "${cut_results}") + local lebab=$(echo "${output}" | grep -w "lebab" | eval "${cut_results}") + local postcss=$(echo "${output}" | grep -w "postcss" | eval "${cut_results}") + local prepack=$(echo "${output}" | grep -w "prepack" | eval "${cut_results}") + local prettier=$(echo "${output}" | grep -w 
"prettier" | eval "${cut_results}") + local source_map=$(echo "${output}" | grep -w "source-map" | eval "${cut_results}") + local terser=$(echo "${output}" | grep -w "terser" | eval "${cut_results}") + local typescript=$(echo "${output}" | grep -w "typescript" | eval "${cut_results}") + local uglify_js=$(echo "${output}" | grep -w "uglify-js" | eval "${cut_results}") + local geometric_mean=$(echo "${output}" | grep -w "Geometric" | eval "${cut_results}") + local average_tps=$(echo "${geometric_mean}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + local tps=$(echo "${average_tps}*${NUM_CONTAINERS}" | bc -l) + + local json="$(cat << EOF + { + "Acorn" : "${acorn}", + "Babel" : "${babel}", + "Babel minify" : "${babel_minify}", + "Babylon" : "${babylon}", + "Buble" : "${buble}", + "Chai" : "${chai}", + "Coffeescript" : "${coffeescript}", + "Espree" : "${espree}", + "Esprima" : "${esprima}", + "Jshint" : "${jshint}", + "Lebab" : "${lebab}", + "Postcss" : "${postcss}", + "Prepack" : "${prepack}", + "Prettier" : "${prettier}", + "Source map" : "${source_map}", + "Terser" : "${terser}", + "Typescript" : "${typescript}", + "Uglify js" : "${uglify_js}", + "Geometric mean" : "${geometric_mean}", + "Average TPS" : "${average_tps}", + "TPS" : "${tps}" + } +EOF +)" + metrics_json_add_array_element "${json}" + metrics_json_end_array "Results" + metrics_json_save + clean_env_ctr +} + +main "$@" From 18968f428fdf2d9f919c0402eff402d1484a0804 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Tue, 4 Jul 2023 20:06:25 +0000 Subject: [PATCH 48/87] metrics: Add function to have uniformity This PR adds the function name before the function to have uniformity across all the test. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/density/memory_usage.sh | 33 ++++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index 3a7f89d5f..aadf5a7c2 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -39,7 +39,7 @@ function remove_tmp_file() { trap remove_tmp_file EXIT # Show help about this script -help(){ +function help(){ cat << EOF Usage: $0 [auto] Description: @@ -52,12 +52,12 @@ EOF } -get_runc_pss_memory(){ +function get_runc_pss_memory(){ ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2" get_pss_memory "${ctr_runc_shim_path}" } -get_runc_individual_memory() { +function get_runc_individual_memory() { runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g') # Verify runc process result @@ -85,7 +85,7 @@ EOF # This function measures the PSS average # memory of a process. 
-get_pss_memory(){ +function get_pss_memory(){ ps="$1" mem_amount=0 count=0 @@ -106,7 +106,7 @@ get_pss_memory(){ echo "${data}" >> "${MEM_TMP_FILE}" for i in "${data}"; do - if (( i > 0 ));then + if (( $i > 0 ));then mem_amount=$(( i + mem_amount )) (( count++ )) fi @@ -119,7 +119,7 @@ get_pss_memory(){ echo "${avg}" } -ppid() { +function ppid() { local pid pid=$(ps -p "${1:-nopid}" -o ppid=) echo "${pid//[[:blank:]]/}" @@ -131,7 +131,7 @@ ppid() { # virtiofsd forks itself so, smem sees the process # two times, this function sum both pss values: # pss_virtiofsd=pss_fork + pss_parent -get_pss_memory_virtiofsd() { +function get_pss_memory_virtiofsd() { mem_amount=0 count=0 avg=0 @@ -175,7 +175,7 @@ get_pss_memory_virtiofsd() { echo "${avg}" } -get_individual_memory(){ +function get_individual_memory(){ # Getting all the individual container information first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g') @@ -194,19 +194,19 @@ get_individual_memory(){ local json="$(cat << EOF { - "${first_process_name memory}": [ + "${first_process_name} memory": [ $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do [ -n "${first_values[i]}" ] && printf '%s\n\t\t\t' "${first_values[i]}" done) ], - "${second_process_name memory}": [ + "${second_process_name} memory": [ $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do [ -n "${second_values[i]}" ] && printf '%s\n\t\t\t' "${second_values[i]}" done) ], - "${third_process_name memory}": [ + "${third_process_name} memory": [ $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do [ -n "${third_values[i]}" ] && printf '%s\n\t\t\t' "${third_values[i]}" @@ -220,7 +220,7 @@ EOF } # Try to work out the 'average memory footprint' of a container. 
-get_memory_usage(){ +function get_memory_usage(){ hypervisor_mem=0 virtiofsd_mem=0 shim_mem=0 @@ -228,9 +228,10 @@ get_memory_usage(){ containers=() - for ((i=1; i<"${NUM_CONTAINERS[@]}"; i++)); do + info "Creating ${NUM_CONTAINERS} containers" + for ((i=1; i<="${NUM_CONTAINERS}"; i++)); do containers+=($(random_name)) - sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" "${CMD}" + sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" sh -c "${CMD}" done if [ "${AUTO_MODE}" == "auto" ]; then @@ -323,7 +324,7 @@ EOF clean_env_ctr } -save_config(){ +function save_config(){ metrics_json_start_array local json="$(cat << EOF @@ -342,7 +343,7 @@ EOF metrics_json_end_array "Config" } -main(){ +function main(){ # Verify enough arguments if [ $# != 2 ] && [ $# != 3 ];then echo >&2 "error: Not enough arguments [$@]" From 1c3dbafbf0714acd7ea67c3aad7ac705d2376b76 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 5 Jul 2023 18:17:57 +0000 Subject: [PATCH 49/87] metrics: Fix function of how to retrieve multiple values This PR fixes the function of how to add multiple values of pss memory. 
Signed-off-by: Gabriela Cervantes --- tests/metrics/density/memory_usage.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index aadf5a7c2..b6cc827e3 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -99,13 +99,14 @@ function get_pss_memory(){ # This will be help us to retrieve raw information echo "${ps}" >> "${PS_TMP_FILE}" - data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g') + data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//') # Save all the smem results # This will help us to retrieve raw information echo "${data}" >> "${MEM_TMP_FILE}" - for i in "${data}"; do + gral_data=$(echo "${data// /+}" | bc) + for i in "${gral_data}"; do if (( $i > 0 ));then mem_amount=$(( i + mem_amount )) (( count++ )) @@ -143,7 +144,7 @@ function get_pss_memory_virtiofsd() { echo "${virtiofsd_path}" >> "${PS_TMP_FILE}" - virtiofsd_pids=$(ps aux | grep virtiofsd | awk '{print $2}') + virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}' | head -1) data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss") for p in "${virtiofsd_pids}"; do From 78086ed1feff71be7c13f4f6fbeaa0df198f5bc0 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 5 Jul 2023 19:19:04 +0000 Subject: [PATCH 50/87] checkmetrics: Add memory usage value for clh This PR adds the memory usage value for clh. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-clh-kata-metric8.toml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 7966b69eb..9c593320d 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -19,3 +19,17 @@ checktype = "mean" midval = 0.42 minpercent = 15.0 maxpercent = 15.0 + +[[metric]] +name = "memory-footprint" +type = "json" +description = "measure memory usage" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"memory-footprint\".Results | .[] | .average.Result" +checktype = "mean" +midval = 2518364.00 +minpercent = 15.0 +maxpercent = 15.0 + From 5a61065ab73f84c4dc08006613301af41d4f7f52 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 5 Jul 2023 19:21:03 +0000 Subject: [PATCH 51/87] checkmetrics: Add checkmetrics value for memory usage in qemu This PR adds the checkmetrics value for memory usage in qemu. 
Signed-off-by: Gabriela Cervantes --- .../checkmetrics-json-clh-kata-metric8.toml | 1 - .../checkmetrics-json-qemu-kata-metric8.toml | 13 +++++++++++++ tests/metrics/density/memory_usage.sh | 1 - 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 9c593320d..4ac4f5cf7 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -32,4 +32,3 @@ checktype = "mean" midval = 2518364.00 minpercent = 15.0 maxpercent = 15.0 - diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index c243ed2c5..102f7d34d 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -19,3 +19,16 @@ checktype = "mean" midval = 0.61 minpercent = 15.0 maxpercent = 15.0 + +[[metric]] +name = "memory-footprint" +type = "json" +description = "measure memory usage" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"memory-footprint\".Results | .[] | .average.Result" +checktype = "mean" +midval = 2435844.00 +minpercent = 15.0 +maxpercent = 15.0 diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index b6cc827e3..57d2ce3cd 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -12,7 +12,6 @@ # This test uses smem tool to get the memory used. 
set -e -set -x SCRIPT_PATH=$(dirname "$(readlink -f "$0")") source "${SCRIPT_PATH}/../lib/common.bash" From 1ca17c2f70b9b54bc04acdfd1080b3caf5cac0e3 Mon Sep 17 00:00:00 2001 From: David Esparza Date: Wed, 5 Jul 2023 11:48:44 -0600 Subject: [PATCH 52/87] metrics: storing metrics workflow artifacts This PR enables storing metrics workflow artifacts in two separated flavours: clh and qemu. Fixes: #7239 Signed-off-by: David Esparza --- .github/workflows/run-metrics.yaml | 11 +++++++++++ tests/common.bash | 14 ++++++++++++++ tests/metrics/gha-run.sh | 6 ++++++ 3 files changed, 31 insertions(+) diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml index d8b8c3976..4be83d83f 100644 --- a/.github/workflows/run-metrics.yaml +++ b/.github/workflows/run-metrics.yaml @@ -42,3 +42,14 @@ jobs: - name: run blogbench test run: bash tests/metrics/gha-run.sh run-test-blogbench + + - name: make metrics tarball ${{ matrix.vmm }} + run: bash tests/metrics/gha-run.sh make-tarball-results + + - name: archive metrics results ${{ matrix.vmm }} + uses: actions/upload-artifact@v3 + with: + name: metrics-artifacts-${{ matrix.vmm }} + path: results-${{ matrix.vmm }}.tar.gz + retention-days: 1 + if-no-files-found: error diff --git a/tests/common.bash b/tests/common.bash index d4aa44684..090bd7fd9 100644 --- a/tests/common.bash +++ b/tests/common.bash @@ -240,3 +240,17 @@ restart_containerd_service() { clean_env_ctr return 0 } + +# @path_results: path to the input metric-results folder +# @tarball_fname: path and filename to the output tarball +function compress_metrics_results_dir() +{ + local path_results="${1:-results}" + local tarball_fname="${2:-}" + + [ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect." + [ ! -d "${path_results}" ] && die "Missing path to the results folder." 
+ + cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd - + info "tarball generated: ${tarball_fname}" +} diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 1671c5624..8a05826b4 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -12,6 +12,7 @@ set -o pipefail kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" source "${metrics_dir}/../common.bash" +source "${metrics_dir}/lib/common.bash" declare -r results_dir="${metrics_dir}/results" declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics" @@ -104,6 +105,10 @@ function check_metrics() { fi } +function make_tarball_results() { + compress_metrics_results_dir "${metrics_dir}/results" "${GITHUB_WORKSPACE}/results-${KATA_HYPERVISOR}.tar.gz" +} + function run_test_launchtimes() { info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor" @@ -144,6 +149,7 @@ function main() { action="${1:-}" case "${action}" in install-kata) install_kata ;; + make-tarball-results) make_tarball_results ;; run-test-launchtimes) run_test_launchtimes ;; run-test-memory-usage) run_test_memory_usage ;; run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; From 4e396e7285dbcccd0a168ebd0124fad35e9ab958 Mon Sep 17 00:00:00 2001 From: David Esparza Date: Wed, 5 Jul 2023 20:59:21 -0600 Subject: [PATCH 53/87] metrics: Add function keyword to to helper metrics functions Use the 'function' keyword to prevent bash aliases from colliding with other function's name. 
Signed-off-by: David Esparza --- tests/metrics/lib/common.bash | 92 +++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index ad7b2a5c4..0bb31030d 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -47,14 +47,14 @@ quay.io/libpod" # # cmds=(“cmd1” “cmd2”) # check_cmds "${cmds[@]}" -check_cmds() +function check_cmds() { local cmd req_cmds=( "$@" ) for cmd in "${req_cmds[@]}"; do if ! command -v "$cmd" > /dev/null 2>&1; then die "command $cmd not available" fi - echo "command: $cmd: yes" + info "command: $cmd: yes" done } @@ -68,19 +68,20 @@ check_cmds() # # images=(“img1” “img2”) # check_imgs "${images[@]}" -check_images() +function check_images() { local img req_images=( "$@" ) for img in "${req_images[@]}"; do - echo "ctr pull'ing: $img" + info "ctr pull'ing: $img" if ! sudo "${CTR_EXE}" image pull "$img"; then die "Failed to pull image $img" fi - echo "ctr pull'd: $img" + info "ctr pull'd: $img" done } -generate_build_dockerfile() { +function generate_build_dockerfile() +{ local dockerfile="$1" local image="$2" local map_key="$3" @@ -99,14 +100,14 @@ generate_build_dockerfile() { # This function performs a build on the image names # passed in, to ensure that we have the latest changes from # the dockerfiles -build_dockerfile_image() +function build_dockerfile_image() { local image="$1" local dockerfile_path="$2" local dockerfile_dir=${2%/*} if [ -f "$dockerfile_path" ]; then - echo "docker building $image" + info "docker building $image" if ! 
sudo "${DOCKER_EXE}" build --build-arg http_proxy="${http_proxy}" --build-arg https_proxy="${https_proxy}" --label "$image" --tag "${image}" -f "$dockerfile_path" "$dockerfile_dir"; then die "Failed to docker build image $image" fi @@ -119,7 +120,7 @@ build_dockerfile_image() # This function removes the ctr image, builds a new one using a dockerfile # and imports the image from docker to ctr -check_ctr_images() +function check_ctr_images() { local ctr_image="$1" local dockerfile_path="$2" @@ -138,7 +139,7 @@ check_ctr_images() # A one time (per uber test cycle) init that tries to get the # system to a 'known state' as much as possible -metrics_onetime_init() +function metrics_onetime_init() { # The onetime init must be called once, and only once if [ ! -z "$onetime_init_done" ]; then @@ -155,14 +156,14 @@ metrics_onetime_init() # Print a banner to the logs noting clearly which test # we are about to run -test_banner() +function test_banner() { - echo -e "\n===== starting test [$1] =====" + info -e "\n===== starting test [$1] =====" } # Initialization/verification environment. This function makes # minimal steps for metrics/tests execution. -init_env() +function init_env() { test_banner "${TEST_NAME}" @@ -183,7 +184,8 @@ init_env() # This function checks if there are containers or # shim/proxy/hypervisor processes up, if found, they are # killed to start test with clean environment. 
-kill_processes_before_start() { +function kill_processes_before_start() +{ DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q) [[ -n "${DOCKER_PROCS}" ]] && clean_env @@ -195,26 +197,29 @@ kill_processes_before_start() { # Generate a random name - generally used when creating containers, but can # be used for any other appropriate purpose -random_name() { +function random_name() +{ mktemp -u kata-XXXXXX } -show_system_ctr_state() { - echo "Showing system state:" - echo " --Check containers--" +function show_system_ctr_state() +{ + info "Showing system state:" + info " --Check containers--" sudo "${CTR_EXE}" c list - echo " --Check tasks--" + info " --Check tasks--" sudo "${CTR_EXE}" task list local processes="containerd-shim-kata-v2" for p in ${processes}; do - echo " --pgrep ${p}--" + info " --pgrep ${p}--" pgrep -a ${p} done } -common_init(){ +function common_init() +{ if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then extract_kata_env else @@ -225,17 +230,18 @@ common_init(){ fi } - # Save the current KSM settings so we can restore them later -save_ksm_settings(){ - echo "saving KSM settings" +function save_ksm_settings() +{ + info "saving KSM settings" ksm_stored_run=$(cat ${KSM_ENABLE_FILE}) ksm_stored_pages=$(cat ${KSM_ENABLE_FILE}) ksm_stored_sleep=$(cat ${KSM_ENABLE_FILE}) } -set_ksm_aggressive(){ - echo "setting KSM to aggressive mode" +function set_ksm_aggressive() +{ + info "setting KSM to aggressive mode" # Flip the run off/on to ensure a restart/rescan sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}" sudo bash -c "echo ${KSM_AGGRESIVE_PAGES} > ${KSM_PAGES_FILE}" @@ -245,7 +251,7 @@ set_ksm_aggressive(){ if [ "${KATA_HYPERVISOR}" == "qemu" ]; then # Disable virtio-fs and save whether it was enabled previously set_virtio_out=$(sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-9p) - echo "${set_virtio_out}" + info "${set_virtio_out}" grep -q "already" <<< "${set_virtio_out}" || 
was_virtio_fs=true; fi } @@ -256,8 +262,9 @@ restore_virtio_fs(){ info "Not restoring virtio-fs since it wasn't enabled previously" } -restore_ksm_settings(){ - echo "restoring KSM settings" +function restore_ksm_settings() +{ + info "restoring KSM settings" # First turn off the run to ensure if we are then re-enabling # that any changes take effect sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}" @@ -267,15 +274,17 @@ restore_ksm_settings(){ [ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs } -disable_ksm(){ - echo "disabling KSM" +function disable_ksm() +{ + info "disabling KSM" sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}" [ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs } # See if KSM is enabled. # If so, amend the test name to reflect that -check_for_ksm(){ +function check_for_ksm() +{ if [ ! -f ${KSM_ENABLE_FILE} ]; then return fi @@ -294,7 +303,8 @@ check_for_ksm(){ # a full scan has managed to do few new merges) # # arg1 - timeout in seconds -wait_ksm_settle(){ +function wait_ksm_settle() +{ [[ "$RUNTIME" == "runc" ]] || [[ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]] && return local t pcnt local oldscan=-1 newscan @@ -305,7 +315,7 @@ wait_ksm_settle(){ # Wait some time for KSM to kick in to avoid early dismissal for ((t=0; t<5; t++)); do pages=$(cat "${KSM_PAGES_SHARED}") - [[ "$pages" -ne 0 ]] && echo "Discovered KSM activity" && break + [[ "$pages" -ne 0 ]] && info "Discovered KSM activity" && break sleep 1 done @@ -315,13 +325,13 @@ wait_ksm_settle(){ newscan=$(cat /sys/kernel/mm/ksm/full_scans) newpages=$(cat "${KSM_PAGES_SHARED}") - [[ "$newpages" -eq 0 ]] && echo "No need to wait for KSM to settle" && return + [[ "$newpages" -eq 0 ]] && info "No need to wait for KSM to settle" && return if (( newscan != oldscan )); then - echo -e "\nnew full_scan ($oldscan to $newscan)" + info -e "\nnew full_scan ($oldscan to $newscan)" # Do we have a previous scan to compare with - echo "check pages $oldpages to $newpages" + info "check pages $oldpages to 
$newpages" if (( oldpages != -1 )); then # avoid divide by zero problems @@ -330,14 +340,14 @@ wait_ksm_settle(){ # abs() pcnt=$(( $pcnt * -1 )) - echo "$oldpages to $newpages is ${pcnt}%" + info "$oldpages to $newpages is ${pcnt}%" if (( $pcnt <= 5 )); then - echo "KSM stabilised at ${t}s" + info "KSM stabilised at ${t}s" return fi else - echo "$oldpages KSM pages... waiting" + info "$oldpages KSM pages... waiting" fi fi oldscan=$newscan @@ -347,7 +357,7 @@ wait_ksm_settle(){ fi sleep 1 done - echo "Timed out after ${1}s waiting for KSM to settle" + info "Timed out after ${1}s waiting for KSM to settle" } common_init From cc3993d860339f4fcf669429a9e5948755dfaa75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 11:23:17 +0200 Subject: [PATCH 54/87] gha: Pass event specific info from the caller workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's ensure we're not relying, on any of the called workflows, on event specific information. Right now, the two information we've been relying on are: * PR number, coming from github.event.pull_request.number * Commit hash, coming from github.event.pull_request.head.sha As we want to, in the future, add nightly jobs, which will be triggered by a different event (thus, having different fields populated), we should ensure that those are not used unless it's in the "top action" that's trigerred by the event. 
Signed-off-by: Fabiano Fidêncio --- .../build-kata-static-tarball-amd64.yaml | 7 +++-- .../build-kata-static-tarball-arm64.yaml | 7 +++-- .../build-kata-static-tarball-s390x.yaml | 7 +++-- .github/workflows/ci-on-push.yaml | 27 +++++++++++++------ .github/workflows/payload-after-push.yaml | 9 +++++++ .../publish-kata-deploy-payload-amd64.yaml | 5 +++- .../publish-kata-deploy-payload-arm64.yaml | 5 +++- .../publish-kata-deploy-payload-s390x.yaml | 5 +++- .github/workflows/run-k8s-tests-on-aks.yaml | 10 +++++-- .github/workflows/run-k8s-tests-on-sev.yaml | 5 +++- .github/workflows/run-k8s-tests-on-snp.yaml | 5 +++- .github/workflows/run-k8s-tests-on-tdx.yaml | 5 +++- .github/workflows/run-metrics.yaml | 5 +++- 13 files changed, 79 insertions(+), 23 deletions(-) diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml index b5c7584fe..419db1b64 100644 --- a/.github/workflows/build-kata-static-tarball-amd64.yaml +++ b/.github/workflows/build-kata-static-tarball-amd64.yaml @@ -13,6 +13,9 @@ on: required: false type: string default: no + commit-hash: + required: false + type: string jobs: build-asset: @@ -60,7 +63,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} fetch-depth: 0 # This is needed in order to keep the commit ids history - name: Build ${{ matrix.asset }} @@ -88,7 +91,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-artifacts uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/build-kata-static-tarball-arm64.yaml b/.github/workflows/build-kata-static-tarball-arm64.yaml index 1fc981733..2ad97a0ba 100644 --- a/.github/workflows/build-kata-static-tarball-arm64.yaml +++ b/.github/workflows/build-kata-static-tarball-arm64.yaml @@ -9,6 +9,9 @@ on: required: false type: string default: no + commit-hash: + 
 required: false + type: string jobs: build-asset: @@ -41,7 +44,7 @@ - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} fetch-depth: 0 # This is needed in order to keep the commit ids history - name: Build ${{ matrix.asset }} run: | @@ -72,7 +75,7 @@ - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-artifacts uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/build-kata-static-tarball-s390x.yaml b/.github/workflows/build-kata-static-tarball-s390x.yaml index 58186ab8c..cf2831033 100644 --- a/.github/workflows/build-kata-static-tarball-s390x.yaml +++ b/.github/workflows/build-kata-static-tarball-s390x.yaml @@ -9,6 +9,9 @@ on: required: false type: string default: no + commit-hash: + required: false + type: string jobs: build-asset: @@ -37,7 +40,7 @@ - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} fetch-depth: 0 # This is needed in order to keep the commit ids history - name: Build ${{ matrix.asset }} run: | @@ -69,7 +72,7 @@ - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-artifacts uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 9f7c82eaf..8a7aa9673 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -12,23 +12,28 @@ on: - synchronize - reopened - labeled +env: + COMMIT_HASH: ${{ github.event.pull_request.head.sha }} + PR_NUMBER: ${{ github.event.pull_request.number }} jobs: build-kata-static-tarball-amd64: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: - tarball-suffix: -${{ github.event.pull_request.number}}-${{ 
github.event.pull_request.head.sha }} + tarball-suffix: -${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }} + commit-hash: ${{ env.COMMIT_HASH }} publish-kata-deploy-payload-amd64: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: build-kata-static-tarball-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: - tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }} + tarball-suffix: -${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }} registry: ghcr.io repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 + commit-hash: ${{ env.COMMIT_HASH }} secrets: inherit run-k8s-tests-on-aks: @@ -38,7 +43,8 @@ jobs: with: registry: ghcr.io repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 + commit-hash: ${{ env.COMMIT_HASH }} secrets: inherit run-k8s-tests-on-sev: @@ -48,7 +54,8 @@ jobs: with: registry: ghcr.io repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 + commit-hash: ${{ env.COMMIT_HASH }} run-k8s-tests-on-snp: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} @@ -57,7 +64,9 @@ jobs: with: registry: ghcr.io repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 + pr-number: ${{ env.PR_NUMBER }} + commit-hash: ${{ env.COMMIT_HASH }} run-k8s-tests-on-tdx: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} @@ -66,11 +75,13 @@ jobs: with: registry: 
ghcr.io repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 + commit-hash: ${{ env.COMMIT_HASH }} run-metrics-tests: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: build-kata-static-tarball-amd64 uses: ./.github/workflows/run-metrics.yaml with: - tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }} + tarball-suffix: -${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 + commit-hash: ${{ env.COMMIT_HASH }} diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index 97bb309b1..31c0b70bf 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -5,22 +5,28 @@ on: - main - stable-* +env: + COMMIT_HASH: $GITHUB_REF + jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: + commit-hash: ${{ env.COMMIT_HASH }} push-to-registry: yes secrets: inherit build-assets-arm64: uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml with: + commit-hash: ${{ env.COMMIT_HASH }} push-to-registry: yes secrets: inherit build-assets-s390x: uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml with: + commit-hash: ${{ env.COMMIT_HASH }} push-to-registry: yes secrets: inherit @@ -28,6 +34,7 @@ jobs: needs: build-assets-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: + commit-hash: ${{ env.COMMIT_HASH }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-amd64 @@ -37,6 +44,7 @@ jobs: needs: build-assets-arm64 uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml with: + commit-hash: ${{ env.COMMIT_HASH }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-arm64 @@ -46,6 +54,7 @@ jobs: needs: build-assets-s390x uses: 
./.github/workflows/publish-kata-deploy-payload-s390x.yaml with: + commit-hash: ${{ env.COMMIT_HASH }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-s390x diff --git a/.github/workflows/publish-kata-deploy-payload-amd64.yaml b/.github/workflows/publish-kata-deploy-payload-amd64.yaml index 91c7a0612..b5ba900d8 100644 --- a/.github/workflows/publish-kata-deploy-payload-amd64.yaml +++ b/.github/workflows/publish-kata-deploy-payload-amd64.yaml @@ -14,6 +14,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: kata-payload: @@ -21,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 diff --git a/.github/workflows/publish-kata-deploy-payload-arm64.yaml b/.github/workflows/publish-kata-deploy-payload-arm64.yaml index c4fd32477..6c35ed8a3 100644 --- a/.github/workflows/publish-kata-deploy-payload-arm64.yaml +++ b/.github/workflows/publish-kata-deploy-payload-arm64.yaml @@ -14,6 +14,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: kata-payload: @@ -25,7 +28,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 diff --git a/.github/workflows/publish-kata-deploy-payload-s390x.yaml b/.github/workflows/publish-kata-deploy-payload-s390x.yaml index 2a0ea8071..ee7fa3fd7 100644 --- a/.github/workflows/publish-kata-deploy-payload-s390x.yaml +++ b/.github/workflows/publish-kata-deploy-payload-s390x.yaml @@ -14,6 +14,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: kata-payload: @@ -25,7 +28,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: 
actions/download-artifact@v3 diff --git a/.github/workflows/run-k8s-tests-on-aks.yaml b/.github/workflows/run-k8s-tests-on-aks.yaml index a39c2bbcd..d8658270a 100644 --- a/.github/workflows/run-k8s-tests-on-aks.yaml +++ b/.github/workflows/run-k8s-tests-on-aks.yaml @@ -11,6 +11,12 @@ on: tag: required: true type: string + pr-number: + required: true + type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -31,13 +37,13 @@ jobs: DOCKER_REGISTRY: ${{ inputs.registry }} DOCKER_REPO: ${{ inputs.repo }} DOCKER_TAG: ${{ inputs.tag }} - GH_PR_NUMBER: ${{ github.event.pull_request.number }} + GH_PR_NUMBER: ${{ inputs.pr-number }} KATA_HOST_OS: ${{ matrix.host_os }} KATA_HYPERVISOR: ${{ matrix.vmm }} steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Download Azure CLI run: bash tests/integration/gha-run.sh install-azure-cli diff --git a/.github/workflows/run-k8s-tests-on-sev.yaml b/.github/workflows/run-k8s-tests-on-sev.yaml index 52ab7f955..3fc4ca835 100644 --- a/.github/workflows/run-k8s-tests-on-sev.yaml +++ b/.github/workflows/run-k8s-tests-on-sev.yaml @@ -11,6 +11,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -29,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Run tests timeout-minutes: 30 diff --git a/.github/workflows/run-k8s-tests-on-snp.yaml b/.github/workflows/run-k8s-tests-on-snp.yaml index 535c6de6d..8aa1763d2 100644 --- a/.github/workflows/run-k8s-tests-on-snp.yaml +++ b/.github/workflows/run-k8s-tests-on-snp.yaml @@ -11,6 +11,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -29,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: 
Run tests timeout-minutes: 30 diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml index 886b1c026..ccbc16db7 100644 --- a/.github/workflows/run-k8s-tests-on-tdx.yaml +++ b/.github/workflows/run-k8s-tests-on-tdx.yaml @@ -11,6 +11,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -29,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Run tests timeout-minutes: 30 diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml index d8b8c3976..f55c0b3dd 100644 --- a/.github/workflows/run-metrics.yaml +++ b/.github/workflows/run-metrics.yaml @@ -5,6 +5,9 @@ on: tarball-suffix: required: false type: string + commit-hash: + required: false + type: string jobs: run-metrics: @@ -20,7 +23,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 From 106e305717c228576994ab08bb6641d9da7f2aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 11:39:06 +0200 Subject: [PATCH 55/87] gha: Create a re-usable `ci.yaml` file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is based on the `ci-on-push.yaml` file, and it's called from there. The reason to split on a new file is that we can easily introduce a `ci-nightly.yaml` file and re-use the `ci.yaml` file there as well. 
Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-on-push.yaml | 67 ++----------------------- .github/workflows/ci.yaml | 81 +++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 64 deletions(-) create mode 100644 .github/workflows/ci.yaml diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 8a7aa9673..1b10f40f6 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -17,71 +17,10 @@ env: PR_NUMBER: ${{ github.event.pull_requesst.number }} jobs: - build-kata-static-tarball-amd64: + kata-containers-ci-on-push: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml + uses: ./.github/workflows/ci.yaml with: - tarball-suffix: -${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }} commit-hash: ${{ env.COMMIT_HASH }} - - publish-kata-deploy-payload-amd64: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: build-kata-static-tarball-amd64 - uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml - with: - tarball-suffix: -${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }} - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 - commit-hash: ${{ env.COMMIT_HASH }} - secrets: inherit - - run-k8s-tests-on-aks: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-aks.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 - commit-hash: ${{ env.COMMIT_HASH }} - secrets: inherit - - run-k8s-tests-on-sev: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-sev.yaml - with: - registry: ghcr.io - 
repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 - commit-hash: ${{ env.COMMIT_HASH }} - - run-k8s-tests-on-snp: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-snp.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 pr-number: ${{ env.PR_NUMBER }} - commit-hash: ${{ env.COMMIT_HASH }} - - run-k8s-tests-on-tdx: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 - commit-hash: ${{ env.COMMIT_HASH }} - - run-metrics-tests: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: build-kata-static-tarball-amd64 - uses: ./.github/workflows/run-metrics.yaml - with: - tarball-suffix: -${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-amd64 - commit-hash: ${{ env.COMMIT_HASH }} + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..7eb1491a5 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,81 @@ +name: Run the Kata Containers CI +on: + workflow_call: + inputs: + commit-hash: + required: true + type: string + pr-number: + required: true + type: string + tag: + required: true + type: string + +jobs: + build-kata-static-tarball-amd64: + uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml + with: + tarball-suffix: -${{ inputs.tag }} + commit-hash: ${{ inputs.commit-hash }} + + publish-kata-deploy-payload-amd64: + needs: build-kata-static-tarball-amd64 + uses: 
./.github/workflows/publish-kata-deploy-payload-amd64.yaml + with: + tarball-suffix: -${{ inputs.tag }} + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + secrets: inherit + + run-k8s-tests-on-aks: + if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-aks.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + secrets: inherit + + run-k8s-tests-on-sev: + if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-sev.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + + run-k8s-tests-on-snp: + if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-snp.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + pr-number: ${{ inputs.pr-number }} + commit-hash: ${{ inputs.commit-hash }} + + run-k8s-tests-on-tdx: + if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + + run-metrics-tests: + if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} + needs: build-kata-static-tarball-amd64 + uses: ./.github/workflows/run-metrics.yaml + with: + tarball-suffix: -${{ inputs.tag }}-amd64 + commit-hash: ${{ 
inputs.commit-hash }} From 7c0de8703cae2204bcd9f96a6838e83db484be38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 5 Jul 2023 19:50:32 +0200 Subject: [PATCH 56/87] gha: k8s: Ensure tests are running on a specific namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's make sure we run our tests in a specific namespace, as in case of any kind of issue, we will just get rid of the namespace itself, which will take care of cleaning up any leftover from failing tests. One important thing to mention is why we can get rid of the `namespace: ${namespace}` on the tests that are already using it, and let's do it in parts: * namespace: default We can easily get rid of this as that's the default namespace where pods are created, so it was a no-op so far. * namespace: test-quota-ns My understanding is that we'd need this in order to get a clean namespace where we'd be setting a quota for. Doing this in the namespace that's only used for tests should **not** cause any side-effect on the tests, as we're running those in serial and there's no other pods running on the `kata-containers-k8s-tests` namespace Last but not least, we're not dynamically creating namespaces as the tests are not running in parallel, **never**, not in the case of having 2 tests being ran at same time, neither in the case of having 2 jobs being scheduled to the same machine. 
Fixes: #6864 Signed-off-by: Fabiano Fidêncio --- tests/integration/gha-run.sh | 14 ++++++++++++++ tests/integration/kubernetes/k8s-pod-quota.bats | 6 ++---- .../runtimeclass_workloads/pod-custom-dns.yaml | 1 - .../kubernetes/runtimeclass_workloads/pod-oom.yaml | 1 - .../pod-quota-deployment.yaml | 1 - .../runtimeclass_workloads/resource-quota.yaml | 1 - .../runtimeclass_workloads/tests-namespace.yaml | 4 ++++ 7 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 103ce2cda..11499283f 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -58,6 +58,12 @@ function get_cluster_credentials() { function run_tests() { platform="${1}" + # Ensure we're in the default namespace + kubectl config set-context --current --namespace=default + + # Delete any spurious tests namespace that was left behind + kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true + sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image" @@ -80,6 +86,10 @@ function run_tests() { sleep 60s fi + # Create a new namespace for the tests and switch to it + kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml + kubectl config set-context --current --namespace=kata-containers-k8s-tests + pushd "${integration_dir}/kubernetes" bash setup.sh bash run_kubernetes_tests.sh @@ -89,6 +99,10 @@ function run_tests() { function cleanup() { platform="${1}" + # Switch back to the default namespace and delete the tests 
one + kubectl config set-context --current --namespace=default + kubectl delete namespace kata-containers-k8s-tests + if [ "${platform}" = "tdx" ]; then deploy_spec="-k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s"" cleanup_spec="-k "${tools_dir}/packaging/kata-deploy/kata-cleanup/overlays/k3s"" diff --git a/tests/integration/kubernetes/k8s-pod-quota.bats b/tests/integration/kubernetes/k8s-pod-quota.bats index addc37bb3..d9a527725 100644 --- a/tests/integration/kubernetes/k8s-pod-quota.bats +++ b/tests/integration/kubernetes/k8s-pod-quota.bats @@ -14,13 +14,12 @@ setup() { @test "Pod quota" { resource_name="pod-quota" deployment_name="deploymenttest" - namespace="test-quota-ns" # Create the resourcequota kubectl create -f "${pod_config_dir}/resource-quota.yaml" # View information about resourcequota - kubectl get -n "$namespace" resourcequota "$resource_name" \ + kubectl get resourcequota "$resource_name" \ --output=yaml | grep 'pods: "2"' # Create deployment @@ -28,10 +27,9 @@ setup() { # View deployment kubectl wait --for=condition=Available --timeout=$timeout \ - -n "$namespace" deployment/${deployment_name} + deployment/${deployment_name} } teardown() { - kubectl delete -n "$namespace" deployment "$deployment_name" kubectl delete -f "${pod_config_dir}/resource-quota.yaml" } diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml index 680577a5f..6341b0b1f 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml @@ -6,7 +6,6 @@ apiVersion: v1 kind: Pod metadata: - namespace: default name: custom-dns-test spec: terminationGracePeriodSeconds: 0 diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml index 672c54e68..90fc28667 100644 --- 
a/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml @@ -8,7 +8,6 @@ apiVersion: v1 kind: Pod metadata: name: pod-oom - namespace: default spec: runtimeClassName: kata restartPolicy: Never diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml index ecdaf5e64..9383349d7 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml @@ -7,7 +7,6 @@ apiVersion: apps/v1 kind: Deployment metadata: name: deploymenttest - namespace: test-quota-ns spec: selector: matchLabels: diff --git a/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml b/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml index a8d84d9ad..8ae0a1998 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml @@ -14,7 +14,6 @@ items: kind: ResourceQuota metadata: name: pod-quota - namespace: test-quota-ns spec: hard: pods: "2" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml b/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml new file mode 100644 index 000000000..916003d13 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kata-containers-k8s-tests From e067d183336ce0bb4f79d922787d44bc1e6144f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 24 May 2023 17:31:58 +0200 Subject: [PATCH 57/87] gha: Add a nightly CI job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea is to mimic what's been done with Jenkins and 
the "Green CI" effort, but now using our GHA and the GHA infrastructure. Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 20 ++++++++++++++++++++ .github/workflows/ci.yaml | 5 ----- 2 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/ci-nightly.yaml diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml new file mode 100644 index 000000000..d3a604e50 --- /dev/null +++ b/.github/workflows/ci-nightly.yaml @@ -0,0 +1,20 @@ +name: Kata Containers Nightly CI +on: + schedule: + cron: '0 0 * * *' + +env: + COMMIT_HASH: ${GITHUB_REF} + +jobs: + set-fake-pr-number: + runs-on: ubuntu-latest + run: | + echo "PR_NUMBER=$(date +%Y%m%d%H%M%S)" >> "$GITHUB_ENV" + + kata-containers-ci-on-push: + uses: ./.github/workflows/ci.yaml + with: + commit-hash: ${{ env.COMMIT_HASH }} + pr-number: ${{ env.PR_NUMBER }} + tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-nightly diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7eb1491a5..7a1bcdd22 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -31,7 +31,6 @@ jobs: secrets: inherit run-k8s-tests-on-aks: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: publish-kata-deploy-payload-amd64 uses: ./.github/workflows/run-k8s-tests-on-aks.yaml with: @@ -42,7 +41,6 @@ jobs: secrets: inherit run-k8s-tests-on-sev: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: publish-kata-deploy-payload-amd64 uses: ./.github/workflows/run-k8s-tests-on-sev.yaml with: @@ -52,7 +50,6 @@ jobs: commit-hash: ${{ inputs.commit-hash }} run-k8s-tests-on-snp: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: publish-kata-deploy-payload-amd64 uses: ./.github/workflows/run-k8s-tests-on-snp.yaml with: @@ -63,7 +60,6 @@ jobs: commit-hash: ${{ inputs.commit-hash }} run-k8s-tests-on-tdx: - if: ${{ 
contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: publish-kata-deploy-payload-amd64 uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml with: @@ -73,7 +69,6 @@ jobs: commit-hash: ${{ inputs.commit-hash }} run-metrics-tests: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} needs: build-kata-static-tarball-amd64 uses: ./.github/workflows/run-metrics.yaml with: From 6acce83e12ca049d489b5ad51a8fd78455696221 Mon Sep 17 00:00:00 2001 From: Gabriela Cervantes Date: Wed, 5 Jul 2023 19:27:24 +0000 Subject: [PATCH 58/87] metrics: Fix the call to check_metrics function This PR fixes the call to check_metrics function as KATA_HYPERVISOR is not needed to be passed. Signed-off-by: Gabriela Cervantes --- .../ci_worker/checkmetrics-json-clh-kata-metric8.toml | 8 ++++---- .../ci_worker/checkmetrics-json-qemu-kata-metric8.toml | 8 ++++---- tests/metrics/gha-run.sh | 5 ++--- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 4ac4f5cf7..562b2c83b 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -17,8 +17,8 @@ description = "measure container lifecycle timings" checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" checktype = "mean" midval = 0.42 -minpercent = 15.0 -maxpercent = 15.0 +minpercent = 20.0 +maxpercent = 20.0 [[metric]] name = "memory-footprint" @@ -30,5 +30,5 @@ description = "measure memory usage" checkvar = ".\"memory-footprint\".Results | .[] | .average.Result" checktype = "mean" midval = 2518364.00 -minpercent = 15.0 -maxpercent = 15.0 +minpercent = 20.0 +maxpercent = 20.0 diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml 
b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index 102f7d34d..c6bc85147 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -17,8 +17,8 @@ description = "measure container lifecycle timings" checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" checktype = "mean" midval = 0.61 -minpercent = 15.0 -maxpercent = 15.0 +minpercent = 20.0 +maxpercent = 20.0 [[metric]] name = "memory-footprint" @@ -30,5 +30,5 @@ description = "measure memory usage" checkvar = ".\"memory-footprint\".Results | .[] | .average.Result" checktype = "mean" midval = 2435844.00 -minpercent = 15.0 -maxpercent = 15.0 +minpercent = 20.0 +maxpercent = 20.0 diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index a2c531728..14317211e 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -95,7 +95,6 @@ function check_containerd_config_for_kata() { } function check_metrics() { - KATA_HYPERVISOR="${1}" local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml" checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}" cm_result=$? 
@@ -109,8 +108,6 @@ function run_test_launchtimes() { create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 - - check_metrics "${KATA_HYPERVISOR}" } function run_test_memory_usage() { @@ -118,6 +115,8 @@ function run_test_memory_usage() { create_symbolic_links bash tests/metrics/density/memory_usage.sh 20 5 + + check_metrics } function run_test_memory_usage_inside_container() { From de83cd9de79226b15519e07871dcdee635d501c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 21:46:44 +0200 Subject: [PATCH 59/87] gha: ci: Use $VAR instead of ${{ env.VAR }} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise we'll get the following error from the workflow: ``` The workflow is not valid. .github/workflows/ci-on-push.yaml (Line: 24, Col: 20): Unrecognized named-value: 'env'. Located at position 1 within expression: env.COMMIT_HASH .github/workflows/ci-on-push.yaml (Line: 25, Col: 18): Unrecognized named-value: 'env'. 
Located at position 1 within expression: env.PR_NUMBER ``` Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 6 +++--- .github/workflows/ci-on-push.yaml | 6 +++--- .github/workflows/payload-after-push.yaml | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index d3a604e50..83c3a1726 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -15,6 +15,6 @@ jobs: kata-containers-ci-on-push: uses: ./.github/workflows/ci.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} - pr-number: ${{ env.PR_NUMBER }} - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }}-nightly + commit-hash: $COMMIT_HASH + pr-number: $PR_NUMBER + tag: $PR_NUMBER-$COMMIT_HASH-nightly diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 1b10f40f6..539829967 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -21,6 +21,6 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} uses: ./.github/workflows/ci.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} - pr-number: ${{ env.PR_NUMBER }} - tag: ${{ env.PR_NUMBER }}-${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH + pr-number: $PR_NUMBER + tag: $PR_NUMBER-$COMMIT_HASH diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index 31c0b70bf..fabd987b7 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -12,21 +12,21 @@ jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH push-to-registry: yes secrets: inherit build-assets-arm64: uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH push-to-registry: yes secrets: inherit 
build-assets-s390x: uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH push-to-registry: yes secrets: inherit @@ -34,7 +34,7 @@ jobs: needs: build-assets-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-amd64 @@ -44,7 +44,7 @@ jobs: needs: build-assets-arm64 uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-arm64 @@ -54,7 +54,7 @@ jobs: needs: build-assets-s390x uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml with: - commit-hash: ${{ env.COMMIT_HASH }} + commit-hash: $COMMIT_HASH registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-s390x From 5c0269dc5aa00133809943f6d32bb6908a93f7b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 21:50:21 +0200 Subject: [PATCH 60/87] gha: ci: Add pr-number input to the correct job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It must have been an input for the AKS jobs, not the SNP one. 
Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7a1bcdd22..1ceef41fc 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -38,6 +38,7 @@ jobs: repo: ${{ github.repository_owner }}/kata-deploy-ci tag: ${{ inputs.tag }}-amd64 commit-hash: ${{ inputs.commit-hash }} + pr-number: ${{ inputs.pr-number }} secrets: inherit run-k8s-tests-on-sev: @@ -56,7 +57,6 @@ jobs: registry: ghcr.io repo: ${{ github.repository_owner }}/kata-deploy-ci tag: ${{ inputs.tag }}-amd64 - pr-number: ${{ inputs.pr-number }} commit-hash: ${{ inputs.commit-hash }} run-k8s-tests-on-tdx: From 8a0a66655da325355b3d04f4add434e8b96f4c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 21:54:33 +0200 Subject: [PATCH 61/87] gha: ci: schedule expects a list, not a map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And because of that we need to declare '- cron', instead of 'cron'. Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 83c3a1726..1096d27e4 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -1,7 +1,7 @@ name: Kata Containers Nightly CI on: schedule: - cron: '0 0 * * *' + - cron: '0 0 * * *' env: COMMIT_HASH: ${GITHUB_REF} From ddf4afb96148c85e90e8ab1f90323108bb7ad832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 21:57:26 +0200 Subject: [PATCH 62/87] gha: ci: Fix set-fake-pr-number job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has to have steps declared, and we need to make it a dependency for the nightly kata-containers-ci-on-push job. 
Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 1096d27e4..56bb5c959 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -9,10 +9,13 @@ env: jobs: set-fake-pr-number: runs-on: ubuntu-latest - run: | - echo "PR_NUMBER=$(date +%Y%m%d%H%M%S)" >> "$GITHUB_ENV" + steps: + - name: Set fake PR number + run: | + echo "PR_NUMBER=$(date +%Y%m%d%H%M%S)" >> "$GITHUB_ENV" kata-containers-ci-on-push: + needs: set-fake-pr-number uses: ./.github/workflows/ci.yaml with: commit-hash: $COMMIT_HASH From 1a7bbcd398fa50683e8168c821f4fd72ecaccbd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 22:29:00 +0200 Subject: [PATCH 63/87] gha: ci: Fix typo pull_requesst -> pull_request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks David Esparza for pointing this one out. 
Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-on-push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 539829967..eccd2b384 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -14,7 +14,7 @@ on: - labeled env: COMMIT_HASH: ${{ github.event.pull_request.head.sha }} - PR_NUMBER: ${{ github.event.pull_requesst.number }} + PR_NUMBER: ${{ github.event.pull_request.number }} jobs: kata-containers-ci-on-push: From c45f646b9d57c14169b61bbff6fe9035878fc484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Bombo?= Date: Wed, 5 Jul 2023 14:45:09 -0700 Subject: [PATCH 64/87] gha: k8s: Ensure cluster doesn't exist before creating it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cluster cleanup step will sometimes fail to run, meaning the next run would fail in the cluster creation step. This PR addresses that. Example: https://github.com/kata-containers/kata-containers/actions/runs/5349582743/jobs/9867845852 Fixes: #7242 Signed-off-by: Aurélien Bombo --- tests/integration/gha-run.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 103ce2cda..a9a084e49 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -31,6 +31,9 @@ function login_azure() { } function create_cluster() { + # First, ensure that the cluster didn't fail to get cleaned up from a previous run. 
+ delete_cluster || true + az aks create \ -g "kataCI" \ -n "$(_print_cluster_name)" \ @@ -115,8 +118,7 @@ function delete_cluster() { az aks delete \ -g "kataCI" \ -n "$(_print_cluster_name)" \ - --yes \ - --no-wait + --yes } function main() { From 11e3ccfa4d0d1053d9e6de9a0f1587674b6be3e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 23:59:18 +0200 Subject: [PATCH 65/87] gha: ci: Avoid using env unless it's really needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit de83cd9de79226b15519e07871dcdee635d501c6 tried to solve an issue, but it clearly seems that I'm using env wrongly, as what ended up being passed as input was "$VAR", instead of the content of the VAR variable. As we can simply avoid using those here, let's do it and save us a headache. Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 9 +++------ .github/workflows/ci-on-push.yaml | 10 +++------- .github/workflows/payload-after-push.yaml | 15 ++++++--------- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 56bb5c959..81380a7d1 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -3,9 +3,6 @@ on: schedule: - cron: '0 0 * * *' -env: - COMMIT_HASH: ${GITHUB_REF} - jobs: set-fake-pr-number: runs-on: ubuntu-latest @@ -18,6 +15,6 @@ jobs: needs: set-fake-pr-number uses: ./.github/workflows/ci.yaml with: - commit-hash: $COMMIT_HASH - pr-number: $PR_NUMBER - tag: $PR_NUMBER-$COMMIT_HASH-nightly + commit-hash: ${GITHUB_REF} + pr-number: ${PR_NUMBER} + tag: ${PR_NUMBER}-${GITHUB_REF}-nightly diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index eccd2b384..d5549d458 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -12,15 +12,11 @@ on: - synchronize - reopened - labeled -env: - COMMIT_HASH: ${{ 
github.event.pull_request.head.sha }} - PR_NUMBER: ${{ github.event.pull_request.number }} - jobs: kata-containers-ci-on-push: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} uses: ./.github/workflows/ci.yaml with: - commit-hash: $COMMIT_HASH - pr-number: $PR_NUMBER - tag: $PR_NUMBER-$COMMIT_HASH + commit-hash: ${{ github.event.pull_request.sha }} + pr-number: ${{ github.event.pull_request.number }} + tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index fabd987b7..2e4e71d4b 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -5,28 +5,25 @@ on: - main - stable-* -env: - COMMIT_HASH: $GITHUB_REF - jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: - commit-hash: $COMMIT_HASH + commit-hash: ${GITHUB_REF} push-to-registry: yes secrets: inherit build-assets-arm64: uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml with: - commit-hash: $COMMIT_HASH + commit-hash: ${GITHUB_REF} push-to-registry: yes secrets: inherit build-assets-s390x: uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml with: - commit-hash: $COMMIT_HASH + commit-hash: ${GITHUB_REF} push-to-registry: yes secrets: inherit @@ -34,7 +31,7 @@ jobs: needs: build-assets-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: - commit-hash: $COMMIT_HASH + commit-hash: ${GITHUB_REF} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-amd64 @@ -44,7 +41,7 @@ jobs: needs: build-assets-arm64 uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml with: - commit-hash: $COMMIT_HASH + commit-hash: ${GITHUB_REF} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-arm64 @@ -54,7 +51,7 @@ jobs: needs: build-assets-s390x uses: 
./.github/workflows/publish-kata-deploy-payload-s390x.yaml with: - commit-hash: $COMMIT_HASH + commit-hash: ${GITHUB_REF} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-s390x From c5b4164cb1671cfa8af705149c6996d19d417ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 7 Jul 2023 11:47:51 +0200 Subject: [PATCH 66/87] gha: ci: Fix tarball-suffix passed to the metrics tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of passing "-${{ inputs.tag }}-amd64", we must only pass "-${{ inputs.tag }}". This is a regression introduced by 106e305717c228576994ab08bb6641d9da7f2aa6. Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1ceef41fc..faec7fca4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -72,5 +72,5 @@ jobs: needs: build-kata-static-tarball-amd64 uses: ./.github/workflows/run-metrics.yaml with: - tarball-suffix: -${{ inputs.tag }}-amd64 + tarball-suffix: -${{ inputs.tag }} commit-hash: ${{ inputs.commit-hash }} From 1d05b9cc71188d8fd8e105980445676385232306 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 7 Jul 2023 11:57:31 +0200 Subject: [PATCH 67/87] gha: ci: Pass down secrets to ci-on-push / ci-nightly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have to do this, otherwise we cannot log into azure. This is a regression introduced by 106e305717c228576994ab08bb6641d9da7f2aa6. 
Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 1 + .github/workflows/ci-on-push.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 81380a7d1..16f8d16d6 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -18,3 +18,4 @@ jobs: commit-hash: ${GITHUB_REF} pr-number: ${PR_NUMBER} tag: ${PR_NUMBER}-${GITHUB_REF}-nightly + secrets: inherit diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index d5549d458..418d04650 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -20,3 +20,4 @@ jobs: commit-hash: ${{ github.event.pull_request.sha }} pr-number: ${{ github.event.pull_request.number }} tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }} + secrets: inherit From 86904909aa8186d674d136cad8d0e6c70ec1cabc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 7 Jul 2023 14:40:24 +0200 Subject: [PATCH 68/87] gha: ci: Avoid using env also in the ci-nightly and payload-after-push MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The latter workflow is breaking as it doesn't recognise ${GITHUB_REF}, the former would most likely break as well, but it didn't get triggered yet. 
The error we're facing is: ``` Determining the checkout info /usr/bin/git branch --list --remote origin/${GITHUB_REF} /usr/bin/git tag --list ${GITHUB_REF} Error: A branch or tag with the name '${GITHUB_REF}' could not be found ``` Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 14 +++----------- .github/workflows/payload-after-push.yaml | 12 ++++++------ 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 16f8d16d6..9e36b9233 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -4,18 +4,10 @@ on: - cron: '0 0 * * *' jobs: - set-fake-pr-number: - runs-on: ubuntu-latest - steps: - - name: Set fake PR number - run: | - echo "PR_NUMBER=$(date +%Y%m%d%H%M%S)" >> "$GITHUB_ENV" - kata-containers-ci-on-push: - needs: set-fake-pr-number uses: ./.github/workflows/ci.yaml with: - commit-hash: ${GITHUB_REF} - pr-number: ${PR_NUMBER} - tag: ${PR_NUMBER}-${GITHUB_REF}-nightly + commit-hash: ${{ github.ref }} + pr-number: ${{ github.ref }} + tag: ${{ github.ref }}-nightly secrets: inherit diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index 2e4e71d4b..574c6bff2 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -9,21 +9,21 @@ jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: - commit-hash: ${GITHUB_REF} + commit-hash: ${{ github.ref }} push-to-registry: yes secrets: inherit build-assets-arm64: uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml with: - commit-hash: ${GITHUB_REF} + commit-hash: ${{ github.ref }} push-to-registry: yes secrets: inherit build-assets-s390x: uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml with: - commit-hash: ${GITHUB_REF} + commit-hash: ${{ github.ref }} push-to-registry: yes secrets: inherit @@ -31,7 +31,7 @@ jobs: 
needs: build-assets-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: - commit-hash: ${GITHUB_REF} + commit-hash: ${{ github.ref }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-amd64 @@ -41,7 +41,7 @@ jobs: needs: build-assets-arm64 uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml with: - commit-hash: ${GITHUB_REF} + commit-hash: ${{ github.ref }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-arm64 @@ -51,7 +51,7 @@ jobs: needs: build-assets-s390x uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml with: - commit-hash: ${GITHUB_REF} + commit-hash: ${{ github.ref }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-s390x From 0ad298895ee330912147603fa5627c2d4942b5c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Fri, 7 Jul 2023 17:52:39 +0200 Subject: [PATCH 69/87] gha: ci: Fix reference passed to checkout@v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On cc3993d860339f4fcf669429a9e5948755dfaa75 we introduced a regression, where we started passing inputs.commit-hash, instead of github.event.pull_request.head.sha. However, we have been setting commit-hash to github.event.pull_request.sha, meaning that we're missing a `.head.` there. github.event.pull_request.sha is empty for the pull_request_target event, leading the CI to pull the content from `main` instead of the content from the PR.
Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-on-push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 418d04650..76c4dc38d 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -17,7 +17,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} uses: ./.github/workflows/ci.yaml with: - commit-hash: ${{ github.event.pull_request.sha }} + commit-hash: ${{ github.event.pull_request.head.sha }} pr-number: ${{ github.event.pull_request.number }} tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }} secrets: inherit From 275c84e7b5af36ea0e5f984c012b4fa4ed06dd6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 6 Jun 2023 13:28:41 +0200 Subject: [PATCH 70/87] Revert "agent: fix the issue of exec hang with a backgroud process" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 25d2fb0fdecf5d76fcb8a4ecde5c54c0d4c3e240. The reason we're reverting the commit is because it to check whether it's the cause for the regression on devmapper tests. 
Fixes: #7253 Depends-on: github.com/kata-containers/tests#5705 Signed-off-by: Fabiano Fidêncio --- src/agent/rustjail/src/process.rs | 2 +- src/agent/src/rpc.rs | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/agent/rustjail/src/process.rs b/src/agent/rustjail/src/process.rs index cdecae130..0e7fe73ef 100644 --- a/src/agent/rustjail/src/process.rs +++ b/src/agent/rustjail/src/process.rs @@ -161,7 +161,7 @@ impl Process { pub fn notify_term_close(&mut self) { let notify = self.term_exit_notifier.clone(); - notify.notify_waiters(); + notify.notify_one(); } pub fn close_stdin(&mut self) { diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 8299ed34b..988cce5b7 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -595,16 +595,15 @@ impl AgentService { let cid = req.container_id; let eid = req.exec_id; - let term_exit_notifier; + let mut term_exit_notifier = Arc::new(tokio::sync::Notify::new()); let reader = { let s = self.sandbox.clone(); let mut sandbox = s.lock().await; let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?; - term_exit_notifier = p.term_exit_notifier.clone(); - if p.term_master.is_some() { + term_exit_notifier = p.term_exit_notifier.clone(); p.get_reader(StreamType::TermMaster) } else if stdout { if p.parent_stdout.is_some() { From a79505b6672f8f795947b2b265328aeaea955d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sat, 8 Jul 2023 01:02:51 +0200 Subject: [PATCH 71/87] gha: k8s: dragonball: Skip k8s-number-cpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's skip the k8s-number-cpus, as the test is currently failing. We've an issue opened for that, and we'll be working on re-enabling it as soon as possible. 
Reference: https://github.com/kata-containers/kata-containers/issues/7270 Fixes: #7253 Signed-off-by: Fabiano Fidêncio --- tests/integration/kubernetes/k8s-number-cpus.bats | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/kubernetes/k8s-number-cpus.bats b/tests/integration/kubernetes/k8s-number-cpus.bats index 338963f6d..1f4283a7d 100644 --- a/tests/integration/kubernetes/k8s-number-cpus.bats +++ b/tests/integration/kubernetes/k8s-number-cpus.bats @@ -9,6 +9,8 @@ load "${BATS_TEST_DIRNAME}/../../common.bash" load "${BATS_TEST_DIRNAME}/tests_common.sh" setup() { + [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, aee: https://github.com/kata-containers/kata-containers/issues/7270" + pod_name="cpu-test" container_name="c1" get_pod_config_dir @@ -16,6 +18,8 @@ setup() { # Skip on aarch64 due to missing cpu hotplug related functionality. @test "Check number of cpus" { + [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7270" + # Create pod kubectl create -f "${pod_config_dir}/pod-number-cpu.yaml" @@ -40,6 +44,8 @@ setup() { } teardown() { + [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7270" + # Debugging information kubectl describe "pod/$pod_name" From 828a7218383cdab795333b7e1df71a1647b960d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sat, 8 Jul 2023 11:01:19 +0200 Subject: [PATCH 72/87] gha: k8s: dragonball: Skip k8s-oom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's skip the k8s-oom, as the test is currently failing. We've an issue opened for that, and we'll be working on re-enabling it as soon as possible. 
Reference: https://github.com/kata-containers/kata-containers/issues/7271 Fixes: #7253 Signed-off-by: Fabiano Fidêncio --- tests/integration/kubernetes/k8s-oom.bats | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/kubernetes/k8s-oom.bats b/tests/integration/kubernetes/k8s-oom.bats index f89b761f8..5f45ad78a 100644 --- a/tests/integration/kubernetes/k8s-oom.bats +++ b/tests/integration/kubernetes/k8s-oom.bats @@ -9,11 +9,15 @@ load "${BATS_TEST_DIRNAME}/../../common.bash" load "${BATS_TEST_DIRNAME}/tests_common.sh" setup() { + [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7271" + pod_name="pod-oom" get_pod_config_dir } @test "Test OOM events for pods" { + [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7271" + # Create pod kubectl create -f "${pod_config_dir}/$pod_name.yaml" @@ -29,6 +33,8 @@ setup() { } teardown() { + [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7271" + # Debugging information kubectl describe "pod/$pod_name" kubectl get "pod/$pod_name" -o yaml From 38f0aaa516c22fb7d294e8799ad882c571ec3710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sat, 8 Jul 2023 14:43:41 +0200 Subject: [PATCH 73/87] Revert "gha: k8s: dragonball: Skip k8s-number-cpus" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit a79505b6672f8f795947b2b265328aeaea955d84. 
Signed-off-by: Fabiano Fidêncio --- tests/integration/kubernetes/k8s-number-cpus.bats | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration/kubernetes/k8s-number-cpus.bats b/tests/integration/kubernetes/k8s-number-cpus.bats index 1f4283a7d..338963f6d 100644 --- a/tests/integration/kubernetes/k8s-number-cpus.bats +++ b/tests/integration/kubernetes/k8s-number-cpus.bats @@ -9,8 +9,6 @@ load "${BATS_TEST_DIRNAME}/../../common.bash" load "${BATS_TEST_DIRNAME}/tests_common.sh" setup() { - [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, aee: https://github.com/kata-containers/kata-containers/issues/7270" - pod_name="cpu-test" container_name="c1" get_pod_config_dir @@ -18,8 +16,6 @@ setup() { # Skip on aarch64 due to missing cpu hotplug related functionality. @test "Check number of cpus" { - [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7270" - # Create pod kubectl create -f "${pod_config_dir}/pod-number-cpu.yaml" @@ -44,8 +40,6 @@ setup() { } teardown() { - [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7270" - # Debugging information kubectl describe "pod/$pod_name" From 96e9374d4b778f7fb542b50165d8e4a9165e3f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Sat, 8 Jul 2023 14:55:16 +0200 Subject: [PATCH 74/87] dragonball: Don't fail if a request asks for more CPUs than allowed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's take the same approach of the go runtime, instead, and allocate the maximum allowed number of vcpus instead. 
Fixes: #7270 Signed-off-by: Fabiano Fidêncio --- src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs index c7c9cb082..91f9406b8 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs @@ -341,7 +341,10 @@ impl DragonballInner { // cannot exceed maximum value if new_vcpus > self.config.cpu_info.default_maxvcpus { - return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus")); + warn!( + sl!(), + "Cannot allocate more vcpus than the max allowed number of vcpus. The maximum allowed amount of vcpus will be used instead."); + return Ok((current_vcpus, self.config.cpu_info.default_maxvcpus)); } Ok((current_vcpus, new_vcpus)) From 37a95567896a65873b45b94962b63c72c5eb4fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 10 Jul 2023 09:30:17 +0200 Subject: [PATCH 75/87] gha: ci: nightly: Use github.sha to get the last commit reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we need to pass down the commit sha to the jobs that will be triggered from the `schedule` event, we must be careful on what exactly we're using there. At first we were using ${{ github.ref }}, but this turns out to be the **branch name**, rather than the commit hash. 
In order to actually get the commit hash, Let's use ${{ github.sha }} instead, as described by https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows# Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 9e36b9233..9948bc910 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -7,7 +7,7 @@ jobs: kata-containers-ci-on-push: uses: ./.github/workflows/ci.yaml with: - commit-hash: ${{ github.ref }} - pr-number: ${{ github.ref }} - tag: ${{ github.ref }}-nightly + commit-hash: ${{ github.sha }} + pr-number: ${{ github.sha }} + tag: ${{ github.sha }}-nightly secrets: inherit From 0c1cbd01d8066c82e56bf204da1584af50318d02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 10 Jul 2023 09:38:25 +0200 Subject: [PATCH 76/87] gha: ci: after-push: Use github.sha to get the last commit reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As we need to pass down the commit sha to the jobs that will be triggered from the `push` event, we must be careful on what exactly we're using there. At first we were using ${{ github.ref }}, but this turns out to be the **branch name**, rather than the commit hash. In order to actually get the commit hash, Let's use ${{ github.sha }} instead. 
Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/payload-after-push.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index 574c6bff2..871d73388 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -9,21 +9,21 @@ jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: - commit-hash: ${{ github.ref }} + commit-hash: ${{ github.sha }} push-to-registry: yes secrets: inherit build-assets-arm64: uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml with: - commit-hash: ${{ github.ref }} + commit-hash: ${{ github.sha }} push-to-registry: yes secrets: inherit build-assets-s390x: uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml with: - commit-hash: ${{ github.ref }} + commit-hash: ${{ github.sha }} push-to-registry: yes secrets: inherit @@ -31,7 +31,7 @@ jobs: needs: build-assets-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: - commit-hash: ${{ github.ref }} + commit-hash: ${{ github.sha }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-amd64 @@ -41,7 +41,7 @@ jobs: needs: build-assets-arm64 uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml with: - commit-hash: ${{ github.ref }} + commit-hash: ${{ github.sha }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-arm64 @@ -51,7 +51,7 @@ jobs: needs: build-assets-s390x uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml with: - commit-hash: ${{ github.ref }} + commit-hash: ${{ github.sha }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-s390x From 28c29b248d173bd92c72e21bfb64d33d67c37fb2 Mon Sep 17 00:00:00 2001 From: Yushuo Date: Sun, 9 Jul 2023 22:17:10 +0800 Subject: [PATCH 77/87] bugfix: plus default_memory when calculating mem size 
We've noticed this caused regressions with the k8s-oom tests, and then decided to take a step back and do this in the same way it was done before 67972ec48a40434fd7c3883c4538cc1d328ede40. Moreover, this step back is also more reasonable in terms of the controlling logic. And by doing this we can re-enable the k8s-oom.bats tests, which is done as part of this PR. Fixes: #7271 Depends-on: github.com/kata-containers/tests#5705 Signed-off-by: Yushuo --- src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs | 7 ++++++- tests/integration/kubernetes/k8s-oom.bats | 6 ------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 53eccc52b..2a8b6e600 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -105,7 +105,12 @@ impl InitialSizeManager { hv.cpu_info.default_vcpus = self.resource.vcpu as i32 } if self.resource.mem_mb > 0 { - hv.memory_info.default_memory = self.resource.mem_mb; + // since the memory overhead introduced by kata-agent and system components + // will really affect the amount of memory the user can use, so we choose to + // plus the default_memory here, instead of overriding it. + // (if we override the default_memory here, and user applications still + // use memory as they originally expected, it would be easy to OOM.)
+ hv.memory_info.default_memory += self.resource.mem_mb; } Ok(()) } diff --git a/tests/integration/kubernetes/k8s-oom.bats b/tests/integration/kubernetes/k8s-oom.bats index 5f45ad78a..f89b761f8 100644 --- a/tests/integration/kubernetes/k8s-oom.bats +++ b/tests/integration/kubernetes/k8s-oom.bats @@ -9,15 +9,11 @@ load "${BATS_TEST_DIRNAME}/../../common.bash" load "${BATS_TEST_DIRNAME}/tests_common.sh" setup() { - [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7271" - pod_name="pod-oom" get_pod_config_dir } @test "Test OOM events for pods" { - [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7271" - # Create pod kubectl create -f "${pod_config_dir}/$pod_name.yaml" @@ -33,8 +29,6 @@ setup() { } teardown() { - [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "Test is currently failing, see: https://github.com/kata-containers/kata-containers/issues/7271" - # Debugging information kubectl describe "pod/$pod_name" kubectl get "pod/$pod_name" -o yaml From 1776b18fa08ef6f0d3a3f34c142ea8570cf21430 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 6 Jul 2023 21:35:14 +0200 Subject: [PATCH 78/87] gha: Do not run all the tests if only docs are updated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should not go through the trouble of running all our tests on AKS / Azure / baremetal machines in case a PR only changes our documentation. 
Fixes: #7258 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-on-push.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 76c4dc38d..6d4cc7fc0 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -12,6 +12,8 @@ on: - synchronize - reopened - labeled + paths-ignore: + - 'docs/**' jobs: kata-containers-ci-on-push: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} From 1363fbbf129bc10e18413fa93bee1d63033b83ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Mon, 10 Jul 2023 17:29:48 +0200 Subject: [PATCH 79/87] README: Add badge for our Nightly CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will help folks to monitor the history of the failing tests, as we've done in Jenkins with the "Green Effort CI". Fixes: #7279 Signed-off-by: Fabiano Fidêncio --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7ea956d9d..78a62179c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) +[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) [![Kata Containers Nightly CI](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml) # Kata Containers From b99ff302678f4436c7c06250c52506aaac7eb8fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: 
Tue, 11 Jul 2023 09:59:13 +0200 Subject: [PATCH 80/87] gha: nightly: Fix name size limit for AKS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passing the commit hash as the "pr-number" has shown problematic as it would make the AKS cluster name longer than what's accepted by AKS. One easy way to solve this is just passing "nightly" as the PR number, as that's only used to create the cluster. Fixes: #7247 Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 9948bc910..a5b82e8b6 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -8,6 +8,6 @@ jobs: uses: ./.github/workflows/ci.yaml with: commit-hash: ${{ github.sha }} - pr-number: ${{ github.sha }} + pr-number: "nightly" tag: ${{ github.sha }}-nightly secrets: inherit From d780cc08f4f83ae52574c902a0276fe0e5a9881f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Tue, 11 Jul 2023 10:42:40 +0200 Subject: [PATCH 81/87] gha: nightly: Also use `workflow_dispatch` to trigger it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a very nice suggestion from Steve Horsman, as with that we can manually trigger the workflow anytime we need to test it, instead of waiting for a full day for it to be retriggered via the `schedule` event. 
Signed-off-by: Fabiano Fidêncio --- .github/workflows/ci-nightly.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index a5b82e8b6..9a47ce0e4 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -2,6 +2,7 @@ name: Kata Containers Nightly CI on: schedule: - cron: '0 0 * * *' + workflow_dispatch: jobs: kata-containers-ci-on-push: From e17587b0232fe05945830ca8dcf93c1c18776284 Mon Sep 17 00:00:00 2001 From: stevenhorsman Date: Tue, 11 Jul 2023 14:27:53 +0100 Subject: [PATCH 82/87] Revert "tests: Build Mariner rootfs initrd" This reverts commit 532755ce31d069dafaa1427c55757f0973381647. --- tests/integration/gha-run.sh | 3 +- tests/integration/kubernetes/setup.sh | 1 - tools/packaging/guest-image/build_image.sh | 63 ++++++++++--------- .../local-build/kata-deploy-binaries.sh | 56 +++-------------- versions.yaml | 22 +++---- 5 files changed, 52 insertions(+), 93 deletions(-) diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 46225bd17..c5cd573d6 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -9,8 +9,7 @@ set -o nounset set -o pipefail integration_dir="$(dirname "$(readlink -f "$0")")" -repo_root_dir="$(cd "${integration_dir}/../../" && pwd)" -tools_dir="${repo_root_dir}/tools" +tools_dir="${integration_dir}/../../tools" function _print_cluster_name() { short_sha="$(git rev-parse --short=12 HEAD)" diff --git a/tests/integration/kubernetes/setup.sh b/tests/integration/kubernetes/setup.sh index 6984ad286..614f38827 100755 --- a/tests/integration/kubernetes/setup.sh +++ b/tests/integration/kubernetes/setup.sh @@ -8,7 +8,6 @@ set -o nounset set -o pipefail kubernetes_dir=$(dirname "$(readlink -f "$0")") -repo_root_dir="$(cd "${kubernetes_dir}/../../../" && pwd)" set_runtime_class() { sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" 
${kubernetes_dir}/runtimeclass_workloads/*.yaml diff --git a/tools/packaging/guest-image/build_image.sh b/tools/packaging/guest-image/build_image.sh index fad664651..230538d1c 100755 --- a/tools/packaging/guest-image/build_image.sh +++ b/tools/packaging/guest-image/build_image.sh @@ -22,44 +22,45 @@ readonly osbuilder_dir="$(cd "${repo_root_dir}/tools/osbuilder" && pwd)" export GOPATH=${GOPATH:-${HOME}/go} arch_target="$(uname -m)" -final_artifact_name="kata-containers" +final_image_name="kata-containers" +final_initrd_name="kata-containers-initrd" image_initrd_extension=".img" build_initrd() { info "Build initrd" - info "initrd os: $os_name" - info "initrd os version: $os_version" + info "initrd os: $initrd_distro" + info "initrd os version: $initrd_os_version" sudo -E PATH="$PATH" make initrd \ - DISTRO="$os_name" \ + DISTRO="$initrd_distro" \ DEBUG="${DEBUG:-}" \ - OS_VERSION="${os_version}" \ + OS_VERSION="${initrd_os_version}" \ ROOTFS_BUILD_DEST="${builddir}/initrd-image" \ USE_DOCKER=1 \ AGENT_INIT="yes" - mv "kata-containers-initrd.img" "${install_dir}/${artifact_name}" + mv "kata-containers-initrd.img" "${install_dir}/${initrd_name}" ( cd "${install_dir}" - ln -sf "${artifact_name}" "${final_artifact_name}${image_initrd_extension}" + ln -sf "${initrd_name}" "${final_initrd_name}${image_initrd_extension}" ) } build_image() { info "Build image" - info "image os: $os_name" - info "image os version: $os_version" + info "image os: $img_distro" + info "image os version: $img_os_version" sudo -E PATH="${PATH}" make image \ - DISTRO="${os_name}" \ + DISTRO="${img_distro}" \ DEBUG="${DEBUG:-}" \ USE_DOCKER="1" \ - IMG_OS_VERSION="${os_version}" \ + IMG_OS_VERSION="${img_os_version}" \ ROOTFS_BUILD_DEST="${builddir}/rootfs-image" - mv -f "kata-containers.img" "${install_dir}/${artifact_name}" + mv -f "kata-containers.img" "${install_dir}/${image_name}" if [ -e "root_hash.txt" ]; then cp root_hash.txt "${install_dir}/" fi ( cd "${install_dir}" - ln -sf 
"${artifact_name}" "${final_artifact_name}${image_initrd_extension}" + ln -sf "${image_name}" "${final_image_name}${image_initrd_extension}" ) } @@ -73,8 +74,6 @@ Usage: ${script_name} [options] Options: - --osname=${os_name} - --osversion=${os_version} --imagetype=${image_type} --prefix=${prefix} --destdir=${destdir} @@ -95,20 +94,33 @@ main() { case "$opt" in -) case "${OPTARG}" in - osname=*) - os_name=${OPTARG#*=} - ;; - osversion=*) - os_version=${OPTARG#*=} - ;; imagetype=image) image_type=image + #image information + img_distro=$(get_from_kata_deps "assets.image.architecture.${arch_target}.name") + img_os_version=$(get_from_kata_deps "assets.image.architecture.${arch_target}.version") + image_name="kata-${img_distro}-${img_os_version}.${image_type}" ;; imagetype=initrd) image_type=initrd + #initrd information + initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.name") + initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.version") + initrd_name="kata-${initrd_distro}-${initrd_os_version}.${image_type}" ;; image_initrd_suffix=*) image_initrd_suffix=${OPTARG#*=} + if [ "${image_initrd_suffix}" == "sev" ]; then + initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.sev.name") + initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.sev.version") + initrd_name="kata-${initrd_distro}-${initrd_os_version}-${image_initrd_suffix}.${image_type}" + final_initrd_name="${final_initrd_name}-${image_initrd_suffix}" + elif [ "${image_initrd_suffix}" == "tdx" ]; then + img_distro=$(get_from_kata_deps "assets.image.architecture.${arch_target}.name") + img_os_version=$(get_from_kata_deps "assets.image.architecture.${arch_target}.version") + image_name="kata-${img_distro}-${img_os_version}-${image_initrd_suffix}.${image_type}" + final_image_name="${final_image_name}-${image_initrd_suffix}" + fi ;; prefix=*) prefix=${OPTARG#*=} @@ -137,16 +149,7 @@ main() { echo "build 
${image_type}" - if [ "${image_type}" = "initrd" ]; then - final_artifact_name+="-initrd" - fi - if [ -n "${image_initrd_suffix}" ]; then - artifact_name="kata-${os_name}-${os_version}-${image_initrd_suffix}.${image_type}" - final_artifact_name+="-${image_initrd_suffix}" - else - artifact_name="kata-${os_name}-${os_version}.${image_type}" - fi install_dir="${destdir}/${prefix}/share/kata-containers/" readonly install_dir diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index 2342d4b02..61905c86c 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ -99,7 +99,6 @@ options: rootfs-image rootfs-image-tdx rootfs-initrd - rootfs-initrd-mariner rootfs-initrd-sev shim-v2 tdvf @@ -143,13 +142,8 @@ install_cached_tarball_component() { #Install guest image install_image() { - local variant="${1:-}" - - image_type="image" - if [ -n "${variant}" ]; then - image_type+="-${variant}" - fi - + local image_type="${1:-"image"}" + local initrd_suffix="${2:-""}" local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${image_type}-$(uname -m)/${cached_artifacts_path}" local component="rootfs-${image_type}" @@ -164,39 +158,25 @@ install_image() { install_cached_tarball_component \ "${component}" \ "${jenkins}" \ - "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${image_type}" \ + "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-image" \ "" \ "${final_tarball_name}" \ "${final_tarball_path}" \ && return 0 info "Create image" - - if [ -n "${variant}" ]; then - os_name="$(get_from_kata_deps "assets.image.architecture.${ARCH}.${variant}.name")" - os_version="$(get_from_kata_deps 
"assets.image.architecture.${ARCH}.${variant}.version")" - else - os_name="$(get_from_kata_deps "assets.image.architecture.${ARCH}.name")" - os_version="$(get_from_kata_deps "assets.image.architecture.${ARCH}.version")" - fi - - "${rootfs_builder}" --osname="${os_name}" --osversion="${os_version}" --imagetype=image --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${variant}" + "${rootfs_builder}" --imagetype=image --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${initrd_suffix}" } #Install guest image for tdx install_image_tdx() { - install_image "tdx" + install_image "image-tdx" "tdx" } #Install guest initrd install_initrd() { - local variant="${1:-}" - - initrd_type="initrd" - if [ -n "${variant}" ]; then - initrd_type+="-${variant}" - fi - + local initrd_type="${1:-"initrd"}" + local initrd_suffix="${2:-""}" local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${initrd_type}-$(uname -m)/${cached_artifacts_path}" local component="rootfs-${initrd_type}" @@ -218,26 +198,12 @@ install_initrd() { && return 0 info "Create initrd" - - if [ -n "${variant}" ]; then - os_name="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.${variant}.name")" - os_version="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.${variant}.version")" - else - os_name="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.name")" - os_version="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.version")" - fi - - "${rootfs_builder}" --osname="${os_name}" --osversion="${os_version}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${variant}" -} - -#Install Mariner guest initrd -install_initrd_mariner() { - install_initrd "cbl-mariner" + "${rootfs_builder}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${initrd_suffix}" } #Install guest initrd for sev install_initrd_sev() { - install_initrd "sev" + install_initrd "initrd-sev" "sev" } #Install kernel 
component helper @@ -622,7 +588,6 @@ handle_build() { install_firecracker install_image install_initrd - install_initrd_mariner install_initrd_sev install_kernel install_kernel_dragonball_experimental @@ -678,7 +643,7 @@ handle_build() { rootfs-initrd) install_initrd ;; - rootfs-initrd-mariner) install_initrd_mariner ;; + rootfs-initrd-mariner) ;; rootfs-initrd-sev) install_initrd_sev ;; @@ -724,7 +689,6 @@ main() { qemu rootfs-image rootfs-initrd - rootfs-initrd-mariner shim-v2 virtiofsd ) diff --git a/versions.yaml b/versions.yaml index 6695834ca..c26a68134 100644 --- a/versions.yaml +++ b/versions.yaml @@ -122,20 +122,17 @@ assets: url: "https://github.com/kata-containers/kata-containers/tools/osbuilder" architecture: aarch64: - name: &default-image-name "ubuntu" - version: &default-image-version "latest" + name: "ubuntu" + version: "latest" ppc64le: - name: *default-image-name - version: *default-image-version + name: "ubuntu" + version: "latest" s390x: - name: *default-image-name - version: *default-image-version + name: "ubuntu" + version: "latest" x86_64: - name: *default-image-name - version: *default-image-version - tdx: - name: *default-image-name - version: *default-image-version + name: &default-image-name "ubuntu" + version: "latest" meta: image-type: *default-image-name @@ -159,9 +156,6 @@ assets: x86_64: name: *default-initrd-name version: *default-initrd-version - cbl-mariner: - name: "cbl-mariner" - version: "2.0" sev: name: *glibc-initrd-name version: *glibc-initrd-version From 1c058d1d910a3358ca7f5277a993b684cbae217f Mon Sep 17 00:00:00 2001 From: stevenhorsman Date: Tue, 11 Jul 2023 15:08:39 +0100 Subject: [PATCH 83/87] packaging: Bump kernel version - Bump kernel version to reflect that there are changes - We've somehow gone out of sync with main, so just add a + Signed-off-by: stevenhorsman --- tools/packaging/kernel/kata_config_version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/packaging/kernel/kata_config_version 
b/tools/packaging/kernel/kata_config_version index d5e55fec1..8cccbbf4a 100644 --- a/tools/packaging/kernel/kata_config_version +++ b/tools/packaging/kernel/kata_config_version @@ -1 +1 @@ -109cc +109cc+ From 7188a60e2556c1515b80acedd8274abbd03f3e2f Mon Sep 17 00:00:00 2001 From: stevenhorsman Date: Tue, 11 Jul 2023 16:59:16 +0100 Subject: [PATCH 84/87] runtime: Fix bad merge - Fix the HotPlug type Signed-off-by: stevenhorsman --- src/runtime/pkg/katautils/config.go | 6 +----- src/runtime/virtcontainers/hypervisor.go | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index bcea3e71d..1397954aa 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -136,7 +136,6 @@ type hypervisor struct { MemSlots uint32 `toml:"memory_slots"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` - PCIeRootPort uint32 `toml:"pcie_root_port"` GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"` SEVGuestPolicy uint32 `toml:"sev_guest_policy"` SNPGuestPolicy uint64 `toml:"snp_guest_policy"` @@ -159,7 +158,7 @@ type hypervisor struct { DisableImageNvdimm bool `toml:"disable_image_nvdimm"` HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` - ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"` + ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` DisableVhostNet bool `toml:"disable_vhost_net"` GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` ConfidentialGuest bool `toml:"confidential_guest"` @@ -889,7 +888,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), - PCIeRootPort: h.PCIeRootPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, VhostUserStorePath: 
h.vhostUserStorePath(), @@ -1340,7 +1338,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, ColdPlugVFIO: defaultColdPlugVFIO, HotPlugVFIO: defaultHotPlugVFIO, - PCIeRootPort: defaultPCIeRootPort, GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect, @@ -1363,7 +1360,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { SEVGuestPolicy: defaultSEVGuestPolicy, SNPGuestPolicy: defaultSNPGuestPolicy, SEVCertChainPath: defaultSEVCertChainPath, - VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect, } } diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 299118292..f2c86c6a6 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -389,7 +389,6 @@ type HypervisorConfig struct { Gid uint32 SEVGuestPolicy uint32 SNPGuestPolicy uint64 - PCIeRootPort uint32 NumVCPUs uint32 RemoteHypervisorTimeout uint32 IOMMUPlatform bool @@ -421,7 +420,7 @@ type HypervisorConfig struct { DisableGuestSeLinux bool LegacySerial bool HotPlugVFIO config.PCIePort - ColdPlugVFIO hv.PCIePort + ColdPlugVFIO config.PCIePort VFIODevices []config.DeviceInfo VhostUserBlkDevices []config.DeviceInfo } From 15647a000e89a87acd06cff28867fee9c5d5d84f Mon Sep 17 00:00:00 2001 From: stevenhorsman Date: Tue, 11 Jul 2023 19:55:36 +0100 Subject: [PATCH 85/87] runtime: Ignore cyclomatic complexity Ignore cyclomatic complexity failure. 
I have fixed this in my PR waiting to forward port remote-hypervisor support into main Signed-off-by: stevenhorsman --- src/runtime/virtcontainers/sandbox.go | 1 + 1 file changed, 1 insertion(+) diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9ef0feb90..a0c078012 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -524,6 +524,7 @@ func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Fac return s, nil } +//nolint:gocyclo func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) { span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID}) defer span.End() From 68a364abfa28a1e8b55916f50db60cc64a1a14e5 Mon Sep 17 00:00:00 2001 From: stevenhorsman Date: Tue, 11 Jul 2023 20:12:50 +0100 Subject: [PATCH 86/87] agent: Reflect AGENT_CONFIG change AGENT_CONFIG was changed to not be a lazy type, so we need to remove the .read().await calls on it Signed-off-by: stevenhorsman --- src/agent/src/image_rpc.rs | 16 +++++++--------- src/agent/src/rpc.rs | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/agent/src/image_rpc.rs b/src/agent/src/image_rpc.rs index 571994a0d..eadeabae5 100644 --- a/src/agent/src/image_rpc.rs +++ b/src/agent/src/image_rpc.rs @@ -57,18 +57,17 @@ impl ImageService { env::set_var("CC_IMAGE_WORK_DIR", KATA_CC_IMAGE_WORK_DIR); let mut image_client = ImageClient::default(); - let image_policy_file = &AGENT_CONFIG.read().await.image_policy_file; + let image_policy_file = &AGENT_CONFIG.image_policy_file; if !image_policy_file.is_empty() { image_client.config.file_paths.sigstore_config = image_policy_file.clone(); } - let simple_signing_sigstore_config = - &AGENT_CONFIG.read().await.simple_signing_sigstore_config; + let simple_signing_sigstore_config = &AGENT_CONFIG.simple_signing_sigstore_config; if 
!simple_signing_sigstore_config.is_empty() { image_client.config.file_paths.sigstore_config = simple_signing_sigstore_config.clone(); } - let image_registry_auth_file = &AGENT_CONFIG.read().await.image_registry_auth_file; + let image_registry_auth_file = &AGENT_CONFIG.image_registry_auth_file; if !image_registry_auth_file.is_empty() { image_client.config.file_paths.auth_file = image_registry_auth_file.clone(); } @@ -159,12 +158,12 @@ impl ImageService { async fn pull_image(&self, req: &image::PullImageRequest) -> Result { env::set_var("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH); - let https_proxy = &AGENT_CONFIG.read().await.https_proxy; + let https_proxy = &AGENT_CONFIG.https_proxy; if !https_proxy.is_empty() { env::set_var("HTTPS_PROXY", https_proxy); } - let no_proxy = &AGENT_CONFIG.read().await.no_proxy; + let no_proxy = &AGENT_CONFIG.no_proxy; if !no_proxy.is_empty() { env::set_var("NO_PROXY", no_proxy); } @@ -179,7 +178,7 @@ impl ImageService { return Ok(image.to_owned()); } - let aa_kbc_params = &AGENT_CONFIG.read().await.aa_kbc_params; + let aa_kbc_params = &AGENT_CONFIG.aa_kbc_params; if !aa_kbc_params.is_empty() { match self.attestation_agent_started.compare_exchange_weak( false, @@ -200,8 +199,7 @@ impl ImageService { self.image_client.lock().await.config.auth = !aa_kbc_params.is_empty(); // Read enable signature verification from the agent config and set it in the image_client - let enable_signature_verification = - &AGENT_CONFIG.read().await.enable_signature_verification; + let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification; info!( sl!(), "enable_signature_verification set to: {}", enable_signature_verification diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index e3c38f84c..ef52a346f 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -229,7 +229,7 @@ impl AgentService { let dev_major_minor = format!("{}:{}", specdev.major, specdev.minor); if specdev.path == TRUSTED_STORAGE_DEVICE { - let 
data_integrity = AGENT_CONFIG.read().await.data_integrity; + let data_integrity = AGENT_CONFIG.data_integrity; info!( sl!(), "trusted_store device major:min {}, enable data integrity {}", From e16235584cc061ab90f8491f7b4127aaa8a29c5f Mon Sep 17 00:00:00 2001 From: stevenhorsman Date: Tue, 11 Jul 2023 20:30:47 +0100 Subject: [PATCH 87/87] agent: Update logger `sl` was switched from a macro to a function, so update the CoCo specifics uses of it Signed-off-by: stevenhorsman --- src/agent/src/image_rpc.rs | 22 ++++++++++------------ src/agent/src/rpc.rs | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/agent/src/image_rpc.rs b/src/agent/src/image_rpc.rs index eadeabae5..d9cd45b84 100644 --- a/src/agent/src/image_rpc.rs +++ b/src/agent/src/image_rpc.rs @@ -38,11 +38,9 @@ const KATA_CC_IMAGE_WORK_DIR: &str = "/run/image/"; const KATA_CC_PAUSE_BUNDLE: &str = "/pause_bundle"; const CONFIG_JSON: &str = "config.json"; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger() - }; +// Convenience function to obtain the scope logger. 
+fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "cgroups")) } pub struct ImageService { @@ -87,7 +85,7 @@ impl ImageService { return Err(anyhow!("Pause image not present in rootfs")); } - info!(sl!(), "use guest pause image cid {:?}", cid); + info!(sl(), "use guest pause image cid {:?}", cid); let pause_bundle = Path::new(CONTAINER_BASE).join(cid); let pause_rootfs = pause_bundle.join("rootfs"); let pause_config = pause_bundle.join(CONFIG_JSON); @@ -187,12 +185,12 @@ impl ImageService { Ordering::SeqCst, ) { Ok(_) => Self::init_attestation_agent()?, - Err(_) => info!(sl!(), "Attestation Agent already running"), + Err(_) => info!(sl(), "Attestation Agent already running"), } } // If the attestation-agent is being used, then enable the authenticated credentials support info!( - sl!(), + sl(), "image_client.config.auth set to: {}", !aa_kbc_params.is_empty() ); @@ -201,7 +199,7 @@ impl ImageService { // Read enable signature verification from the agent config and set it in the image_client let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification; info!( - sl!(), + sl(), "enable_signature_verification set to: {}", enable_signature_verification ); self.image_client.lock().await.config.security_validate = *enable_signature_verification; @@ -213,7 +211,7 @@ impl ImageService { let decrypt_config = format!("provider:attestation-agent:{}", aa_kbc_params); - info!(sl!(), "pull image {:?}, bundle path {:?}", cid, bundle_path); + info!(sl(), "pull image {:?}, bundle path {:?}", cid, bundle_path); // Image layers will store at KATA_CC_IMAGE_WORK_DIR, generated bundles // with rootfs and config.json will store under CONTAINER_BASE/cid. let res = self @@ -226,13 +224,13 @@ impl ImageService { match res { Ok(image) => { info!( - sl!(), + sl(), "pull and unpack image {:?}, cid: {:?}, with image-rs succeed. ", image, cid ); } Err(e) => { error!( - sl!(), + sl(), "pull and unpack image {:?}, cid: {:?}, with image-rs failed with {:?}. 
", image, cid, diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index ef52a346f..4e7429e49 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -231,7 +231,7 @@ impl AgentService { if specdev.path == TRUSTED_STORAGE_DEVICE { let data_integrity = AGENT_CONFIG.data_integrity; info!( - sl!(), + sl(), "trusted_store device major:min {}, enable data integrity {}", dev_major_minor, data_integrity.to_string() @@ -718,7 +718,7 @@ impl AgentService { .join(container_id) .join(CONFIG_JSON); debug!( - sl!(), + sl(), "Image bundle config path: {:?}", image_oci_config_path );