diff --git a/Gopkg.lock b/Gopkg.lock index b9deb339e..4f39d6879 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -412,11 +412,11 @@ revision = "2f1d1f20f75d5404f53b9edf6b53ed5505508675" [[projects]] - digest = "1:b3ab3b3615583d7a374d4803ea2b71f173a4d8f4d594210e0fc8e9fc7c5a49c9" + digest = "1:f0e85729c00a427cef8de798b33b9a35b3117ee8760f6f00fc3cf3a5bb98f844" name = "github.com/intel/govmm" packages = ["qemu"] pruneopts = "NUT" - revision = "8cba5a8e5f2816f26f9dc34b8ea968279a5a76eb" + revision = "ee21903287393441c7bb53a0b0d39b8fa4075221" [[projects]] digest = "1:6da9487ef0cc0cca3eeb1a24e3bb018ce2e07b7c2fc4d50975b54efdcc942118" diff --git a/Gopkg.toml b/Gopkg.toml index e5d65bec2..f0a2d5abc 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -48,7 +48,7 @@ [[constraint]] name = "github.com/intel/govmm" - revision = "8cba5a8e5f2816f26f9dc34b8ea968279a5a76eb" + revision = "ee21903287393441c7bb53a0b0d39b8fa4075221" [[constraint]] name = "github.com/kata-containers/agent" diff --git a/cli/config/configuration-qemu.toml.in b/cli/config/configuration-qemu.toml.in index aa7775569..d7ab5e80d 100644 --- a/cli/config/configuration-qemu.toml.in +++ b/cli/config/configuration-qemu.toml.in @@ -89,6 +89,12 @@ default_memory = @DEFMEMSZ@ # Default 0 #memory_offset = 0 +# Specifies whether virtio-mem will be enabled or not. +# Please note that this option should be used together with the command +# "echo 1 > /proc/sys/vm/overcommit_memory". +# Default false +#enable_virtio_mem = true + # Disable block device from being used for a container's rootfs. 
# In case of a storage driver like devicemapper where a container's # root file system is backed by a block device, the block device is passed diff --git a/pkg/katautils/config-settings.go b/pkg/katautils/config-settings.go index 17e6141c2..9bae10f7e 100644 --- a/pkg/katautils/config-settings.go +++ b/pkg/katautils/config-settings.go @@ -27,6 +27,7 @@ const defaultMaxVCPUCount uint32 = 0 const defaultMemSize uint32 = 2048 // MiB const defaultMemSlots uint32 = 10 const defaultMemOffset uint32 = 0 // MiB +const defaultVirtioMem bool = false const defaultBridgesCount uint32 = 1 const defaultInterNetworkingModel = "tcfilter" const defaultDisableBlockDeviceUse bool = false diff --git a/pkg/katautils/config.go b/pkg/katautils/config.go index 9badc6820..6f7c14c66 100644 --- a/pkg/katautils/config.go +++ b/pkg/katautils/config.go @@ -114,6 +114,7 @@ type hypervisor struct { DisableBlockDeviceUse bool `toml:"disable_block_device_use"` MemPrealloc bool `toml:"enable_mem_prealloc"` HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` FileBackedMemRootDir string `toml:"file_mem_backend"` Swap bool `toml:"enable_swap"` Debug bool `toml:"enable_debug"` @@ -623,6 +624,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { MemorySize: h.defaultMemSz(), MemSlots: h.defaultMemSlots(), MemOffset: h.defaultMemOffset(), + VirtioMem: h.VirtioMem, EntropySource: h.GetEntropySource(), DefaultBridges: h.defaultBridges(), DisableBlockDeviceUse: h.DisableBlockDeviceUse, @@ -773,6 +775,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { MemorySize: h.defaultMemSz(), MemSlots: h.defaultMemSlots(), MemOffset: h.defaultMemOffset(), + VirtioMem: h.VirtioMem, EntropySource: h.GetEntropySource(), DefaultBridges: h.defaultBridges(), DisableBlockDeviceUse: h.DisableBlockDeviceUse, @@ -1054,6 +1057,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { DefaultMaxVCPUs: defaultMaxVCPUCount, MemorySize: 
defaultMemSize, MemOffset: defaultMemOffset, + VirtioMem: defaultVirtioMem, DisableBlockDeviceUse: defaultDisableBlockDeviceUse, DefaultBridges: defaultBridgesCount, MemPrealloc: defaultEnableMemPrealloc, diff --git a/vendor/github.com/intel/govmm/qemu/qemu_arch_base.go b/vendor/github.com/intel/govmm/qemu/qemu_arch_base.go index f295099ce..d73c34f9a 100644 --- a/vendor/github.com/intel/govmm/qemu/qemu_arch_base.go +++ b/vendor/github.com/intel/govmm/qemu/qemu_arch_base.go @@ -1,4 +1,4 @@ -// +build !s390x,!s390x_test +// +build !s390x /* // Copyright contributors to the Virtual Machine Manager for Go project diff --git a/vendor/github.com/intel/govmm/qemu/qemus390x.go b/vendor/github.com/intel/govmm/qemu/qemu_s390x.go similarity index 99% rename from vendor/github.com/intel/govmm/qemu/qemus390x.go rename to vendor/github.com/intel/govmm/qemu/qemu_s390x.go index 0aeffceeb..6e54f9b71 100644 --- a/vendor/github.com/intel/govmm/qemu/qemus390x.go +++ b/vendor/github.com/intel/govmm/qemu/qemu_s390x.go @@ -1,4 +1,4 @@ -// +build s390x s390x_test +// +build s390x /* // Copyright contributors to the Virtual Machine Manager for Go project diff --git a/vendor/github.com/intel/govmm/qemu/qmp.go b/vendor/github.com/intel/govmm/qemu/qmp.go index 473ce4afa..00e9fed27 100644 --- a/vendor/github.com/intel/govmm/qemu/qmp.go +++ b/vendor/github.com/intel/govmm/qemu/qmp.go @@ -946,6 +946,17 @@ func (q *QMP) ExecuteBlockdevDel(ctx context.Context, blockdevID string) error { return q.executeCommand(ctx, "x-blockdev-del", args, nil) } +// ExecuteChardevDel deletes a char device by sending a chardev-remove command. +// chardevID is the id of the char device to be deleted. Typically, this will +// match the id passed to ExecuteCharDevUnixSocketAdd. It must be a valid QMP id. 
+func (q *QMP) ExecuteChardevDel(ctx context.Context, chardevID string) error { + args := map[string]interface{}{ + "id": chardevID, + } + + return q.executeCommand(ctx, "chardev-remove", args, nil) +} + // ExecuteNetdevAdd adds a Net device to a QEMU instance // using the netdev_add command. netdevID is the id of the device to add. // Must be valid QMP identifier. @@ -1124,6 +1135,27 @@ func (q *QMP) ExecutePCIDeviceAdd(ctx context.Context, blockdevID, devID, driver return q.executeCommand(ctx, "device_add", args, nil) } +// ExecutePCIVhostUserDevAdd adds a vhost-user device to a QEMU instance using the device_add command. +// This function can be used to hot plug vhost-user devices on PCI(E) bridges. +// It receives the bus and the device address on its parent bus. bus is optional. +// devID is the id of the device to add.Must be valid QMP identifier. chardevID +// is the QMP identifier of character device using a unix socket as backend. +// driver is the name of vhost-user driver, like vhost-user-blk-pci. +func (q *QMP) ExecutePCIVhostUserDevAdd(ctx context.Context, driver, devID, chardevID, addr, bus string) error { + args := map[string]interface{}{ + "driver": driver, + "id": devID, + "chardev": chardevID, + "addr": addr, + } + + if bus != "" { + args["bus"] = bus + } + + return q.executeCommand(ctx, "device_add", args, nil) +} + // ExecuteVFIODeviceAdd adds a VFIO device to a QEMU instance // using the device_add command. devID is the id of the device to add. // Must be valid QMP identifier. 
bdf is the PCI bus-device-function @@ -1347,8 +1379,8 @@ func (q *QMP) ExecQueryCpusFast(ctx context.Context) ([]CPUInfoFast, error) { return cpuInfoFast, nil } -// ExecHotplugMemory adds size of MiB memory to the guest -func (q *QMP) ExecHotplugMemory(ctx context.Context, qomtype, id, mempath string, size int, share bool) error { +// ExecMemdevAdd adds size of MiB memory device to the guest +func (q *QMP) ExecMemdevAdd(ctx context.Context, qomtype, id, mempath string, size int, share bool, driver, driverID string) error { props := map[string]interface{}{"size": uint64(size) << 20} args := map[string]interface{}{ "qom-type": qomtype, @@ -1368,17 +1400,17 @@ func (q *QMP) ExecHotplugMemory(ctx context.Context, qomtype, id, mempath string defer func() { if err != nil { - q.cfg.Logger.Errorf("Unable to hotplug memory device: %v", err) + q.cfg.Logger.Errorf("Unable to add memory device %s: %v", id, err) err = q.executeCommand(ctx, "object-del", map[string]interface{}{"id": id}, nil) if err != nil { - q.cfg.Logger.Warningf("Unable to clean up memory object: %v", err) + q.cfg.Logger.Warningf("Unable to clean up memory object %s: %v", id, err) } } }() args = map[string]interface{}{ - "driver": "pc-dimm", - "id": "dimm" + id, + "driver": driver, + "id": driverID, "memdev": id, } err = q.executeCommand(ctx, "device_add", args, nil) @@ -1386,6 +1418,11 @@ func (q *QMP) ExecHotplugMemory(ctx context.Context, qomtype, id, mempath string return err } +// ExecHotplugMemory adds size of MiB memory to the guest +func (q *QMP) ExecHotplugMemory(ctx context.Context, qomtype, id, mempath string, size int, share bool) error { + return q.ExecMemdevAdd(ctx, qomtype, id, mempath, size, share, "pc-dimm", "dimm"+id) +} + // ExecuteNVDIMMDeviceAdd adds a block device to a QEMU instance using // a NVDIMM driver with the device_add command. // id is the id of the device to add. It must be a valid QMP identifier. 
@@ -1572,3 +1609,14 @@ func (q *QMP) ExecuteQueryStatus(ctx context.Context) (StatusInfo, error) { return status, nil } + +// ExecQomSet qom-set path property value +func (q *QMP) ExecQomSet(ctx context.Context, path, property string, value uint64) error { + args := map[string]interface{}{ + "path": path, + "property": property, + "value": value, + } + + return q.executeCommand(ctx, "qom-set", args, nil) +} diff --git a/virtcontainers/hypervisor.go b/virtcontainers/hypervisor.go index e376d0ccd..20034d5aa 100644 --- a/virtcontainers/hypervisor.go +++ b/virtcontainers/hypervisor.go @@ -307,6 +307,9 @@ type HypervisorConfig struct { // VirtioFSExtraArgs passes options to virtiofsd daemon VirtioFSExtraArgs []string + // File based memory backend root directory + FileBackedMemRootDir string + // customAssets is a map of assets. // Each value in that map takes precedence over the configured assets. // For example, if there is a value for the "kernel" key in this map, @@ -341,8 +344,8 @@ type HypervisorConfig struct { // HugePages specifies if the memory should be pre-allocated from huge pages HugePages bool - // File based memory backend root directory - FileBackedMemRootDir string + // VirtioMem is used to enable/disable virtio-mem + VirtioMem bool // Realtime Used to enable/disable realtime Realtime bool diff --git a/virtcontainers/persist.go b/virtcontainers/persist.go index f964fac02..47a5c1038 100644 --- a/virtcontainers/persist.go +++ b/virtcontainers/persist.go @@ -214,6 +214,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { Msize9p: sconfig.HypervisorConfig.Msize9p, MemSlots: sconfig.HypervisorConfig.MemSlots, MemOffset: sconfig.HypervisorConfig.MemOffset, + VirtioMem: sconfig.HypervisorConfig.VirtioMem, VirtioFSCacheSize: sconfig.HypervisorConfig.VirtioFSCacheSize, KernelPath: sconfig.HypervisorConfig.KernelPath, ImagePath: sconfig.HypervisorConfig.ImagePath, @@ -499,6 +500,7 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { Msize9p: 
hconf.Msize9p, MemSlots: hconf.MemSlots, MemOffset: hconf.MemOffset, + VirtioMem: hconf.VirtioMem, VirtioFSCacheSize: hconf.VirtioFSCacheSize, KernelPath: hconf.KernelPath, ImagePath: hconf.ImagePath, diff --git a/virtcontainers/persist/api/config.go b/virtcontainers/persist/api/config.go index 50e51d9b6..41877491a 100644 --- a/virtcontainers/persist/api/config.go +++ b/virtcontainers/persist/api/config.go @@ -97,6 +97,9 @@ type HypervisorConfig struct { // VirtioFSExtraArgs passes options to virtiofsd daemon VirtioFSExtraArgs []string + // File based memory backend root directory + FileBackedMemRootDir string + // BlockDeviceCacheSet specifies cache-related options will be set to block devices or not. BlockDeviceCacheSet bool @@ -125,8 +128,8 @@ type HypervisorConfig struct { // HugePages specifies if the memory should be pre-allocated from huge pages HugePages bool - // File based memory backend root directory - FileBackedMemRootDir string + // VirtioMem is used to enable/disable virtio-mem + VirtioMem bool // Realtime Used to enable/disable realtime Realtime bool diff --git a/virtcontainers/pkg/annotations/annotations.go b/virtcontainers/pkg/annotations/annotations.go index e06b071ea..3be3aa186 100644 --- a/virtcontainers/pkg/annotations/annotations.go +++ b/virtcontainers/pkg/annotations/annotations.go @@ -124,6 +124,9 @@ const ( // MemOffset is a sandbox annotation that specifies the memory space used for nvdimm device by the hypervisor. MemOffset = kataAnnotHypervisorPrefix + "memory_offset" + // VirtioMem is a sandbox annotation that is used to enable/disable virtio-mem. + VirtioMem = kataAnnotHypervisorPrefix + "enable_virtio_mem" + // MemPrealloc is a sandbox annotation that specifies the memory space used for nvdimm device by the hypervisor. 
MemPrealloc = kataAnnotHypervisorPrefix + "enable_mem_prealloc" diff --git a/virtcontainers/pkg/oci/utils.go b/virtcontainers/pkg/oci/utils.go index 3e01bea2b..416b68ae8 100644 --- a/virtcontainers/pkg/oci/utils.go +++ b/virtcontainers/pkg/oci/utils.go @@ -492,6 +492,15 @@ func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig } } + if value, ok := ocispec.Annotations[vcAnnotations.VirtioMem]; ok { + virtioMem, err := strconv.ParseBool(value) + if err != nil { + return fmt.Errorf("Error parsing annotation for enable_virtio_mem: Please specify boolean value 'true|false'") + } + + sbConfig.HypervisorConfig.VirtioMem = virtioMem + } + if value, ok := ocispec.Annotations[vcAnnotations.MemPrealloc]; ok { memPrealloc, err := strconv.ParseBool(value) if err != nil { diff --git a/virtcontainers/pkg/oci/utils_test.go b/virtcontainers/pkg/oci/utils_test.go index 49976fcf5..792ed37cc 100644 --- a/virtcontainers/pkg/oci/utils_test.go +++ b/virtcontainers/pkg/oci/utils_test.go @@ -741,6 +741,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.DefaultMemory] = "1024" ocispec.Annotations[vcAnnotations.MemSlots] = "20" ocispec.Annotations[vcAnnotations.MemOffset] = "512" + ocispec.Annotations[vcAnnotations.VirtioMem] = "true" ocispec.Annotations[vcAnnotations.MemPrealloc] = "true" ocispec.Annotations[vcAnnotations.EnableSwap] = "true" ocispec.Annotations[vcAnnotations.FileBackedMemRootDir] = "/dev/shm" @@ -770,6 +771,7 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024)) assert.Equal(config.HypervisorConfig.MemSlots, uint32(20)) assert.Equal(config.HypervisorConfig.MemOffset, uint32(512)) + assert.Equal(config.HypervisorConfig.VirtioMem, true) assert.Equal(config.HypervisorConfig.MemPrealloc, true) assert.Equal(config.HypervisorConfig.Mlock, false) assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm") diff --git a/virtcontainers/qemu.go 
b/virtcontainers/qemu.go index 22a1d274e..7ae543a06 100644 --- a/virtcontainers/qemu.go +++ b/virtcontainers/qemu.go @@ -668,6 +668,56 @@ func (q *qemu) setupVirtiofsd() (err error) { return err } +func (q *qemu) getMemArgs() (bool, string, string) { + share := false + target := "" + memoryBack := "memory-backend-ram" + + if q.qemuConfig.Knobs.HugePages { + // we are setting all the bits that govmm sets when hugepages are enabled. + // https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677 + target = "/dev/hugepages" + memoryBack = "memory-backend-file" + share = true + } else if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" { + target = q.qemuConfig.Memory.Path + memoryBack = "memory-backend-file" + } + if q.qemuConfig.Knobs.MemShared { + share = true + } + + return share, target, memoryBack +} + +func (q *qemu) setupVirtioMem() error { + maxMem, err := q.hostMemMB() + if err != nil { + return err + } + // sizeMB is the host memory (hostMemMB) minus the configured VM memory size + sizeMB := int(maxMem) - int(q.config.MemorySize) + + share, target, memoryBack := q.getMemArgs() + err = q.qmpSetup() + if err != nil { + return err + } + err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0") + if err == nil { + q.config.VirtioMem = true + q.Logger().Infof("Setup %dMB virtio-mem-pci success", sizeMB) + } else { + help := "" + if strings.Contains(err.Error(), "Cannot allocate memory") { + help = ". Please use command \"echo 1 > /proc/sys/vm/overcommit_memory\" handle it." + } + err = fmt.Errorf("Add %dMB virtio-mem-pci fail %s%s", sizeMB, err.Error(), help) + } + + return err +} + // startSandbox will start the Sandbox's VM. 
func (q *qemu) startSandbox(timeout int) error { span, _ := q.trace("startSandbox") @@ -744,6 +794,10 @@ func (q *qemu) startSandbox(timeout int) error { } } + if q.config.VirtioMem { + err = q.setupVirtioMem() + } + return err } @@ -1449,9 +1503,6 @@ func (q *qemu) hotplugMemory(memDev *memoryDevice, op operation) (int, error) { func (q *qemu) hotplugAddMemory(memDev *memoryDevice) (int, error) { memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx) - share := false - target := "" - memoryBack := "memory-backend-ram" if err != nil { return 0, fmt.Errorf("failed to query memory devices: %v", err) } @@ -1465,19 +1516,8 @@ func (q *qemu) hotplugAddMemory(memDev *memoryDevice) (int, error) { } memDev.slot = maxSlot + 1 } - if q.qemuConfig.Knobs.HugePages { - // we are setting all the bits that govmm sets when hugepages are enabled. - // https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677 - target = "/dev/hugepages" - memoryBack = "memory-backend-file" - share = true - } else if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" { - target = q.qemuConfig.Memory.Path - memoryBack = "memory-backend-file" - } - if q.qemuConfig.Knobs.MemShared { - share = true - } + + share, target, memoryBack := q.getMemArgs() err = q.qmpMonitorCh.qmp.ExecHotplugMemory(q.qmpMonitorCh.ctx, memoryBack, "mem"+strconv.Itoa(memDev.slot), target, memDev.sizeMB, share) if err != nil { q.Logger().WithError(err).Error("hotplug memory") @@ -1661,6 +1701,17 @@ func (q *qemu) resizeMemory(reqMemMB uint32, memoryBlockSizeMB uint32, probe boo return 0, memoryDevice{}, err } var addMemDevice memoryDevice + if q.config.VirtioMem && currentMemory != reqMemMB { + q.Logger().WithField("hotplug", "memory").Debugf("resize memory from %dMB to %dMB", currentMemory, reqMemMB) + sizeByte := (reqMemMB - q.config.MemorySize) * 1024 * 1024 + err = q.qmpMonitorCh.qmp.ExecQomSet(q.qmpMonitorCh.ctx, "virtiomem0", "requested-size", uint64(sizeByte)) + if 
err != nil { + return 0, memoryDevice{}, err + } + q.state.HotpluggedMemory = int(sizeByte / 1024 / 1024) + return reqMemMB, memoryDevice{}, nil + } + switch { case currentMemory < reqMemMB: //hotplug