Files
kata-containers/src/runtime/virtcontainers/qemu_amd64.go
Peng Tao 32fd013716 runtime: run prestart hooks before starting VM for FC
Add a new hypervisor capability to tell if it supports device hotplug.
If not, we should run prestart hooks before starting new VMs as nerdctl
is using the prestart hooks to set up netns. To make nerdctl + FC
work, we need to run the prestart hooks before starting new VMs.

Fixes: #6384
Signed-off-by: Peng Tao <bergwolf@hyper.sh>
2023-08-30 02:52:01 +00:00

319 lines
7.9 KiB
Go

//go:build linux
// Copyright (c) 2018 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package virtcontainers
import (
"context"
"fmt"
"time"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/sirupsen/logrus"
"github.com/intel-go/cpuid"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)
// qemuAmd64 holds the amd64-specific QEMU architecture state on top of the
// generic qemuArchBase.
type qemuAmd64 struct {
// inherit from qemuArchBase, overwrite methods if needed
qemuArchBase
// snpGuest is set by newQemuArch from HypervisorConfig.ConfidentialGuest.
// SNP protection and the SNP-specific CPU model are only selected when it
// is true.
snpGuest bool
// vmFactory is true when the config asks to boot to or from a template.
vmFactory bool
// devLoadersCount supplies the numeric suffix of the "fd%d" DeviceID given
// to each TDX loader object; it is incremented for every one created.
devLoadersCount uint32
// sgxEPCSize is copied from HypervisorConfig.SGXEPCSize; when non-zero an
// EPC memory backend of this size is added to the device list.
sgxEPCSize int64
}
// Defaults used when building the amd64 QEMU configuration.
const (
// defaultQemuPath is the QEMU binary path stored in qemuArchBase.qemuExePath.
defaultQemuPath = "/usr/bin/qemu-system-x86_64"
// defaultQemuMachineType is used when the config does not name a machine type.
defaultQemuMachineType = QemuQ35
// defaultQemuMachineOptions are the machine options every supported machine starts with.
defaultQemuMachineOptions = "accel=kvm"
// splitIrqChipMachineOptions replace the default options when IOMMU or a
// confidential guest is requested.
splitIrqChipMachineOptions = "accel=kvm,kernel_irqchip=split"
// qmpMigrationWaitTimeout is not referenced in this part of the file;
// presumably it bounds how long a QMP migration is waited for — confirm at the use site.
qmpMigrationWaitTimeout = 5 * time.Second
)
// kernelParams is the base set of guest kernel parameters for amd64.
// newQemuArch starts from this list and adds the IOMMU parameters when
// HypervisorConfig.IOMMU is enabled.
var kernelParams = []Param{
{"tsc", "reliable"},
{"no_timer_check", ""},
{"rcupdate.rcu_expedited", "1"},
{"i8042.direct", "1"},
{"i8042.dumbkbd", "1"},
{"i8042.nopnp", "1"},
{"i8042.noaux", "1"},
{"noreplace-smp", ""},
{"reboot", "k"},
{"cryptomgr.notests", ""},
{"net.ifnames", "0"},
{"pci", "lastbus=0"},
}
// supportedQemuMachines lists the machine types newQemuArch accepts; any
// other requested machine type is rejected as unrecognised. Every entry
// starts out with defaultQemuMachineOptions.
var supportedQemuMachines = []govmmQemu.Machine{
{
Type: QemuQ35,
Options: defaultQemuMachineOptions,
},
{
Type: QemuVirt,
Options: defaultQemuMachineOptions,
},
{
Type: QemuMicrovm,
Options: defaultQemuMachineOptions,
},
}
// newQemuArch builds the amd64 qemuArch implementation described by config.
//
// The machine type defaults to defaultQemuMachineType when
// config.HypervisorMachineType is empty and must match an entry of
// supportedQemuMachines. An error is returned when the machine type is
// unrecognised, when guest protection cannot be enabled for a confidential
// guest, or when handleImagePath fails.
//
// The package-level tables (supportedQemuMachines, kernelParams) are only
// read: every call works on its own copies so that options selected for one
// instance never leak into another.
func newQemuArch(config HypervisorConfig) (qemuArch, error) {
	machineType := config.HypervisorMachineType
	if machineType == "" {
		machineType = defaultQemuMachineType
	}

	// Copy the matching entry by value so later option changes stay local.
	var (
		machine govmmQemu.Machine
		found   bool
	)
	for _, m := range supportedQemuMachines {
		if m.Type == machineType {
			machine, found = m, true
			break
		}
	}
	if !found {
		return nil, fmt.Errorf("unrecognised machinetype: %v", machineType)
	}

	factory := config.BootToBeTemplate || config.BootFromTemplate

	// IOMMU and Guest Protection require a split IRQ controller for handling interrupts
	// otherwise QEMU won't be able to create the kernel irqchip
	if config.IOMMU || config.ConfidentialGuest {
		machine.Options = splitIrqChipMachineOptions
	}

	// Build a private copy of the base kernel parameters. Appending to the
	// package-level slice would make the IOMMU parameters accumulate across
	// calls and show up for instances that never asked for IOMMU.
	params := make([]Param, 0, len(kernelParams)+2)
	params = append(params, kernelParams...)
	if config.IOMMU {
		params = append(params,
			Param{"intel_iommu", "on"},
			Param{"iommu", "pt"})
	}

	q := &qemuAmd64{
		qemuArchBase: qemuArchBase{
			qemuMachine:          machine,
			qemuExePath:          defaultQemuPath,
			memoryOffset:         config.MemOffset,
			kernelParamsNonDebug: kernelParamsNonDebug,
			kernelParamsDebug:    kernelParamsDebug,
			kernelParams:         params,
			disableNvdimm:        config.DisableImageNvdimm,
			dax:                  true,
			protection:           noneProtection,
			legacySerial:         config.LegacySerial,
		},
		vmFactory: factory,
		snpGuest:  config.ConfidentialGuest,
	}

	if config.ConfidentialGuest {
		if err := q.enableProtection(); err != nil {
			return nil, err
		}

		if !q.qemuArchBase.disableNvdimm {
			hvLogger.WithField("subsystem", "qemuAmd64").Warn("Nvdimm is not supported with confidential guest, disabling it.")
			q.qemuArchBase.disableNvdimm = true
		}
	}

	if config.SGXEPCSize != 0 {
		q.sgxEPCSize = config.SGXEPCSize
		if q.qemuMachine.Options != "" {
			q.qemuMachine.Options += ","
		}
		// qemu sandboxes will only support one EPC per sandbox
		// this is because there is only one annotation (sgx.intel.com/epc)
		// to specify the size of the EPC.
		q.qemuMachine.Options += "sgx-epc.0.memdev=epc0,sgx-epc.0.node=0"
	}

	if err := q.handleImagePath(config); err != nil {
		return nil, err
	}

	return q, nil
}
// capabilities returns the capability set for this instance: block and
// network device hotplug for the q35 and virt machine types, multi-queue
// always, and filesystem sharing unless the config selects no shared FS.
func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities {
	var result types.Capabilities

	switch q.qemuMachine.Type {
	case QemuQ35, QemuVirt:
		result.SetBlockDeviceHotplugSupport()
		result.SetNetworkDeviceHotplugSupported()
	}

	result.SetMultiQueueSupport()

	sharingEnabled := hConfig.SharedFS != config.NoSharedFS
	if sharingEnabled {
		result.SetFsSharingSupport()
	}

	return result
}
// bridges stores in q.Bridges the result of genericBridges for the requested
// number and the configured machine type.
func (q *qemuAmd64) bridges(number uint32) {
	machineType := q.qemuMachine.Type
	q.Bridges = genericBridges(number, machineType)
}
// cpuModel returns the CPU model name to use: defaultCPUModel, unless the
// host reports SNP protection and this instance was configured as an SNP
// guest, in which case "EPYC-v4" is returned. A failure to detect the
// available protection is ignored and the default model is kept.
func (q *qemuAmd64) cpuModel() string {
	model := defaultCPUModel

	// Temporary until QEMU cpu model 'host' supports AMD SEV-SNP
	detected, err := availableGuestProtection()
	if err == nil && detected == snpProtection && q.snpGuest {
		model = "EPYC-v4"
	}

	return model
}
// memoryTopology delegates to genericMemoryTopology, passing along the
// instance's configured memory offset.
func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
	topology := genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
	return topology
}
// supportGuestMemoryHotplug reports whether guest memory hotplug is available
// for this architecture/machine type combination. It is unavailable on the
// microvm machine type and whenever any guest protection is active.
func (q *qemuAmd64) supportGuestMemoryHotplug() bool {
	isMicrovm := q.qemuMachine.Type == govmmQemu.MachineTypeMicrovm
	return !isMicrovm && q.protection == noneProtection
}
// appendImage adds the guest image at path to devices. The image is attached
// through appendBlockImage when nvdimm is disabled and through
// appendNvdimmImage otherwise.
func (q *qemuAmd64) appendImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) {
	if q.disableNvdimm {
		return q.appendBlockImage(ctx, devices, path)
	}
	return q.appendNvdimmImage(devices, path)
}
// enableProtection detects the guest protection available on the host,
// records it in q.protection and appends the matching
// confidential-guest-support entry to the machine options. SNP is downgraded
// to SEV unless the instance was configured as an SNP guest. An error is
// returned when detection fails or no supported protection is available.
func (q *qemuAmd64) enableProtection() error {
	detected, err := availableGuestProtection()
	q.protection = detected
	if err != nil {
		return err
	}

	// Configure SNP only if specified in config
	if q.protection == snpProtection && !q.snpGuest {
		q.protection = sevProtection
	}

	// The logger is built before the machine options are touched, so the
	// logged "machine" field shows the options as they were on entry.
	logger := hvLogger.WithFields(logrus.Fields{
		"subsystem":               "qemuAmd64",
		"machine":                 q.qemuMachine,
		"kernel-params-debug":     q.kernelParamsDebug,
		"kernel-params-non-debug": q.kernelParamsNonDebug,
		"kernel-params":           q.kernelParams})

	var machineOption, message string
	switch q.protection {
	case tdxProtection:
		machineOption = "confidential-guest-support=tdx"
		message = "Enabling TDX guest protection"
	case sevProtection:
		machineOption = "confidential-guest-support=sev"
		message = "Enabling SEV guest protection"
	case snpProtection:
		machineOption = "confidential-guest-support=snp"
		message = "Enabling SNP guest protection"
	// TODO: Add support for other x86_64 technologies
	default:
		return fmt.Errorf("This system doesn't support Confidential Computing (Guest Protection)")
	}

	if q.qemuMachine.Options != "" {
		q.qemuMachine.Options += ","
	}
	q.qemuMachine.Options += machineOption
	logger.Info(message)

	return nil
}
// appendProtectionDevice appends the objects required by the active guest
// protection to devices and returns the extended list together with the
// firmware path the caller should still use.
//
// An EPC memory backend is added first whenever an SGX EPC size is
// configured. For TDX, SEV and SNP the firmware is carried by the appended
// object and the returned firmware string is empty; with no protection the
// firmware argument is returned unchanged. Any other protection value yields
// an error.
func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware, firmwareVolume string) ([]govmmQemu.Device, string, error) {
	if q.sgxEPCSize != 0 {
		epc := govmmQemu.Object{
			Type:     govmmQemu.MemoryBackendEPC,
			ID:       "epc0",
			Prealloc: true,
			Size:     uint64(q.sgxEPCSize),
		}
		devices = append(devices, epc)
	}

	var guest govmmQemu.Object
	switch q.protection {
	case noneProtection:
		return devices, firmware, nil
	case tdxProtection:
		// Each TDX loader takes the next sequential "fd<N>" device ID.
		loaderID := q.devLoadersCount
		q.devLoadersCount++
		guest = govmmQemu.Object{
			Driver:         govmmQemu.Loader,
			Type:           govmmQemu.TDXGuest,
			ID:             "tdx",
			DeviceID:       fmt.Sprintf("fd%d", loaderID),
			Debug:          false,
			File:           firmware,
			FirmwareVolume: firmwareVolume,
		}
	case sevProtection:
		guest = govmmQemu.Object{
			Type:            govmmQemu.SEVGuest,
			ID:              "sev",
			Debug:           false,
			File:            firmware,
			CBitPos:         cpuid.AMDMemEncrypt.CBitPosition,
			ReducedPhysBits: 1,
		}
	case snpProtection:
		guest = govmmQemu.Object{
			Type:            govmmQemu.SNPGuest,
			ID:              "snp",
			Debug:           false,
			File:            firmware,
			CBitPos:         cpuid.AMDMemEncrypt.CBitPosition,
			ReducedPhysBits: 1,
		}
	default:
		return devices, "", fmt.Errorf("Unsupported guest protection technology: %v", q.protection)
	}

	return append(devices, guest), "", nil
}