mirror of
https://github.com/aljazceru/kata-containers.git
synced 2025-12-18 23:04:20 +01:00
Add a new hypervisor capability that tells whether the hypervisor supports device hotplug. If it does not, we should run the prestart hooks before starting new VMs, because nerdctl uses the prestart hooks to set up the netns. To make nerdctl + FC work, we need to run the prestart hooks before starting new VMs. Fixes: #6384 Signed-off-by: Peng Tao <bergwolf@hyper.sh>
319 lines
7.9 KiB
Go
319 lines
7.9 KiB
Go
//go:build linux
|
|
|
|
// Copyright (c) 2018 Intel Corporation
|
|
//
|
|
// SPDX-License-Identifier: Apache-2.0
|
|
//
|
|
|
|
package virtcontainers
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
|
"github.com/sirupsen/logrus"
|
|
|
|
"github.com/intel-go/cpuid"
|
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
|
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
|
|
)
|
|
|
|
type qemuAmd64 struct {
|
|
// inherit from qemuArchBase, overwrite methods if needed
|
|
qemuArchBase
|
|
|
|
snpGuest bool
|
|
|
|
vmFactory bool
|
|
|
|
devLoadersCount uint32
|
|
|
|
sgxEPCSize int64
|
|
}
|
|
|
|
const (
|
|
defaultQemuPath = "/usr/bin/qemu-system-x86_64"
|
|
|
|
defaultQemuMachineType = QemuQ35
|
|
|
|
defaultQemuMachineOptions = "accel=kvm"
|
|
|
|
splitIrqChipMachineOptions = "accel=kvm,kernel_irqchip=split"
|
|
|
|
qmpMigrationWaitTimeout = 5 * time.Second
|
|
)
|
|
|
|
var kernelParams = []Param{
|
|
{"tsc", "reliable"},
|
|
{"no_timer_check", ""},
|
|
{"rcupdate.rcu_expedited", "1"},
|
|
{"i8042.direct", "1"},
|
|
{"i8042.dumbkbd", "1"},
|
|
{"i8042.nopnp", "1"},
|
|
{"i8042.noaux", "1"},
|
|
{"noreplace-smp", ""},
|
|
{"reboot", "k"},
|
|
{"cryptomgr.notests", ""},
|
|
{"net.ifnames", "0"},
|
|
{"pci", "lastbus=0"},
|
|
}
|
|
|
|
var supportedQemuMachines = []govmmQemu.Machine{
|
|
{
|
|
Type: QemuQ35,
|
|
Options: defaultQemuMachineOptions,
|
|
},
|
|
{
|
|
Type: QemuVirt,
|
|
Options: defaultQemuMachineOptions,
|
|
},
|
|
{
|
|
Type: QemuMicrovm,
|
|
Options: defaultQemuMachineOptions,
|
|
},
|
|
}
|
|
|
|
func newQemuArch(config HypervisorConfig) (qemuArch, error) {
|
|
machineType := config.HypervisorMachineType
|
|
if machineType == "" {
|
|
machineType = defaultQemuMachineType
|
|
}
|
|
|
|
var mp *govmmQemu.Machine
|
|
for _, m := range supportedQemuMachines {
|
|
if m.Type == machineType {
|
|
mp = &m
|
|
break
|
|
}
|
|
}
|
|
if mp == nil {
|
|
return nil, fmt.Errorf("unrecognised machinetype: %v", machineType)
|
|
}
|
|
|
|
factory := false
|
|
if config.BootToBeTemplate || config.BootFromTemplate {
|
|
factory = true
|
|
}
|
|
|
|
// IOMMU and Guest Protection require a split IRQ controller for handling interrupts
|
|
// otherwise QEMU won't be able to create the kernel irqchip
|
|
if config.IOMMU || config.ConfidentialGuest {
|
|
mp.Options = splitIrqChipMachineOptions
|
|
}
|
|
|
|
if config.IOMMU {
|
|
kernelParams = append(kernelParams,
|
|
Param{"intel_iommu", "on"})
|
|
kernelParams = append(kernelParams,
|
|
Param{"iommu", "pt"})
|
|
}
|
|
|
|
q := &qemuAmd64{
|
|
qemuArchBase: qemuArchBase{
|
|
qemuMachine: *mp,
|
|
qemuExePath: defaultQemuPath,
|
|
memoryOffset: config.MemOffset,
|
|
kernelParamsNonDebug: kernelParamsNonDebug,
|
|
kernelParamsDebug: kernelParamsDebug,
|
|
kernelParams: kernelParams,
|
|
disableNvdimm: config.DisableImageNvdimm,
|
|
dax: true,
|
|
protection: noneProtection,
|
|
legacySerial: config.LegacySerial,
|
|
},
|
|
vmFactory: factory,
|
|
snpGuest: config.ConfidentialGuest,
|
|
}
|
|
|
|
if config.ConfidentialGuest {
|
|
if err := q.enableProtection(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if !q.qemuArchBase.disableNvdimm {
|
|
hvLogger.WithField("subsystem", "qemuAmd64").Warn("Nvdimm is not supported with confidential guest, disabling it.")
|
|
q.qemuArchBase.disableNvdimm = true
|
|
}
|
|
}
|
|
|
|
if config.SGXEPCSize != 0 {
|
|
q.sgxEPCSize = config.SGXEPCSize
|
|
if q.qemuMachine.Options != "" {
|
|
q.qemuMachine.Options += ","
|
|
}
|
|
// qemu sandboxes will only support one EPC per sandbox
|
|
// this is because there is only one annotation (sgx.intel.com/epc)
|
|
// to specify the size of the EPC.
|
|
q.qemuMachine.Options += "sgx-epc.0.memdev=epc0,sgx-epc.0.node=0"
|
|
}
|
|
|
|
if err := q.handleImagePath(config); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return q, nil
|
|
}
|
|
|
|
func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities {
|
|
var caps types.Capabilities
|
|
|
|
if q.qemuMachine.Type == QemuQ35 ||
|
|
q.qemuMachine.Type == QemuVirt {
|
|
caps.SetBlockDeviceHotplugSupport()
|
|
caps.SetNetworkDeviceHotplugSupported()
|
|
}
|
|
|
|
caps.SetMultiQueueSupport()
|
|
if hConfig.SharedFS != config.NoSharedFS {
|
|
caps.SetFsSharingSupport()
|
|
}
|
|
|
|
return caps
|
|
}
|
|
|
|
func (q *qemuAmd64) bridges(number uint32) {
|
|
q.Bridges = genericBridges(number, q.qemuMachine.Type)
|
|
}
|
|
|
|
func (q *qemuAmd64) cpuModel() string {
|
|
var err error
|
|
cpuModel := defaultCPUModel
|
|
|
|
// Temporary until QEMU cpu model 'host' supports AMD SEV-SNP
|
|
protection, err := availableGuestProtection()
|
|
if err == nil {
|
|
if protection == snpProtection && q.snpGuest {
|
|
cpuModel = "EPYC-v4"
|
|
}
|
|
}
|
|
|
|
return cpuModel
|
|
}
|
|
|
|
func (q *qemuAmd64) memoryTopology(memoryMb, hostMemoryMb uint64, slots uint8) govmmQemu.Memory {
|
|
return genericMemoryTopology(memoryMb, hostMemoryMb, slots, q.memoryOffset)
|
|
}
|
|
|
|
// Is Memory Hotplug supported by this architecture/machine type combination?
|
|
func (q *qemuAmd64) supportGuestMemoryHotplug() bool {
|
|
// true for all amd64 machine types except for microvm.
|
|
if q.qemuMachine.Type == govmmQemu.MachineTypeMicrovm {
|
|
return false
|
|
}
|
|
|
|
return q.protection == noneProtection
|
|
}
|
|
|
|
func (q *qemuAmd64) appendImage(ctx context.Context, devices []govmmQemu.Device, path string) ([]govmmQemu.Device, error) {
|
|
if !q.disableNvdimm {
|
|
return q.appendNvdimmImage(devices, path)
|
|
}
|
|
return q.appendBlockImage(ctx, devices, path)
|
|
}
|
|
|
|
// enable protection
|
|
func (q *qemuAmd64) enableProtection() error {
|
|
var err error
|
|
q.protection, err = availableGuestProtection()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Configure SNP only if specified in config
|
|
if q.protection == snpProtection && !q.snpGuest {
|
|
q.protection = sevProtection
|
|
}
|
|
|
|
logger := hvLogger.WithFields(logrus.Fields{
|
|
"subsystem": "qemuAmd64",
|
|
"machine": q.qemuMachine,
|
|
"kernel-params-debug": q.kernelParamsDebug,
|
|
"kernel-params-non-debug": q.kernelParamsNonDebug,
|
|
"kernel-params": q.kernelParams})
|
|
|
|
switch q.protection {
|
|
case tdxProtection:
|
|
if q.qemuMachine.Options != "" {
|
|
q.qemuMachine.Options += ","
|
|
}
|
|
q.qemuMachine.Options += "confidential-guest-support=tdx"
|
|
logger.Info("Enabling TDX guest protection")
|
|
return nil
|
|
case sevProtection:
|
|
if q.qemuMachine.Options != "" {
|
|
q.qemuMachine.Options += ","
|
|
}
|
|
q.qemuMachine.Options += "confidential-guest-support=sev"
|
|
logger.Info("Enabling SEV guest protection")
|
|
return nil
|
|
case snpProtection:
|
|
if q.qemuMachine.Options != "" {
|
|
q.qemuMachine.Options += ","
|
|
}
|
|
q.qemuMachine.Options += "confidential-guest-support=snp"
|
|
logger.Info("Enabling SNP guest protection")
|
|
return nil
|
|
|
|
// TODO: Add support for other x86_64 technologies
|
|
|
|
default:
|
|
return fmt.Errorf("This system doesn't support Confidential Computing (Guest Protection)")
|
|
}
|
|
}
|
|
|
|
// append protection device
|
|
func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware, firmwareVolume string) ([]govmmQemu.Device, string, error) {
|
|
if q.sgxEPCSize != 0 {
|
|
devices = append(devices,
|
|
govmmQemu.Object{
|
|
Type: govmmQemu.MemoryBackendEPC,
|
|
ID: "epc0",
|
|
Prealloc: true,
|
|
Size: uint64(q.sgxEPCSize),
|
|
})
|
|
}
|
|
|
|
switch q.protection {
|
|
case tdxProtection:
|
|
id := q.devLoadersCount
|
|
q.devLoadersCount += 1
|
|
return append(devices,
|
|
govmmQemu.Object{
|
|
Driver: govmmQemu.Loader,
|
|
Type: govmmQemu.TDXGuest,
|
|
ID: "tdx",
|
|
DeviceID: fmt.Sprintf("fd%d", id),
|
|
Debug: false,
|
|
File: firmware,
|
|
FirmwareVolume: firmwareVolume,
|
|
}), "", nil
|
|
case sevProtection:
|
|
return append(devices,
|
|
govmmQemu.Object{
|
|
Type: govmmQemu.SEVGuest,
|
|
ID: "sev",
|
|
Debug: false,
|
|
File: firmware,
|
|
CBitPos: cpuid.AMDMemEncrypt.CBitPosition,
|
|
ReducedPhysBits: 1,
|
|
}), "", nil
|
|
case snpProtection:
|
|
return append(devices,
|
|
govmmQemu.Object{
|
|
Type: govmmQemu.SNPGuest,
|
|
ID: "snp",
|
|
Debug: false,
|
|
File: firmware,
|
|
CBitPos: cpuid.AMDMemEncrypt.CBitPosition,
|
|
ReducedPhysBits: 1,
|
|
}), "", nil
|
|
case noneProtection:
|
|
|
|
return devices, firmware, nil
|
|
|
|
default:
|
|
return devices, "", fmt.Errorf("Unsupported guest protection technology: %v", q.protection)
|
|
}
|
|
}
|