Merge pull request #7300 from stevenhorsman/CCv0-merge-10th-july

CCv0: Merge main into CCv0 branch
Wainer Moschetta
2023-07-18 09:42:43 -03:00
committed by GitHub
113 changed files with 4281 additions and 1116 deletions


@@ -13,14 +13,15 @@ on:
required: false
type: string
default: no
commit-hash:
required: false
type: string
jobs:
build-asset:
runs-on: ubuntu-latest
strategy:
matrix:
stage:
- ${{ inputs.stage }}
asset:
- cloud-hypervisor
- cloud-hypervisor-glibc
@@ -46,9 +47,11 @@ jobs:
- shim-v2
- tdvf
- virtiofsd
stage:
- ${{ inputs.stage }}
exclude:
- stage: release
asset: cloud-hypervisor-glibc
- asset: cloud-hypervisor-glibc
stage: release
steps:
- name: Login to Kata Containers quay.io
if: ${{ inputs.push-to-registry == 'yes' }}
@@ -60,7 +63,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
@@ -88,7 +91,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:


@@ -9,6 +9,9 @@ on:
required: false
type: string
default: no
commit-hash:
required: false
type: string
jobs:
build-asset:
@@ -41,7 +44,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
run: |
@@ -72,7 +75,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:


@@ -9,6 +9,9 @@ on:
required: false
type: string
default: no
commit-hash:
required: false
type: string
jobs:
build-asset:
@@ -37,7 +40,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
run: |
@@ -69,7 +72,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:

.github/workflows/ci-nightly.yaml (new file, 14 lines)

@@ -0,0 +1,14 @@
name: Kata Containers Nightly CI
on:
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
jobs:
kata-containers-ci-on-push:
uses: ./.github/workflows/ci.yaml
with:
commit-hash: ${{ github.sha }}
pr-number: "nightly"
tag: ${{ github.sha }}-nightly
secrets: inherit


@@ -12,65 +12,14 @@ on:
- synchronize
- reopened
- labeled
paths-ignore:
- 'docs/**'
jobs:
build-kata-static-tarball-amd64:
kata-containers-ci-on-push:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
uses: ./.github/workflows/ci.yaml
with:
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
publish-kata-deploy-payload-amd64:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
commit-hash: ${{ github.event.pull_request.head.sha }}
pr-number: ${{ github.event.pull_request.number }}
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}
secrets: inherit
run-k8s-tests-on-aks:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
secrets: inherit
run-k8s-tests-on-sev:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-k8s-tests-on-snp:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-k8s-tests-on-tdx:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-metrics-tests:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-metrics.yaml
with:
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}

.github/workflows/ci.yaml (new file, 76 lines)

@@ -0,0 +1,76 @@
name: Run the Kata Containers CI
on:
workflow_call:
inputs:
commit-hash:
required: true
type: string
pr-number:
required: true
type: string
tag:
required: true
type: string
jobs:
build-kata-static-tarball-amd64:
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
with:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}
publish-kata-deploy-payload-amd64:
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
tarball-suffix: -${{ inputs.tag }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
secrets: inherit
run-k8s-tests-on-aks:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
secrets: inherit
run-k8s-tests-on-sev:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
run-k8s-tests-on-snp:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
run-k8s-tests-on-tdx:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
run-metrics-tests:
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-metrics.yaml
with:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}


@@ -9,18 +9,21 @@ jobs:
build-assets-amd64:
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
with:
commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
build-assets-arm64:
uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml
with:
commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
build-assets-s390x:
uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml
with:
commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
@@ -28,6 +31,7 @@ jobs:
needs: build-assets-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-amd64
@@ -37,6 +41,7 @@ jobs:
needs: build-assets-arm64
uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml
with:
commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-arm64
@@ -46,6 +51,7 @@ jobs:
needs: build-assets-s390x
uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml
with:
commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-s390x


@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
kata-payload:
@@ -21,7 +24,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3


@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
kata-payload:
@@ -25,7 +28,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3


@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
kata-payload:
@@ -25,7 +28,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3


@@ -72,8 +72,7 @@ jobs:
- uses: actions/checkout@v3
- name: install hub
run: |
HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//')
wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \
wget -q -O- https://github.com/mislav/hub/releases/download/v2.14.2/hub-linux-amd64-2.14.2.tgz | \
tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub
- name: download-artifacts-amd64


@@ -11,6 +11,12 @@ on:
tag:
required: true
type: string
pr-number:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -31,13 +37,13 @@ jobs:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ github.event.pull_request.number }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HOST_OS: ${{ matrix.host_os }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Download Azure CLI
run: bash tests/integration/gha-run.sh install-azure-cli


@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30


@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30


@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30


@@ -5,16 +5,25 @@ on:
tarball-suffix:
required: false
type: string
commit-hash:
required: false
type: string
jobs:
run-metrics:
strategy:
fail-fast: true
matrix:
vmm: ['clh', 'qemu']
max-parallel: 1
runs-on: metrics
env:
GOPATH: ${{ github.workspace }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3
@@ -25,8 +34,25 @@ jobs:
- name: Install kata
run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
- name: run launch times on qemu
run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu
- name: run launch times test
run: bash tests/metrics/gha-run.sh run-test-launchtimes
- name: run launch times on clh
run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh
- name: run memory foot print test
run: bash tests/metrics/gha-run.sh run-test-memory-usage
- name: run memory usage inside container test
run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container
- name: run blogbench test
run: bash tests/metrics/gha-run.sh run-test-blogbench
- name: make metrics tarball ${{ matrix.vmm }}
run: bash tests/metrics/gha-run.sh make-tarball-results
- name: archive metrics results ${{ matrix.vmm }}
uses: actions/upload-artifact@v3
with:
name: metrics-artifacts-${{ matrix.vmm }}
path: results-${{ matrix.vmm }}.tar.gz
retention-days: 1
if-no-files-found: error


@@ -23,7 +23,7 @@ jobs:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |
./ci/install_rust.sh
PATH=$PATH:"$HOME/.cargo/bin"
echo PATH="$HOME/.cargo/bin:$PATH" >> $GITHUB_ENV
- name: Run Unit Test
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |


@@ -1,6 +1,6 @@
<img src="https://object-storage-ca-ymq-1.vexxhost.net/swift/v1/6e4619c416ff4bd19e1c087f27a43eea/www-images-prod/openstack-logo/kata/SVG/kata-1.svg" width="900">
[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml)
[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) [![Kata Containers Nightly CI](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml)
# Kata Containers


@@ -17,6 +17,7 @@ Kata Containers design documents:
- [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
- [Design for direct-assigned volume](direct-blk-device-assignment.md)
- [Design for core-scheduling](core-scheduling.md)
- [Virtualization Reference Architecture](kata-vra.md)
---
- [Design proposals](proposals)

docs/design/kata-vra.md (new file, 434 lines)

@@ -0,0 +1,434 @@
# Virtualization Reference Architecture
## Subject to Change | © 2022 by NVIDIA Corporation. All rights reserved. | _For test and development only_
Before digging deeper into the virtualization reference architecture, let's
first look at the various GPUDirect use cases in the following table. We're
distinguishing between two top-tier use cases where the devices are (1)
passthrough and (2) virtualized, where a VM gets assigned a virtual function
(VF) and not the physical function (PF). A combination of PF and VF would also
be possible.
| Device #1  (passthrough) | Device #2 (passthrough) | P2P Compatibility and Mode |
| ------------------------- | ----------------------- | -------------------------------------------- |
| GPU PF | GPU PF | GPUDirect P2P  |
| GPU PF | NIC PF | GPUDirect RDMA |
| MIG-slice | MIG-slice | _No GPUDirect P2P_ |
| MIG-slice | NIC PF | GPUDirect RDMA |
| **Device #1 (virtualized)** | **Device #2 (virtualized)** | **P2P Compatibility and Mode** |
| Time-slice vGPU VF | Time-slice vGPU VF | _No GPUDirect P2P  but NVLINK P2P available_ |
| Time-slice vGPU VF | NIC VF | GPUDirect RDMA |
| MIG-slice vGPU | MIG-slice vGPU | _No GPUDirect P2P_ |
| MIG-slice vGPU | NIC VF | GPUDirect RDMA |
In a virtualized environment we have several distinct features that may prevent
Peer-to-peer (P2P) communication of two endpoints in a PCI Express topology. The
IOMMU translates IO virtual addresses (IOVA) to physical addresses (PA). Each
device behind an IOMMU has its own IOVA memory space, usually, no two devices
share the same IOVA memory space, but it is up to the hypervisor or OS how it
chooses to map devices to IOVA spaces.  Any PCI Express DMA transactions will
use IOVAs, which the IOMMU must translate. By default, all the traffic is routed
to the root complex and not issued directly to the peer device.
An IOMMU can be used to isolate and protect devices even if virtualization is
not used; since devices can only access memory regions that are mapped for it, a
DMA from one device to another is not possible. DPDK uses the IOMMU to have
better isolation between devices, another benefit is that IOVA space can be
represented as a contiguous memory even if the PA space is heavily scattered.
In the case of virtualization, the IOMMU is responsible for isolating the device
and memory between VMs for safe device assignment without compromising the host
and other guest OSes. Without an IOMMU, any device can access the entire system
and perform DMA transactions _anywhere_.
The second feature is ACS (Access Control Services), which controls which
devices are allowed to communicate with one another and thus avoids improper
routing of packets irrespective of whether the IOMMU is enabled or not.
When IOMMU is enabled, ACS is normally configured to force all PCI Express DMA
to go through the root complex so IOMMU can translate it, impacting performance
between peers with higher latency and reduced bandwidth.
A way to avoid the performance hit is to enable Address Translation Services
(ATS). ATS-capable endpoints can prefetch IOVA -> PA translations from the IOMMU
and then perform DMA transactions directly to another endpoint. Hypervisors
enable this by enabling ATS in such endpoints, configuring ACS to enable Direct
Translated P2P, and configuring the IOMMU to allow Address Translation requests.
Another important factor is that the NVIDIA driver stack will use the PCI
Express topology of the system it is running on to determine whether the
hardware is capable of supporting P2P. The driver stack qualifies specific
chipsets, and PCI Express switches for use with GPUDirect P2P. In virtual
environments, the PCI Express topology is flattened and obfuscated to present a
uniform environment to the software inside the VM, which breaks the GPUDirect
P2P use case.
On a bare metal machine, the driver stack groups GPUs into cliques that can
perform GPUDirect P2P communication, excluding peer mappings where P2P
communication is not possible, most prominently if GPUs are attached to multiple
CPU sockets.
CPUs and local memory banks are referred to as NUMA nodes. In a two-socket
server, each of the CPUs has a local memory bank for a total of two NUMA nodes.
Some servers provide the ability to configure additional NUMA nodes per CPU,
which means a CPU socket can have two NUMA nodes  (some servers support four
NUMA nodes per socket) with local memory banks and L3 NUMA domains for improved
performance.
One of the current solutions is that the hypervisor provides additional topology
information that the driver stack can pick up and enable GPUDirect P2P between
GPUs, even if the virtualized environment does not directly expose it. The PCI
Express virtual P2P approval capability structure in the PCI configuration space
is entirely emulated by the hypervisor of passthrough GPU devices.
A clique ID is provided where GPUs with the same clique ID belong to a group of
GPUs capable of P2P communication.
On vSphere, Azure, and other CSPs, the hypervisor lays down a `topologies.xml`
which NCCL can pick up to deduce the right P2P level[^1]. NCCL leverages
InfiniBand (IB) and/or Unified Communication X (UCX) for communication, and
GPUDirect P2P and GPUDirect RDMA should just work in this case. The only caveat
is that software or applications that do not use the XML file to deduce the
topology will fail to enable GPUDirect ([`nccl-p2p-level`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-p2p-level)).
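For software that does not consume the hypervisor-provided `topologies.xml`, the
P2P level can also be pinned explicitly through that NCCL environment variable.
The following is only a minimal sketch of a Pod spec doing so; the runtime class
name and container image are illustrative placeholders, not something defined in
this document.

```yaml
# Sketch only: pin NCCL's P2P level for a workload that does not read the
# hypervisor-provided topologies.xml. Runtime class and image are placeholders.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-p2p-example
spec:
  runtimeClassName: kata            # illustrative Kata runtime class
  containers:
  - name: workload
    image: example.com/nccl-workload:latest   # placeholder image
    env:
    - name: NCCL_P2P_LEVEL          # NCCL environment variable linked above
      value: "NVL"                  # allow P2P across NVLink; see the NCCL docs for other levels
    resources:
      limits:
        nvidia.com/gpu: 2           # assumes an NVIDIA device plugin is deployed
```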
## Hypervisor PCI Express Topology
To enable every part of the accelerator stack, we propose a virtualized
reference architecture to enable GPUDirect P2P and GPUDirect RDMA for any
hypervisor. The idea is split into two parts to enable the right PCI Express
topology. The first part builds upon extending the PCI Express virtual P2P
approval capability structure to every device that wants to do P2P in some way
and groups devices by clique ID. The other part involves replicating a subset of
the host topology so that applications running in the VM do not need to read
additional information and enable the P2P capability like in the bare-metal use
case described above. The driver stack can then deduce automatically if the
topology presented in the VM is capable of P2P communication.
We will work with the following host topology in the sections below. It is
a system with two converged DPUs, each having an `A100X` GPU and two `ConnectX-6`
network ports connected to the downstream ports of a PCI Express switch.
```sh
+-00.0-[d8-df]----00.0-[d9-df]--+-00.0-[da-db]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
\-01.0-[dc-df]----00.0-[dd-df]----08.0-[de-df]----00.0 NVIDIA Corporation GA100 [A100X]
+-00.0-[3b-42]----00.0-[3c-42]--+-00.0-[3d-3e]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
\-01.0-[3f-42]----00.0-[40-42]----08.0-[41-42]----00.0 NVIDIA Corporation GA100 [A100X]
```
The green path highlighted above is the optimal and preferred path for
efficient P2P communication.
## PCI Express Virtual P2P Approval Capability
Most of the time, the PCI Express topology is flattened and obfuscated to ensure
easy migration of the VM image between different physical hardware topologies.
In Kata, we can configure the hypervisor to use PCI Express root ports to
hotplug the VFIO devices one is passing through. A user can select how many PCI
Express root ports to allocate depending on how many devices are passed through.
A recent addition to Kata detects the number of PCI Express devices
that need hotplugging and bails out if the number of root ports is insufficient.
Kata does not automatically increase the number of root ports; we want the
user to be in full control of the topology.
```toml
# /etc/kata-containers/configuration.toml
# VFIO devices are hotplugged on a bridge by default.
# Enable hot-plugging on the root bus. This may be required for devices with
# a large PCI bar, as this is a current limitation with hot-plugging on
# a bridge.
# Default "bridge-port"
hotplug_vfio = "root-port"
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
# The value means the number of pcie_root_port
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
# Default 0
pcie_root_port = 8
```
VFIO devices are hotplugged on a PCIe-PCI bridge by default. Hotplug of PCI
Express devices is only supported on PCI Express root or downstream ports. With
this configuration set, if we start up a Kata container, we can inspect our
topology and see the allocated PCI Express root ports and the hotplugged
devices.
```sh
$ lspci -tv
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+-01.0 Red Hat, Inc. Virtio console
+-02.0 Red Hat, Inc. Virtio SCSI
+-03.0 Red Hat, Inc. Virtio RNG
+-04.0-[01]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+-05.0-[02]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+-06.0-[03]----00.0 NVIDIA Corporation Device 20b8
+-07.0-[04]----00.0 NVIDIA Corporation Device 20b8
+-08.0-[05]--
+-09.0-[06]--
+-0a.0-[07]--
+-0b.0-[08]--
+-0c.0 Red Hat, Inc. Virtio socket
+-0d.0 Red Hat, Inc. Virtio file system
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
For devices with huge BARs (Base Address Registers) like the GPU (we need to
configure the PCI Express root port properly and allocate enough memory for
mapping), we have added a heuristic to Kata to deduce the right settings. Hence,
the BARs can be mapped correctly. This functionality is added to
[`go-nvlib`](https://gitlab.com/nvidia/cloud-native/go-nvlib), which is part
of Kata now.
```sh
$ sudo dmesg | grep BAR
[ 0.179960] pci 0000:00:04.0: BAR 7: assigned [io 0x1000-0x1fff]
[ 0.179962] pci 0000:00:05.0: BAR 7: assigned [io 0x2000-0x2fff]
[ 0.179963] pci 0000:00:06.0: BAR 7: assigned [io 0x3000-0x3fff]
[ 0.179964] pci 0000:00:07.0: BAR 7: assigned [io 0x4000-0x4fff]
[ 0.179966] pci 0000:00:08.0: BAR 7: assigned [io 0x5000-0x5fff]
[ 0.179967] pci 0000:00:09.0: BAR 7: assigned [io 0x6000-0x6fff]
[ 0.179968] pci 0000:00:0a.0: BAR 7: assigned [io 0x7000-0x7fff]
[ 0.179969] pci 0000:00:0b.0: BAR 7: assigned [io 0x8000-0x8fff]
[ 2.115912] pci 0000:01:00.0: BAR 0: assigned [mem 0x13000000000-0x13001ffffff 64bit pref]
[ 2.116203] pci 0000:01:00.0: BAR 2: assigned [mem 0x13002000000-0x130027fffff 64bit pref]
[ 2.683132] pci 0000:02:00.0: BAR 0: assigned [mem 0x12000000000-0x12001ffffff 64bit pref]
[ 2.683419] pci 0000:02:00.0: BAR 2: assigned [mem 0x12002000000-0x120027fffff 64bit pref]
[ 2.959155] pci 0000:03:00.0: BAR 1: assigned [mem 0x11000000000-0x117ffffffff 64bit pref]
[ 2.959345] pci 0000:03:00.0: BAR 3: assigned [mem 0x11800000000-0x11801ffffff 64bit pref]
[ 2.959523] pci 0000:03:00.0: BAR 0: assigned [mem 0xf9000000-0xf9ffffff]
[ 2.966119] pci 0000:04:00.0: BAR 1: assigned [mem 0x10000000000-0x107ffffffff 64bit pref]
[ 2.966295] pci 0000:04:00.0: BAR 3: assigned [mem 0x10800000000-0x10801ffffff 64bit pref]
[ 2.966472] pci 0000:04:00.0: BAR 0: assigned [mem 0xf7000000-0xf7ffffff]
```
The NVIDIA driver stack in this case would refuse to do P2P communication since
(1) the topology is not what it expects and (2) we do not have a qualified
chipset. Since our P2P devices are not connected to a PCI Express switch port,
we need to provide additional information to support the P2P functionality. One
way of providing such meta information would be to annotate the container; most
of the settings in Kata's configuration file can be overridden via annotations,
but this limits the flexibility, and a user would need to update all the
containers they want to run with Kata. The goal is to make such things as
transparent as possible, so we also introduced
[CDI](https://github.com/container-orchestrated-devices/container-device-interface)
(Container Device Interface) to Kata. CDI is a
[specification](https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md)
for container runtimes to support third-party devices.
As written before, we can provide a clique ID for the devices that belong
together and are capable of doing P2P. This information is provided to the
hypervisor, which will set up things in the VM accordingly. Let's suppose the
user wanted to do GPUDirect RDMA with the first GPU and the NIC that reside on
the same DPU; one could provide the specification telling the hypervisor that
they belong to the same clique.
```yaml
# /etc/cdi/nvidia.yaml
cdiVersion: 0.4.0
kind: nvidia.com/gpu
devices:
- name: gpu0
  annotations:
    bdf: "41:00.0"
    clique-id: "0"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/71"

# /etc/cdi/mellanox.yaml
cdiVersion: 0.4.0
kind: mellanox.com/nic
devices:
- name: nic0
  annotations:
    bdf: "3d:00.0"
    clique-id: "0"
    attach-pci: "true"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/66"
```
Since this setting is bound to the device and not the container, we do not need
to alter the container; we just allocate the right resource and GPUDirect RDMA is
set up correctly. Rather than exposing them separately, an idea would be to
expose a GPUDirect RDMA device via NFD (Node Feature Discovery) that combines
both of them; this way, we could make sure that the right pair is allocated and
used, as sketched below; more on the Kubernetes deployment in the next section.
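One way to picture such a combined allocation unit is a single CDI device that
injects both VFIO nodes of a clique at once. This is only a sketch in the CDI
format used above; the `kind`, device name, and paths are illustrative and reuse
the example values from the specifications shown earlier.

```yaml
# Sketch only: a combined "GPUDirect RDMA" CDI device that injects both the
# GPU and the NIC VFIO nodes of clique 0. Kind, names, and paths are
# illustrative, reusing the gpu0/nic0 examples above.
cdiVersion: 0.4.0
kind: nvidia.com/gpudirect-rdma
devices:
- name: rdma0
  annotations:
    clique-id: "0"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/71"   # GPU (gpu0 above)
    - path: "/dev/vfio/66"   # NIC (nic0 above)
```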
The GPU driver stack is leveraging the PCI Express virtual P2P approval
capability, but the NIC stack does not use it yet. One of the action items is
to enable MOFED to read the P2P approval capability and enable ATS and ACS
settings as described above.
This way, we could enable GPUDirect P2P and GPUDirect RDMA on any topology
presented to the VM application. It is the responsibility of the administrator
or infrastructure engineer to provide the right information either via
annotations or a CDI specification.
## Host Topology Replication
The other way to represent the PCI Express topology in the VM is to replicate a
subset of the topology needed to support the P2P use case inside the VM. Similar
to the configuration for the root ports, we can easily configure the usage of
PCI Express switch ports to hotplug the devices.
```toml
# /etc/kata-containers/configuration.toml
# VFIO devices are hotplugged on a bridge by default.
# Enable hot plugging on the root bus. This may be required for devices with
# a large PCI bar, as this is a current limitation with hot plugging on
# a bridge.
# Default "bridge-port"
hotplug_vfio = "switch-port"
# Before hot plugging a PCIe device, you need to add a pcie_switch_port device.
# Use this parameter when using some large PCI bar devices, such as an NVIDIA GPU
# The value means the number of pcie_switch_port
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
# Default 0
pcie_switch_port = 8
```
Each device that is passed through is attached to a PCI Express downstream port
as illustrated below. We can even replicate the host's two DPU topologies with
added metadata through CDI. Most of the time, a container only needs one
pair of GPU and NIC for GPUDirect RDMA; this is more of a showcase of what we
can do with the power of Kata and CDI. One could even think of adding groups of
devices that support P2P, even from different CPU sockets or NUMA nodes, into
one container; indeed, the first group is NUMA node 0 (red), and the second
group is NUMA node 1 (green). Since they are grouped correctly, P2P would be
enabled naturally inside a group, i.e., a clique.
```sh
$ lspci -tv
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+-01.0 Red Hat, Inc. Virtio console
+-02.0 Red Hat, Inc. Virtio SCSI
+-03.0 Red Hat, Inc. Virtio RNG
+-04.0-[01-04]----00.0-[02-04]--+-00.0-[03]----00.0 NVIDIA Corporation Device 20b8
| \-01.0-[04]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
+-05.0-[05-08]----00.0-[06-08]--+-00.0-[07]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
| \-01.0-[08]----00.0 NVIDIA Corporation Device 20b8
+-06.0 Red Hat, Inc. Virtio socket
+-07.0 Red Hat, Inc. Virtio file system
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
The configuration of using either the root port or the switch port can be applied
on a per-container or per-Pod basis, meaning we can switch PCI Express topologies
on each run of an application, as sketched below.
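As a sketch of what that per-Pod selection could look like: the annotation keys
below follow Kata's `io.katacontainers.config.<section>.<option>` pattern and
assume the corresponding hypervisor options are listed in `enable_annotations`
in the Kata configuration; the keys, runtime class, and image are illustrative.

```yaml
# Sketch only: request the switch-port topology for a single Pod via Kata
# annotations. Assumes hotplug_vfio and pcie_switch_port may be overridden
# through enable_annotations; keys, runtime class, and image are illustrative.
apiVersion: v1
kind: Pod
metadata:
  name: gpudirect-workload
  annotations:
    io.katacontainers.config.hypervisor.hotplug_vfio: "switch-port"
    io.katacontainers.config.hypervisor.pcie_switch_port: "8"
spec:
  runtimeClassName: kata-qemu       # illustrative runtime class
  containers:
  - name: cuda
    image: example.com/cuda-app:latest   # placeholder image
    resources:
      limits:
        nvidia.com/gpu: 1
```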
## Hypervisor Resource Limits
Every hypervisor will have resource limits in terms of how many PCI Express root
ports, switch ports, or bridge ports can be created, especially with devices
that need to reserve a 4K IO range per the PCI specification. Each root
or switch port instance consumes 4K of a very limited IO space (64K in total).
Simple math brings us to the conclusion that we can have a maximum of 16 PCI
Express root ports or 16 PCI Express switch ports in QEMU if devices with IO
BARs are used in the PCI Express hierarchy.
Additionally, one can have 32 slots on the PCI root bus and a maximum of 256
slots for the complete PCI(e) topology.
By default, QEMU will attach a multi-function device in the last slot of the
PCI root bus:
```sh
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
Kata will additionally add `virtio-xxx-pci` devices consuming five slots, plus a
PCIe-PCI bridge (1 slot) and a DRAM controller (1 slot), meaning that by default
eight slots are already used. This leaves 24 slots for adding other devices to
the root bus.
The problem arises with a customer use case that runs recent
RTX GPUs with Kata. The user wanted to pass eight of these GPUs through into one
container and ran into issues: those cards often consist of
four individual devices: GPU, audio, and two USB controllers (some
cards have a USB-C output).
These devices are grouped into one IOMMU group. Since one needs to pass through
the complete IOMMU group into the VM, we need to allocate 32 PCI Express root
ports or 32 PCI Express switch ports, which is technically impossible due to the
resource limits outlined above. Since all the devices appear as PCI Express
devices, we need to hotplug those into a root or switch port.
The solution to this problem is leveraging CDI. For each device, add the
information on whether it is going to be hotplugged as a PCI Express or a PCI
device, which results in either using a PCI Express root/switch port or an
ordinary PCI bridge. PCI bridges are not affected by the limited IO range. This
way, the GPU is attached as a PCI Express device to a root/switch port and the
other three PCI devices to a PCI bridge, leaving enough resources to create the
needed PCI Express root/switch ports. For example, we're going to attach the
GPUs to a PCI Express root port and the NICs to a PCI bridge.
```yaml
# /etc/cdi/mellanox.yaml
cdiVersion: 0.4.0
kind: mellanox.com/nic
devices:
- name: nic0
  annotations:
    bdf: "3d:00.0"
    clique-id: "0"
    attach-pci: "true"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/66"
- name: nic1
  annotations:
    bdf: "3d:00.1"
    clique-id: "1"
    attach-pci: "true"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/67"
```
The configuration is set to use eight root ports for the GPUs and to attach the
NICs to a PCI bridge, which in turn is connected to a PCIe-to-PCI bridge; this is
the preferred way of introducing a PCI topology in a PCI Express machine.
```sh
$ lspci -tv
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+-01.0 Red Hat, Inc. Virtio console
+-02.0 Red Hat, Inc. Virtio SCSI
+-03.0 Red Hat, Inc. Virtio RNG
+-04.0-[01]----00.0 NVIDIA Corporation Device 20b8
+-05.0-[02]----00.0 NVIDIA Corporation Device 20b8
+-06.0-[03]--
+-07.0-[04]--
+-08.0-[05]--
+-09.0-[06]--
+-0a.0-[07]--
+-0b.0-[08]--
+-0c.0-[09-0a]----00.0-[0a]--+-00.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
| \-01.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
+-0d.0 Red Hat, Inc. Virtio socket
+-0e.0 Red Hat, Inc. Virtio file system
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
The PCI devices each consume one of the 256 slots available in the PCI(e)
topology, leaving the scarce IO resources for the PCI Express devices that need
them.


@@ -39,11 +39,9 @@ use std::path::Path;
const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online";
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "cgroups"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups"))
}
macro_rules! get_controller_or_return_singular_none {
@@ -82,7 +80,7 @@ impl CgroupManager for Manager {
fn set(&self, r: &LinuxResources, update: bool) -> Result<()> {
info!(
sl!(),
sl(),
"cgroup manager set resources for container. Resources input {:?}", r
);
@@ -120,7 +118,7 @@ impl CgroupManager for Manager {
// set devices resources
set_devices_resources(&self.cgroup, &r.devices, res);
info!(sl!(), "resources after processed {:?}", res);
info!(sl(), "resources after processed {:?}", res);
// apply resources
self.cgroup.apply(res)?;
@@ -197,7 +195,7 @@ impl CgroupManager for Manager {
if guest_cpuset.is_empty() {
return Ok(());
}
info!(sl!(), "update_cpuset_path to: {}", guest_cpuset);
info!(sl(), "update_cpuset_path to: {}", guest_cpuset);
let h = cgroups::hierarchies::auto();
let root_cg = h.root_control_group();
@@ -205,12 +203,12 @@ impl CgroupManager for Manager {
let root_cpuset_controller: &CpuSetController = root_cg.controller_of().unwrap();
let path = root_cpuset_controller.path();
let root_path = Path::new(path);
info!(sl!(), "root cpuset path: {:?}", &path);
info!(sl(), "root cpuset path: {:?}", &path);
let container_cpuset_controller: &CpuSetController = self.cgroup.controller_of().unwrap();
let path = container_cpuset_controller.path();
let container_path = Path::new(path);
info!(sl!(), "container cpuset path: {:?}", &path);
info!(sl(), "container cpuset path: {:?}", &path);
let mut paths = vec![];
for ancestor in container_path.ancestors() {
@@ -219,7 +217,7 @@ impl CgroupManager for Manager {
}
paths.push(ancestor);
}
info!(sl!(), "parent paths to update cpuset: {:?}", &paths);
info!(sl(), "parent paths to update cpuset: {:?}", &paths);
let mut i = paths.len();
loop {
@@ -233,7 +231,7 @@ impl CgroupManager for Manager {
.to_str()
.unwrap()
.trim_start_matches(root_path.to_str().unwrap());
info!(sl!(), "updating cpuset for parent path {:?}", &r_path);
info!(sl(), "updating cpuset for parent path {:?}", &r_path);
let cg = new_cgroup(cgroups::hierarchies::auto(), r_path)?;
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
cpuset_controller.set_cpus(guest_cpuset)?;
@@ -241,7 +239,7 @@ impl CgroupManager for Manager {
if !container_cpuset.is_empty() {
info!(
sl!(),
sl(),
"updating cpuset for container path: {:?} cpuset: {}",
&container_path,
container_cpuset
@@ -276,7 +274,7 @@ fn set_network_resources(
network: &LinuxNetwork,
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set network");
info!(sl(), "cgroup manager set network");
// set classid
// description can be found at https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/net_cls.html
@@ -303,7 +301,7 @@ fn set_devices_resources(
device_resources: &[LinuxDeviceCgroup],
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set devices");
info!(sl(), "cgroup manager set devices");
let mut devices = vec![];
for d in device_resources.iter() {
@@ -332,7 +330,7 @@ fn set_hugepages_resources(
hugepage_limits: &[LinuxHugepageLimit],
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set hugepage");
info!(sl(), "cgroup manager set hugepage");
let mut limits = vec![];
let hugetlb_controller = cg.controller_of::<HugeTlbController>();
@@ -346,7 +344,7 @@ fn set_hugepages_resources(
limits.push(hr);
} else {
warn!(
sl!(),
sl(),
"{} page size support cannot be verified, dropping requested limit", l.page_size
);
}
@@ -359,7 +357,7 @@ fn set_block_io_resources(
blkio: &LinuxBlockIo,
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set block io");
info!(sl(), "cgroup manager set block io");
res.blkio.weight = blkio.weight;
res.blkio.leaf_weight = blkio.leaf_weight;
@@ -387,13 +385,13 @@ fn set_block_io_resources(
}
fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
info!(sl!(), "cgroup manager set cpu");
info!(sl(), "cgroup manager set cpu");
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
if !cpu.cpus.is_empty() {
if let Err(e) = cpuset_controller.set_cpus(&cpu.cpus) {
warn!(sl!(), "write cpuset failed: {:?}", e);
warn!(sl(), "write cpuset failed: {:?}", e);
}
}
@@ -424,7 +422,7 @@ fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
}
fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool) -> Result<()> {
info!(sl!(), "cgroup manager set memory");
info!(sl(), "cgroup manager set memory");
let mem_controller: &MemController = cg.controller_of().unwrap();
if !update {
@@ -493,7 +491,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool
}
fn set_pids_resources(cg: &cgroups::Cgroup, pids: &LinuxPids) -> Result<()> {
info!(sl!(), "cgroup manager set pids");
info!(sl(), "cgroup manager set pids");
let pid_controller: &PidController = cg.controller_of().unwrap();
let v = if pids.limit > 0 {
MaxValue::Value(pids.limit)
@@ -965,7 +963,7 @@ pub fn get_paths() -> Result<HashMap<String, String>> {
for l in fs::read_to_string(PATHS)?.lines() {
let fl: Vec<&str> = l.split(':').collect();
if fl.len() != 3 {
info!(sl!(), "Corrupted cgroup data!");
info!(sl(), "Corrupted cgroup data!");
continue;
}
@@ -986,7 +984,7 @@ pub fn get_mounts(paths: &HashMap<String, String>) -> Result<HashMap<String, Str
let post: Vec<&str> = p[1].split(' ').collect();
if post.len() != 3 {
warn!(sl!(), "can't parse {} line {:?}", MOUNTS, l);
warn!(sl(), "can't parse {} line {:?}", MOUNTS, l);
continue;
}


@@ -16,11 +16,9 @@ use inotify::{Inotify, WatchMask};
use tokio::io::AsyncReadExt;
use tokio::sync::mpsc::{channel, Receiver};
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
}
pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
@@ -38,7 +36,7 @@ pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
fn get_value_from_cgroup(path: &Path, key: &str) -> Result<i64> {
let content = fs::read_to_string(path)?;
info!(
sl!(),
sl(),
"get_value_from_cgroup file: {:?}, content: {}", &path, &content
);
@@ -67,11 +65,11 @@ async fn register_memory_event_v2(
let event_control_path = Path::new(&cg_dir).join(memory_event_name);
let cgroup_event_control_path = Path::new(&cg_dir).join(cgroup_event_name);
info!(
sl!(),
sl(),
"register_memory_event_v2 event_control_path: {:?}", &event_control_path
);
info!(
sl!(),
sl(),
"register_memory_event_v2 cgroup_event_control_path: {:?}", &cgroup_event_control_path
);
@@ -82,8 +80,8 @@ async fn register_memory_event_v2(
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
info!(sl!(), "ev_wd: {:?}", ev_wd);
info!(sl!(), "cg_wd: {:?}", cg_wd);
info!(sl(), "ev_wd: {:?}", ev_wd);
info!(sl(), "cg_wd: {:?}", cg_wd);
let (sender, receiver) = channel(100);
let containere_id = containere_id.to_string();
@@ -97,17 +95,17 @@ async fn register_memory_event_v2(
while let Some(event_or_error) = stream.next().await {
let event = event_or_error.unwrap();
info!(
sl!(),
sl(),
"container[{}] get event for container: {:?}", &containere_id, &event
);
// info!("is1: {}", event.wd == wd1);
info!(sl!(), "event.wd: {:?}", event.wd);
info!(sl(), "event.wd: {:?}", event.wd);
if event.wd == ev_wd {
let oom = get_value_from_cgroup(&event_control_path, "oom_kill");
if oom.unwrap_or(0) > 0 {
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
error!(sl!(), "send containere_id failed, error: {:?}", e);
error!(sl(), "send containere_id failed, error: {:?}", e);
});
return;
}
@@ -171,13 +169,13 @@ async fn register_memory_event(
let mut buf = [0u8; 8];
match eventfd_stream.read(&mut buf).await {
Err(err) => {
warn!(sl!(), "failed to read from eventfd: {:?}", err);
warn!(sl(), "failed to read from eventfd: {:?}", err);
return;
}
Ok(_) => {
let content = fs::read_to_string(path.clone());
info!(
sl!(),
sl(),
"cgroup event for container: {}, path: {:?}, content: {:?}",
&containere_id,
&path,
@@ -193,7 +191,7 @@ async fn register_memory_event(
}
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
error!(sl!(), "send containere_id failed, error: {:?}", e);
error!(sl(), "send containere_id failed, error: {:?}", e);
});
}
});


@@ -1596,10 +1596,8 @@ mod tests {
use tempfile::tempdir;
use test_utils::skip_if_not_root;
macro_rules! sl {
() => {
slog_scope::logger()
};
fn sl() -> slog::Logger {
slog_scope::logger()
}
#[test]
@@ -1854,7 +1852,7 @@ mod tests {
let _ = new_linux_container_and_then(|mut c: LinuxContainer| {
c.processes.insert(
1,
Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap(),
Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap(),
);
let p = c.get_process("123");
assert!(p.is_ok(), "Expecting Ok, Got {:?}", p);
@@ -1881,7 +1879,7 @@ mod tests {
let (c, _dir) = new_linux_container();
let ret = c
.unwrap()
.start(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
.start(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
.await;
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}
@@ -1891,7 +1889,7 @@ mod tests {
let (c, _dir) = new_linux_container();
let ret = c
.unwrap()
.run(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
.run(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
.await;
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}


@@ -161,7 +161,7 @@ impl Process {
pub fn notify_term_close(&mut self) {
let notify = self.term_exit_notifier.clone();
notify.notify_waiters();
notify.notify_one();
}
pub fn close_stdin(&mut self) {


@@ -26,11 +26,9 @@ use oci::{LinuxDeviceCgroup, LinuxResources, Spec};
use protocols::agent::Device;
use tracing::instrument;
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "device"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "device"))
}
const VM_ROOTFS: &str = "/";
@@ -78,7 +76,7 @@ where
{
let syspci = Path::new(&syspci);
let drv = drv.as_ref();
info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv);
info!(sl(), "rebind_pci_driver: {} => {:?}", dev, drv);
let devpath = syspci.join("devices").join(dev.to_string());
let overridepath = &devpath.join("driver_override");
@@ -606,7 +604,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -
let host_minor = specdev.minor;
info!(
sl!(),
sl(),
"update_spec_devices() updating device";
"container_path" => &specdev.path,
"type" => &specdev.r#type,
@@ -659,7 +657,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -
if let Some(update) = res_updates.get(&(host_type.as_str(), host_major, host_minor))
{
info!(
sl!(),
sl(),
"update_spec_devices() updating resource";
"type" => &host_type,
"host_major" => host_major,
@@ -923,7 +921,7 @@ pub async fn add_devices(
#[instrument]
async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
// log before validation to help with debugging gRPC protocol version differences.
info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
info!(sl(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
device.id, device.type_, device.vm_path, device.container_path, device.options);
if device.type_.is_empty() {


@@ -38,11 +38,9 @@ const KATA_CC_IMAGE_WORK_DIR: &str = "/run/image/";
const KATA_CC_PAUSE_BUNDLE: &str = "/pause_bundle";
const CONFIG_JSON: &str = "config.json";
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger()
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups"))
}
pub struct ImageService {
@@ -57,18 +55,17 @@ impl ImageService {
env::set_var("CC_IMAGE_WORK_DIR", KATA_CC_IMAGE_WORK_DIR);
let mut image_client = ImageClient::default();
let image_policy_file = &AGENT_CONFIG.read().await.image_policy_file;
let image_policy_file = &AGENT_CONFIG.image_policy_file;
if !image_policy_file.is_empty() {
image_client.config.file_paths.sigstore_config = image_policy_file.clone();
}
let simple_signing_sigstore_config =
&AGENT_CONFIG.read().await.simple_signing_sigstore_config;
let simple_signing_sigstore_config = &AGENT_CONFIG.simple_signing_sigstore_config;
if !simple_signing_sigstore_config.is_empty() {
image_client.config.file_paths.sigstore_config = simple_signing_sigstore_config.clone();
}
let image_registry_auth_file = &AGENT_CONFIG.read().await.image_registry_auth_file;
let image_registry_auth_file = &AGENT_CONFIG.image_registry_auth_file;
if !image_registry_auth_file.is_empty() {
image_client.config.file_paths.auth_file = image_registry_auth_file.clone();
}
@@ -88,7 +85,7 @@ impl ImageService {
return Err(anyhow!("Pause image not present in rootfs"));
}
info!(sl!(), "use guest pause image cid {:?}", cid);
info!(sl(), "use guest pause image cid {:?}", cid);
let pause_bundle = Path::new(CONTAINER_BASE).join(cid);
let pause_rootfs = pause_bundle.join("rootfs");
let pause_config = pause_bundle.join(CONFIG_JSON);
@@ -159,12 +156,12 @@ impl ImageService {
async fn pull_image(&self, req: &image::PullImageRequest) -> Result<String> {
env::set_var("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH);
let https_proxy = &AGENT_CONFIG.read().await.https_proxy;
let https_proxy = &AGENT_CONFIG.https_proxy;
if !https_proxy.is_empty() {
env::set_var("HTTPS_PROXY", https_proxy);
}
let no_proxy = &AGENT_CONFIG.read().await.no_proxy;
let no_proxy = &AGENT_CONFIG.no_proxy;
if !no_proxy.is_empty() {
env::set_var("NO_PROXY", no_proxy);
}
@@ -179,7 +176,7 @@ impl ImageService {
return Ok(image.to_owned());
}
let aa_kbc_params = &AGENT_CONFIG.read().await.aa_kbc_params;
let aa_kbc_params = &AGENT_CONFIG.aa_kbc_params;
if !aa_kbc_params.is_empty() {
match self.attestation_agent_started.compare_exchange_weak(
false,
@@ -188,22 +185,21 @@ impl ImageService {
Ordering::SeqCst,
) {
Ok(_) => Self::init_attestation_agent()?,
Err(_) => info!(sl!(), "Attestation Agent already running"),
Err(_) => info!(sl(), "Attestation Agent already running"),
}
}
// If the attestation-agent is being used, then enable the authenticated credentials support
info!(
sl!(),
sl(),
"image_client.config.auth set to: {}",
!aa_kbc_params.is_empty()
);
self.image_client.lock().await.config.auth = !aa_kbc_params.is_empty();
// Read enable signature verification from the agent config and set it in the image_client
let enable_signature_verification =
&AGENT_CONFIG.read().await.enable_signature_verification;
let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification;
info!(
sl!(),
sl(),
"enable_signature_verification set to: {}", enable_signature_verification
);
self.image_client.lock().await.config.security_validate = *enable_signature_verification;
@@ -215,7 +211,7 @@ impl ImageService {
let decrypt_config = format!("provider:attestation-agent:{}", aa_kbc_params);
info!(sl!(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
info!(sl(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
// Image layers will store at KATA_CC_IMAGE_WORK_DIR, generated bundles
// with rootfs and config.json will store under CONTAINER_BASE/cid.
let res = self
@@ -228,13 +224,13 @@ impl ImageService {
match res {
Ok(image) => {
info!(
sl!(),
sl(),
"pull and unpack image {:?}, cid: {:?}, with image-rs succeed. ", image, cid
);
}
Err(e) => {
error!(
sl!(),
sl(),
"pull and unpack image {:?}, cid: {:?}, with image-rs failed with {:?}. ",
image,
cid,


@@ -65,7 +65,7 @@ use tokio::{
io::AsyncWrite,
sync::{
watch::{channel, Receiver},
Mutex, RwLock,
Mutex,
},
task::JoinHandle,
};
@@ -84,12 +84,11 @@ cfg_if! {
const NAME: &str = "kata-agent";
lazy_static! {
static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new(
static ref AGENT_CONFIG: AgentConfig =
// Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig
// clap::Parser::parse() greedily process all command line input including cargo test parameters,
// so should only be used inside main.
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap()
));
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap();
}
#[derive(Parser)]
@@ -182,13 +181,13 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
lazy_static::initialize(&AGENT_CONFIG);
init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?;
init_agent_as_init(&logger, AGENT_CONFIG.unified_cgroup_hierarchy)?;
drop(logger_async_guard);
} else {
lazy_static::initialize(&AGENT_CONFIG);
}
let config = AGENT_CONFIG.read().await;
let config = &AGENT_CONFIG;
let log_vport = config.log_vport as u32;
let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone()));
@@ -201,7 +200,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let (logger, logger_async_guard) =
logging::create_logger(NAME, "agent", config.log_level, writer);
announce(&logger, &config);
announce(&logger, config);
// This variable is required as it enables the global (and crucially static) logger,
// which is required to satisfy the the lifetime constraints of the auto-generated gRPC code.
@@ -229,7 +228,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let span_guard = root_span.enter();
// Start the sandbox and wait for its ttRPC server to end
start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
start_sandbox(&logger, config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
// Install a NOP logger for the remainder of the shutdown sequence
// to ensure any log calls made by local crates using the scope logger


@@ -15,11 +15,9 @@ use tracing::instrument;
const NAMESPACE_KATA_AGENT: &str = "kata_agent";
const NAMESPACE_KATA_GUEST: &str = "kata_guest";
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "metrics"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "metrics"))
}
lazy_static! {
@@ -139,7 +137,7 @@ fn update_agent_metrics() -> Result<()> {
Ok(p) => p,
Err(e) => {
// FIXME: return Ok for all errors?
warn!(sl!(), "failed to create process instance: {:?}", e);
warn!(sl(), "failed to create process instance: {:?}", e);
return Ok(());
}
@@ -160,7 +158,7 @@ fn update_agent_metrics() -> Result<()> {
// io
match me.io() {
Err(err) => {
info!(sl!(), "failed to get process io stat: {:?}", err);
info!(sl(), "failed to get process io stat: {:?}", err);
}
Ok(io) => {
set_gauge_vec_proc_io(&AGENT_IO_STAT, &io);
@@ -169,7 +167,7 @@ fn update_agent_metrics() -> Result<()> {
match me.stat() {
Err(err) => {
info!(sl!(), "failed to get process stat: {:?}", err);
info!(sl(), "failed to get process stat: {:?}", err);
}
Ok(stat) => {
set_gauge_vec_proc_stat(&AGENT_PROC_STAT, &stat);
@@ -177,7 +175,7 @@ fn update_agent_metrics() -> Result<()> {
}
match me.status() {
Err(err) => error!(sl!(), "failed to get process status: {:?}", err),
Err(err) => error!(sl(), "failed to get process status: {:?}", err),
Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status),
}
@@ -189,7 +187,7 @@ fn update_guest_metrics() {
// try get load and task info
match procfs::LoadAverage::new() {
Err(err) => {
info!(sl!(), "failed to get guest LoadAverage: {:?}", err);
info!(sl(), "failed to get guest LoadAverage: {:?}", err);
}
Ok(load) => {
GUEST_LOAD
@@ -209,7 +207,7 @@ fn update_guest_metrics() {
// try to get disk stats
match procfs::diskstats() {
Err(err) => {
info!(sl!(), "failed to get guest diskstats: {:?}", err);
info!(sl(), "failed to get guest diskstats: {:?}", err);
}
Ok(diskstats) => {
for diskstat in diskstats {
@@ -221,7 +219,7 @@ fn update_guest_metrics() {
// try to get vm stats
match procfs::vmstat() {
Err(err) => {
info!(sl!(), "failed to get guest vmstat: {:?}", err);
info!(sl(), "failed to get guest vmstat: {:?}", err);
}
Ok(vmstat) => {
for (k, v) in vmstat {
@@ -233,7 +231,7 @@ fn update_guest_metrics() {
// cpu stat
match procfs::KernelStats::new() {
Err(err) => {
info!(sl!(), "failed to get guest KernelStats: {:?}", err);
info!(sl(), "failed to get guest KernelStats: {:?}", err);
}
Ok(kernel_stats) => {
set_gauge_vec_cpu_time(&GUEST_CPU_TIME, "total", &kernel_stats.total);
@@ -246,7 +244,7 @@ fn update_guest_metrics() {
// try to get net device stats
match procfs::net::dev_status() {
Err(err) => {
info!(sl!(), "failed to get guest net::dev_status: {:?}", err);
info!(sl(), "failed to get guest net::dev_status: {:?}", err);
}
Ok(devs) => {
// netdev: map[string]procfs::net::DeviceStatus
@@ -259,7 +257,7 @@ fn update_guest_metrics() {
// get statistics about memory from /proc/meminfo
match procfs::Meminfo::new() {
Err(err) => {
info!(sl!(), "failed to get guest Meminfo: {:?}", err);
info!(sl(), "failed to get guest Meminfo: {:?}", err);
}
Ok(meminfo) => {
set_gauge_vec_meminfo(&GUEST_MEMINFO, &meminfo);

File diff suppressed because it is too large

View File

@@ -69,7 +69,7 @@ macro_rules! trace_rpc_call {
propagator.extract(&extract_carrier_from_ttrpc($ctx))
});
info!(sl!(), "rpc call from shim to agent: {:?}", $name);
info!(sl(), "rpc call from shim to agent: {:?}", $name);
// generate tracing span
let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req);

View File

@@ -19,11 +19,9 @@ use tokio::sync::watch::Receiver;
use tokio::sync::Mutex;
use tracing::instrument;
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "uevent"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "uevent"))
}
#[derive(Debug, Default, Clone, PartialEq, Eq)]
@@ -120,11 +118,11 @@ pub async fn wait_for_uevent(
) -> Result<Uevent> {
let logprefix = format!("Waiting for {:?}", &matcher);
info!(sl!(), "{}", logprefix);
info!(sl(), "{}", logprefix);
let mut sb = sandbox.lock().await;
for uev in sb.uevent_map.values() {
if matcher.is_match(uev) {
info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev);
info!(sl(), "{}: found {:?} in uevent map", logprefix, &uev);
return Ok(uev.clone());
}
}
@@ -139,9 +137,9 @@ pub async fn wait_for_uevent(
sb.uevent_watchers.push(Some((Box::new(matcher), tx)));
drop(sb); // unlock
info!(sl!(), "{}: waiting on channel", logprefix);
info!(sl(), "{}: waiting on channel", logprefix);
let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout;
let hotplug_timeout = AGENT_CONFIG.hotplug_timeout;
let uev = match tokio::time::timeout(hotplug_timeout, rx).await {
Ok(v) => v?,
@@ -157,7 +155,7 @@ pub async fn wait_for_uevent(
}
};
info!(sl!(), "{}: found {:?} on channel", logprefix, &uev);
info!(sl(), "{}: found {:?} on channel", logprefix, &uev);
Ok(uev)
}

View File

@@ -341,7 +341,10 @@ impl DragonballInner {
// cannot exceed maximum value
if new_vcpus > self.config.cpu_info.default_maxvcpus {
return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus"));
warn!(
sl!(),
"Cannot allocate more vcpus than the max allowed number of vcpus. The maximum allowed amount of vcpus will be used instead.");
return Ok((current_vcpus, self.config.cpu_info.default_maxvcpus));
}
Ok((current_vcpus, new_vcpus))

View File

@@ -105,7 +105,12 @@ impl InitialSizeManager {
hv.cpu_info.default_vcpus = self.resource.vcpu as i32
}
if self.resource.mem_mb > 0 {
hv.memory_info.default_memory = self.resource.mem_mb;
// Since the memory overhead introduced by the kata-agent and system components
// eats into the amount of memory the user can actually use, we choose to
// add to default_memory here instead of overriding it.
// (If we overrode default_memory here and user applications still used as much
// memory as they originally expected, they could easily hit OOM.)
hv.memory_info.default_memory += self.resource.mem_mb;
}
Ok(())
}

View File

@@ -17,7 +17,7 @@ import (
"github.com/prometheus/procfs"
"github.com/urfave/cli"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
@@ -113,8 +113,8 @@ type HypervisorInfo struct {
SocketPath string
Msize9p uint32
MemorySlots uint32
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
Debug bool
}
@@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
SocketPath: socketPath,
}, nil
}

View File

@@ -19,12 +19,12 @@ import (
"testing"
"github.com/BurntSushi/toml"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/urfave/cli"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
@@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error {
return nil
}
func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
var coldPlugVFIO hv.PCIePort
func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) {
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
const logPath = "/log/path"
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
kernelPath := filepath.Join(prefixDir, "kernel")
@@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
blockStorageDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.NoPort
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.NoPort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
@@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: hypConfig.NumVCPUs,
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
@@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
return "", oci.RuntimeConfig{}, err
}
_, config, err = katautils.LoadConfiguration(configFile, true)
_, ociConfig, err = katautils.LoadConfiguration(configFile, true)
if err != nil {
return "", oci.RuntimeConfig{}, err
}
return configFile, config, nil
return configFile, ociConfig, nil
}
func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) {
@@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
}

View File

@@ -131,6 +131,11 @@ default_maxmemory = @DEFMAXMEMSZ@
# Shared file system type:
# - virtio-fs (default)
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -186,6 +186,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_SEV_VIRTIOFS@"
# Path to vhost-user-fs daemon.
@@ -669,4 +674,4 @@ service_offload = @DEFSERVICEOFFLOAD@
#
# Keys can be remotely provisioned. The Kata agent fetches them from e.g.
# a HTTPS URL:
#provision=https://my-key-broker.foo/tenant/<tenant-id>
#provision=https://my-key-broker.foo/tenant/<tenant-id>

View File

@@ -184,6 +184,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_SNP_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -172,6 +172,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -206,6 +206,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
# Path to vhost-user-fs daemon.
@@ -380,8 +385,15 @@ pflashes = []
# Default false
#hotplug_vfio_on_root_bus = true
# Enable hot-plugging of VFIO devices to a bridge-port,
# root-port or switch-port.
# The default setting is "no-port"
#hot_plug_vfio = "root-port"
# In a confidential compute environment hot-plugging can compromise
# security. Enable cold-plugging of VFIO devices to a root-port.
# security.
# Enable cold-plugging of VFIO devices to a bridge-port,
# root-port or switch-port.
# The default setting is "no-port", which means disabled.
#cold_plug_vfio = "root-port"

View File

@@ -20,7 +20,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
@@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) {
assert.Error(err)
}
func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
var coldPlugVFIO hv.PCIePort
func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) {
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
if dir == "" {
return "", fmt.Errorf("BUG: need directory")
}
@@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
blockDeviceDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
coldPlugVFIO = hv.RootPort
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.RootPort
configFileOptions := ktu.RuntimeConfigOptions{
Hypervisor: "qemu",
@@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
}

View File

@@ -81,6 +81,17 @@ const (
// VirtioFSNydus means use nydus for the shared file system
VirtioFSNydus = "virtio-fs-nydus"
// NoSharedFS means *no* shared file system solution will be used
// and files will be copied into the guest system.
//
// WARNING: This should be carefully used, and only used in very few
// specific cases, as any update to the mount will *NOT* be reflected
// during the lifecycle of the pod, causing issues with rotation of
// secrets, certs, or configurations via kubernetes objects like
// configMaps or secrets, as those will be copied into the guest at
// *pod* *creation* *time*.
NoSharedFS = "none"
)
const (
@@ -114,14 +125,117 @@ const (
// SysDevPrefix is static string of /sys/dev
var SysDevPrefix = "/sys/dev"
// SysIOMMUPath is static string of /sys/kernel/iommu_groups
var SysIOMMUPath = "/sys/kernel/iommu_groups"
// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
var getSysDevPath = getSysDevPathImpl
// PCIePortBusPrefix gives us the correct bus naming depending on the port
// used to hot(cold)-plug the device
type PCIePortBusPrefix string
const (
PCIeRootPortPrefix PCIePortBusPrefix = "rp"
PCIeSwitchPortPrefix PCIePortBusPrefix = "sw"
PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup"
PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
PCIBridgePortPrefix PCIePortBusPrefix = "bp"
)
func (p PCIePortBusPrefix) String() string {
switch p {
case PCIeRootPortPrefix:
fallthrough
case PCIeSwitchPortPrefix:
fallthrough
case PCIeSwitchUpstreamPortPrefix:
fallthrough
case PCIeSwitchhDownstreamPortPrefix:
fallthrough
case PCIBridgePortPrefix:
return string(p)
}
return fmt.Sprintf("<unknown PCIePortBusPrefix: %s>", string(p))
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attaches VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attaches VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
// InvalidPort is for invalid port
InvalidPort = "invalid-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
fallthrough
case InvalidPort:
return string(p)
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
RootPort: PCIeRootPortPrefix,
SwitchPort: PCIeSwitchhDownstreamPortPrefix,
BridgePort: PCIBridgePortPrefix,
}
func (p PCIePort) Invalid() bool {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
return false
}
return true
}
func (p PCIePort) Valid() bool {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
return true
}
return false
}
type PCIePortMapping map[string]bool
var (
// Each of these structures keeps track of the devices attached to the
// different types of PCI ports. We can deduce the Bus number from it
// and eliminate duplicate assignments.
PCIeDevices = map[PCIePort]PCIePortMapping{}
)
// DeviceInfo is an embedded type that contains device data common to all types of devices.
type DeviceInfo struct {
// DriverOptions is specific options for each device driver
@@ -167,6 +281,9 @@ type DeviceInfo struct {
// ColdPlug specifies whether the device must be cold plugged (true)
// or hot plugged (false).
ColdPlug bool
// Specifies the PCIe port type to which the device is attached
Port PCIePort
}
// BlockDrive represents a block storage drive which may be used in case the storage
@@ -268,14 +385,8 @@ const (
VFIOAPDeviceMediatedType
)
type VFIODev interface {
GetID() *string
GetType() VFIODeviceType
GetSysfsDev() *string
}
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
type VFIOPCIDev struct {
// VFIODev represents a VFIO PCI device used for hotplugging
type VFIODev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
@@ -305,44 +416,15 @@ type VFIOPCIDev struct {
// IsPCIe specifies device is PCIe or PCI
IsPCIe bool
}
func (d VFIOPCIDev) GetID() *string {
return &d.ID
}
func (d VFIOPCIDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOPCIDev) GetSysfsDev() *string {
return &d.SysfsDev
}
type VFIOAPDev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
// sysfsdev of VFIO mediated device
SysfsDev string
// APDevices are the Adjunct Processor devices assigned to the mdev
APDevices []string
// Type of VFIO device
Type VFIODeviceType
}
// Rank identifies a device in an IOMMU group
Rank int
func (d VFIOAPDev) GetID() *string {
return &d.ID
}
func (d VFIOAPDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOAPDev) GetSysfsDev() *string {
return &d.SysfsDev
// Port is the PCIe port type to which the device is attached
Port PCIePort
}
// RNGDev represents a random number generator device
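
To illustrate how the new PCIePort helpers compose, here is a minimal, stand-alone Go sketch (the main function and its output are illustrative only; it uses just the exported names introduced in this hunk):

package main

import (
	"fmt"

	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)

func main() {
	port := config.RootPort
	// RootPort, SwitchPort, BridgePort and NoPort are considered valid.
	fmt.Println(port.Valid(), port.Invalid()) // true false
	// PCIePortPrefixMapping maps the port type to its bus-name prefix,
	// e.g. "rp" for root-port, "swdp" for switch-port, "bp" for bridge-port.
	prefix := config.PCIePortPrefixMapping[port]
	// The first device attached to this port type would land on bus "rp0".
	fmt.Printf("bus=%s%d\n", prefix, 0)
}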

View File

@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
return api.DeviceLogger()
}
// Identify PCIe device by reading the size of the PCI config space
// IsPCIeDevice identifies a PCIe device by reading the size of the PCI config space.
// Plain PCI devices have 256 bytes of config space, whereas PCIe devices have 4K.
func isPCIeDevice(bdf string) bool {
func IsPCIeDevice(bdf string) bool {
if len(strings.Split(bdf, ":")) == 2 {
bdf = PCIDomain + ":" + bdf
}
@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
// We can reuse this function at various levels, sandbox, container.
// Only the VFIO module is allowed to do bus assignments, all other modules need to
// ignore it if used as helper function to get VFIO information.
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
vfioDevs := []*config.VFIODev{}
vfioGroup := filepath.Base(device.HostPath)
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
if err != nil {
@@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return nil, err
}
@@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
vfio = config.VFIODev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
IsPCIe: IsPCIeDevice(deviceBDF),
Class: pciClass,
Rank: -1,
Port: device.Port,
}
if isPCIe && !ignoreBusAssignment {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[deviceBDF] = true
}
vfio = vfioPCI
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return nil, err
}
vfio = config.VFIOAPDev{
vfio = config.VFIODev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
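
The PCIe-versus-PCI distinction that IsPCIeDevice relies on comes down to the size of the device's config file in sysfs; a self-contained sketch of that check (the helper name and constant below are illustrative, not the exact implementation):

package pcidetect

import (
	"os"
	"path/filepath"
)

// isPCIeByConfigSpace reports whether the device at the given BDF exposes the
// 4 KiB extended (PCIe) configuration space rather than the plain 256-byte one.
func isPCIeByConfigSpace(bdf string) bool {
	const pciConfigSpaceSize = 256 // size of a plain PCI config space in bytes
	info, err := os.Stat(filepath.Join("/sys/bus/pci/devices", bdf, "config"))
	if err != nil {
		return false
	}
	return info.Size() > pciConfigSpaceSize
}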

View File

@@ -28,14 +28,9 @@ const (
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
vfioDevPath = "/dev/vfio/%s"
pcieRootPortPrefix = "rp"
vfioAPSysfsDir = "/sys/devices/vfio_ap"
)
var (
AllPCIeDevs = map[string]bool{}
)
// VFIODevice is a vfio device meant to be passed to the hypervisor
// to be used by the Virtual Machine.
type VFIODevice struct {
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
}
}()
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
if err != nil {
return err
}
for _, vfio := range device.VfioDevs {
if vfio.IsPCIe {
busIndex := len(config.PCIeDevices[vfio.Port])
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
config.PCIeDevices[vfio.Port][vfio.BDF] = true
}
}
coldPlug := device.DeviceInfo.ColdPlug
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
for _, dev := range ds.VFIODevs {
var vfio config.VFIODev
vfioDeviceType := (*device.VfioDevs[0]).GetType()
switch vfioDeviceType {
switch dev.Type {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
bdf := ""
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDev.BDF
}
vfio = config.VFIOPCIDev{
ID: *(*dev).GetID(),
Type: config.VFIODeviceType((*dev).GetType()),
BDF: bdf,
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
Type: config.VFIODeviceType(dev.Type),
BDF: dev.BDF,
SysfsDev: dev.SysfsDev,
}
case config.VFIOAPDeviceMediatedType:
vfio = config.VFIOAPDev{
ID: *(*dev).GetID(),
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
SysfsDev: dev.SysfsDev,
}
default:
deviceLogger().WithError(
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
// here it shares function from *GenericDevice so we don't need duplicate codes
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
if err != nil {
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:
// Get bdf of device eg. 0000:00:1c.0
deviceBDF = getBDF(deviceFileName)
// OLD IMPL: deviceBDF = getBDF(deviceFileName)
// The old implementation did not consider the case where
// VFIO devices are located on different root busses. The
// kata-agent handles that case now, so use the full PCI address here.
deviceBDF = deviceFileName
// Get sysfs path used by cloud-hypervisor
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
case config.VFIOPCIDeviceMediatedType:
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
case config.VFIOAPDeviceMediatedType:
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {
// getBDF returns the BDF of pci device
// Expected input string format is [<domain>]:[<bus>][<slot>].[<func>] eg. 0000:02:10.0
func getBDF(deviceSysStr string) string {
func GetBDF(deviceSysStr string) string {
tokens := strings.SplitN(deviceSysStr, ":", 2)
if len(tokens) == 1 {
return ""
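
For context, the bus-assignment bookkeeping that the new Attach code performs can be summarised as follows (a sketch only: assignBus is a made-up wrapper, and config.PCIeDevices is assumed to have been initialised by the device manager as shown in a later hunk):

package vfiosketch

import (
	"fmt"

	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)

// assignBus mirrors the hunk above: the bus index is the number of devices
// already tracked for the device's port type, and the BDF is recorded so the
// same device is not assigned twice.
func assignBus(dev *config.VFIODev) {
	if !dev.IsPCIe {
		return
	}
	busIndex := len(config.PCIeDevices[dev.Port])
	dev.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[dev.Port], busIndex)
	config.PCIeDevices[dev.Port][dev.BDF] = true
}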

View File

@@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) {
}
data := []testData{
{"0000:02:10.0", "02:10.0"},
{"0000:02:10.0", "0000:02:10.0"},
{"0000:0210.0", ""},
{"f79944e4-5a3d-11e8-99ce-", ""},
{"f79944e4-5a3d-11e8-99ce", ""},
@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
}
for _, d := range data {
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:

View File

@@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
dm.blockDriver = config.VirtioSCSI
}
drivers.AllPCIeDevs = make(map[string]bool)
config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
config.PCIeDevices[config.RootPort] = make(map[string]bool)
config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
config.PCIeDevices[config.BridgePort] = make(map[string]bool)
for _, dev := range devices {
dm.devices[dev.DeviceID()] = dev
@@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
}
if IsVFIO(devInfo.HostPath) {
return drivers.NewVFIODevice(&devInfo), nil
} else if isVhostUserBlk(devInfo) {
} else if IsVhostUserBlk(devInfo) {
if devInfo.DriverOptions == nil {
devInfo.DriverOptions = make(map[string]string)
}

View File

@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
_, err = os.Create(deviceConfigFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
config.SysBusPciDevicesPath = devicesDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
}()

View File

@@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
}
// isVhostUserBlk checks if the device is a VhostUserBlk device.
func isVhostUserBlk(devInfo config.DeviceInfo) bool {
func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
}

View File

@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
}
for _, d := range data {
isVhostUserBlk := isVhostUserBlk(
isVhostUserBlk := IsVhostUserBlk(
config.DeviceInfo{
DevType: d.devType,
Major: d.major,

View File

@@ -123,6 +123,14 @@ const (
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
PCIeRootPort DeviceDriver = "pcie-root-port"
// PCIeSwitchUpstreamPort is a PCIe switch upstream port
// An upstream port connects to a PCIe Root Port
PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
// PCIeSwitchDownstreamPort is a PCIe switch downstream port
// PCIe devices can be hot-plugged to the downstream port.
PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
// Loader is the Loader device driver.
Loader DeviceDriver = "loader"
@@ -236,6 +244,7 @@ const (
// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
SecExecGuest ObjectType = "s390-pv-guest"
// PEFGuest represent ppc64le PEF(Protected Execution Facility) object.
PEFGuest ObjectType = "pef-guest"
)
@@ -410,7 +419,6 @@ func (object Object) QemuParams(config *Config) []string {
deviceParams = append(deviceParams, string(object.Driver))
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
}
if len(deviceParams) > 0 {
@@ -1722,6 +1730,106 @@ func (b PCIeRootPortDevice) Valid() bool {
return true
}
// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
type PCIeSwitchUpstreamPortDevice struct {
ID string // format: sup{n}, n>=0
Bus string // default is rp0
}
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
var qemuParams []string
var deviceParams []string
driver := PCIeSwitchUpstreamPort
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
if b.ID == "" {
return false
}
if b.Bus == "" {
return false
}
return true
}
// PCIeSwitchDownstreamPortDevice is a switch port to which PCIe devices can be hot-plugged; it connects to the switch's upstream port
type PCIeSwitchDownstreamPortDevice struct {
ID string // format: sup{n}, n>=0
Bus string // default is rp0
Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
Slot string // >=0, default is 0x00
// For this to work, QEMU patches are needed
BusReserve string
// Pref64 and Pref32 are not allowed to be set simultaneously
Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only*
IOReserve string // IO reservation
}
// QemuParams returns the qemu parameters built out of the PCIeSwitchDownstreamPortDevice.
func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
var qemuParams []string
var deviceParams []string
driver := PCIeSwitchDownstreamPort
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
if b.BusReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
}
if b.Pref64Reserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
}
if b.Pref32Reserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
}
if b.MemReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
}
if b.IOReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
}
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
// Valid returns true if the PCIeSwitchDownstreamPortDevice structure is valid and complete.
func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
if b.ID == "" {
return false
}
if b.Bus == "" {
return false
}
if b.Chassis == "" {
return false
}
if b.Slot == "" {
return false
}
return true
}
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
type VFIODevice struct {
// Bus-Device-Function of device
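
As a usage illustration for the two new switch-port device types (the IDs and bus names here are made up; the resulting -device arguments follow directly from the QemuParams implementations above):

package main

import (
	"fmt"

	govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)

func main() {
	up := govmmQemu.PCIeSwitchUpstreamPortDevice{ID: "swup0", Bus: "rp0"}
	down := govmmQemu.PCIeSwitchDownstreamPortDevice{
		ID:      "swdp0",
		Bus:     "swup0", // downstream ports hang off the switch upstream port
		Chassis: "0x00",
		Slot:    "0x00",
	}
	// Roughly:
	//   -device x3130-upstream,id=swup0,bus=rp0
	//   -device xio3130-downstream,id=swdp0,bus=swup0,chassis=0x00,slot=0x00
	fmt.Println(append(up.QemuParams(nil), down.QemuParams(nil)...))
}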

View File

@@ -5,7 +5,7 @@
package hypervisors
import "fmt"
import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
// Bridge is a bridge where devices can be hot plugged
type Bridge struct {
@@ -28,37 +28,8 @@ type CPUDevice struct {
ID string
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attach VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attach VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
return "root-port"
case SwitchPort:
return "switch-port"
case BridgePort:
return "bridge-port"
case NoPort:
return "no-port"
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
type HypervisorState struct {
BlockIndexMap map[int]struct{}
// Type of hypervisor, E.g. qemu/firecracker/acrn.
Type string
UUID string
@@ -74,7 +45,7 @@ type HypervisorState struct {
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
PCIeRootPort int
ColdPlugVFIO PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
}

View File

@@ -14,7 +14,7 @@ import (
"strconv"
"testing"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
)
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
JaegerUser string
JaegerPassword string
PFlash []string
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
DefaultVCPUCount uint32
DefaultMaxVCPUCount uint32
DefaultMemSize uint32
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `

View File

@@ -10,7 +10,7 @@
package katautils
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)
// name is the name of the runtime
@@ -82,7 +82,6 @@ const defaultEnableDebug bool = false
const defaultDisableNestingChecks bool = false
const defaultMsize9p uint32 = 8192
const defaultHotplugVFIOOnRootBus bool = false
const defaultPCIeRootPort = 0
const defaultEntropySource = "/dev/urandom"
const defaultGuestHookPath string = ""
const defaultVirtioFSCacheMode = "never"
@@ -115,4 +114,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
// Default config file used by stateless systems.
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
const defaultColdPlugVFIO = hv.NoPort
const defaultHotPlugVFIO = config.NoPort
const defaultColdPlugVFIO = config.NoPort

View File

@@ -20,7 +20,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -79,98 +78,98 @@ type factory struct {
}
type hypervisor struct {
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
SEVCertChainPath string `toml:"sev_cert_chain"`
BlockDeviceAIO string `toml:"block_device_aio"`
RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
GuestPreAttestation bool `toml:"guest_pre_attestation"`
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
SEVCertChainPath string `toml:"sev_cert_chain"`
BlockDeviceAIO string `toml:"block_device_aio"`
RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
GuestPreAttestation bool `toml:"guest_pre_attestation"`
}
type runtime struct {
@@ -298,12 +297,18 @@ func (h hypervisor) firmware() (string, error) {
return ResolvePath(p)
}
func (h hypervisor) coldPlugVFIO() hv.PCIePort {
func (h hypervisor) coldPlugVFIO() config.PCIePort {
if h.ColdPlugVFIO == "" {
return defaultColdPlugVFIO
}
return h.ColdPlugVFIO
}
func (h hypervisor) hotPlugVFIO() config.PCIePort {
if h.HotPlugVFIO == "" {
return defaultHotPlugVFIO
}
return h.HotPlugVFIO
}
func (h hypervisor) firmwareVolume() (string, error) {
p := h.FirmwareVolume
@@ -523,7 +528,7 @@ func (h hypervisor) blockDeviceAIO() (string, error) {
}
func (h hypervisor) sharedFS() (string, error) {
supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus}
supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS}
if h.SharedFS == "" {
return config.VirtioFS, nil
@@ -838,6 +843,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
KernelPath: kernel,
InitrdPath: initrd,
ImagePath: image,
RootfsType: rootfsType,
FirmwarePath: firmware,
FirmwareVolumePath: firmwareVolume,
PFlash: pflashes,
@@ -880,8 +886,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
HotPlugVFIO: h.hotPlugVFIO(),
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: h.DisableVhostNet,
EnableVhostUserStore: h.EnableVhostUserStore,
VhostUserStorePath: h.vhostUserStorePath(),
@@ -907,7 +913,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
SNPGuestPolicy: h.getSnpGuestPolicy(),
SEVCertChainPath: h.SEVCertChainPath,
DisableGuestSeLinux: h.DisableGuestSeLinux,
RootfsType: rootfsType,
}, nil
}
@@ -1034,11 +1039,12 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
return vc.HypervisorConfig{}, err
}
if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus {
return vc.HypervisorConfig{}, errors.New("clh only support virtio-fs or virtio-fs-nydus")
if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS {
return vc.HypervisorConfig{},
fmt.Errorf("Cloud Hypervisor does not support %s shared filesystem option", sharedFS)
}
if h.VirtioFSDaemon == "" {
if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" {
return vc.HypervisorConfig{},
fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
}
@@ -1084,7 +1090,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
HotPlugVFIO: h.hotPlugVFIO(),
DisableVhostNet: true,
GuestHookPath: h.guestHookPath(),
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
@@ -1302,6 +1308,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
KernelPath: defaultKernelPath,
ImagePath: defaultImagePath,
InitrdPath: defaultInitrdPath,
RootfsType: defaultRootfsType,
FirmwarePath: defaultFirmwarePath,
FirmwareVolumePath: defaultFirmwareVolumePath,
MachineAccelerators: defaultMachineAccelerators,
@@ -1330,9 +1337,10 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
PCIeRootPort: defaultPCIeRootPort,
HotPlugVFIO: defaultHotPlugVFIO,
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
VirtioFSCache: defaultVirtioFSCacheMode,
DisableImageNvdimm: defaultDisableImageNvdimm,
RxRateLimiterMaxRate: defaultRxRateLimiterMaxRate,
@@ -1352,8 +1360,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
SEVGuestPolicy: defaultSEVGuestPolicy,
SNPGuestPolicy: defaultSNPGuestPolicy,
SEVCertChainPath: defaultSEVCertChainPath,
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
RootfsType: defaultRootfsType,
}
}
@@ -1711,9 +1717,10 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
return err
}
@@ -1723,18 +1730,32 @@ func checkConfig(config oci.RuntimeConfig) error {
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for hot-plug or cold-plug:
// no-port, bridge-port, root-port, switch-port
func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
// Currently only QEMU q35 supports advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" {
return nil
}
if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
if coldPlug != config.NoPort && hotPlug != config.NoPort {
return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug)
}
if coldPlug == config.NoPort && hotPlug == config.NoPort {
return nil
}
var port config.PCIePort
if coldPlug != config.NoPort {
port = coldPlug
}
if hotPlug != config.NoPort {
port = hotPlug
}
if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
return nil
}
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
}
// checkNetNsConfig performs sanity checks on disable_new_netns config.
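
To make the new mutual-exclusion rule concrete, a few representative calls (an in-package sketch, since checkPCIeConfig is unexported; the expected results are noted in comments):

package katautils

import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"

func examplePCIeConfigChecks() {
	// valid: only cold-plug requests a port
	_ = checkPCIeConfig(config.RootPort, config.NoPort, "q35") // nil
	// valid: only hot-plug requests a port
	_ = checkPCIeConfig(config.NoPort, config.BridgePort, "q35") // nil
	// invalid: hot-plug and cold-plug cannot both be enabled
	_ = checkPCIeConfig(config.RootPort, config.RootPort, "q35") // error
	// machine types other than q35 skip the check entirely
	_ = checkPCIeConfig(config.RootPort, config.RootPort, "microvm") // nil
}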

View File

@@ -18,8 +18,8 @@ import (
"syscall"
"testing"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error {
// createAllRuntimeConfigFiles creates all files necessary to call
// loadConfiguration().
func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) {
func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) {
if dir == "" {
return config, fmt.Errorf("BUG: need directory")
return testConfig, fmt.Errorf("BUG: need directory")
}
if hypervisor == "" {
return config, fmt.Errorf("BUG: need hypervisor")
return testConfig, fmt.Errorf("BUG: need hypervisor")
}
var coldPlugVFIO hv.PCIePort
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
kernelParams := "foo=bar xyz"
@@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
blockDeviceAIO := "io_uring"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
hotPlugVFIO = config.NoPort
coldPlugVFIO = config.BridgePort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
@@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
BlockDeviceAIO: blockDeviceAIO,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: defaultVCPUCount,
@@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
configPath := path.Join(dir, "runtime.toml")
err = createConfig(configPath, runtimeConfigFileData)
if err != nil {
return config, err
return testConfig, err
}
configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml")
@@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create a link to the config file
err = syscall.Symlink(configPath, configPathLink)
if err != nil {
return config, err
return testConfig, err
}
files := []string{hypervisorPath, kernelPath, imagePath}
@@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create the resource (which must be >0 bytes)
err := WriteFile(file, "foo", testFileMode)
if err != nil {
return config, err
return testConfig, err
}
}
@@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
DefaultBridges: defaultBridgesCount,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
Msize9p: defaultMsize9p,
MemSlots: defaultMemSlots,
@@ -217,10 +218,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
err = SetKernelParams(&runtimeConfig)
if err != nil {
return config, err
return testConfig, err
}
config = testRuntimeConfig{
rtimeConfig := testRuntimeConfig{
RuntimeConfig: runtimeConfig,
RuntimeConfigFile: configPath,
ConfigPath: configPath,
@@ -229,7 +230,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
LogPath: logPath,
}
return config, nil
return rtimeConfig, nil
}
// testLoadConfiguration accepts an optional function that can be used
@@ -570,6 +571,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
SNPGuestPolicy: defaultSNPGuestPolicy,
HotPlugVFIO: defaultHotPlugVFIO,
ColdPlugVFIO: defaultColdPlugVFIO,
}
@@ -604,7 +606,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
func TestNewQemuHypervisorConfig(t *testing.T) {
dir := t.TempDir()
var coldPlugVFIO hv.PCIePort
var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
imagePath := path.Join(dir, "image")
@@ -612,8 +614,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
coldPlugVFIO = config.BridgePort
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
blockDeviceAIO := "io_uring"
defer func() {
@@ -632,7 +633,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
TxRateLimiterMaxRate: txRateLimiterMaxRate,
@@ -688,10 +688,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
}
if config.PCIeRootPort != pcieRootPort {
t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
}
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
}
@@ -814,7 +810,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
hypervisor := hypervisor{
Path: hypervisorPath,
@@ -825,7 +820,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
}
_, err := newQemuHypervisorConfig(hypervisor)

View File

@@ -460,6 +460,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil {
return err
}
if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok {
if value != "" {
config.HypervisorConfig.HypervisorMachineType = value
@@ -515,12 +519,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) {
config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
}); err != nil {
return err
}
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) {
return fmt.Errorf("entropy source %v required from annotation is not valid", value)
@@ -583,6 +581,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
return nil
}
func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) {
if value == "" {
return config.NoPort, nil
}
port := config.PCIePort(value)
if port.Invalid() {
return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value)
}
return port, nil
}
func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
var err error
if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok {
if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
return err
}
// If hot-plug is specified disable cold-plug and vice versa
sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort
}
if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok {
if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
return err
}
// If cold-plug is specified disable hot-plug and vice versa
sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort
}
return nil
}
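For context, a minimal standalone sketch (not part of the diff) of how these two overrides are expected to interact: whichever annotation is present wins and resets the other side to the no-port value, making hot-plug and cold-plug mutually exclusive. The full annotation keys and the port string values are assumptions; the authoritative definitions live in the runtime's annotations and device config packages.

package main

import "fmt"

// PCIePort mirrors the runtime's config.PCIePort for illustration only.
type PCIePort string

const (
	NoPort   PCIePort = "no-port"   // assumed string value
	RootPort PCIePort = "root-port" // assumed string value
)

// applyVFIOPlugAnnotations mimics addHypervisorHotColdPlugVfioOverrides:
// the annotation that is set wins and disables the other plug mode.
func applyVFIOPlugAnnotations(annotations map[string]string, hot, cold PCIePort) (PCIePort, PCIePort) {
	if v, ok := annotations["io.katacontainers.config.hypervisor.hot_plug_vfio"]; ok { // assumed full key
		hot, cold = PCIePort(v), NoPort
	}
	if v, ok := annotations["io.katacontainers.config.hypervisor.cold_plug_vfio"]; ok { // assumed full key
		cold, hot = PCIePort(v), NoPort
	}
	return hot, cold
}

func main() {
	hot, cold := applyVFIOPlugAnnotations(
		map[string]string{"io.katacontainers.config.hypervisor.cold_plug_vfio": "root-port"},
		NoPort, NoPort)
	fmt.Println(hot, cold) // Output: no-port root-port
}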
func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error {

View File

@@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) {
func TestAddHypervisorAnnotations(t *testing.T) {
assert := assert.New(t)
config := vc.SandboxConfig{
sbConfig := vc.SandboxConfig{
Annotations: make(map[string]string),
}
@@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"}
ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on"
addHypervisorConfigOverrides(ocispec, &config, runtimeConfig)
assert.Exactly(expectedHyperConfig, config.HypervisorConfig)
addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig)
assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
@@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi"
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true"
@@ -668,55 +669,58 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
addAnnotations(ocispec, &config, runtimeConfig)
assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1))
assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024))
assert.Equal(config.HypervisorConfig.MemSlots, uint32(20))
assert.Equal(config.HypervisorConfig.MemOffset, uint64(512))
assert.Equal(config.HypervisorConfig.VirtioMem, true)
assert.Equal(config.HypervisorConfig.MemPrealloc, true)
assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
assert.Equal(config.HypervisorConfig.HugePages, true)
assert.Equal(config.HypervisorConfig.IOMMU, true)
assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring")
assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true)
assert.Equal(config.HypervisorConfig.EnableIOThreads, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true)
assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs")
assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false")
assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto")
assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
assert.Equal(config.HypervisorConfig.Msize9p, uint32(512))
assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35")
assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw")
assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off")
assert.Equal(config.HypervisorConfig.DisableVhostNet, true)
assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
assert.Equal(config.HypervisorConfig.IOMMUPlatform, true)
assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864))
assert.Equal(config.HypervisorConfig.LegacySerial, true)
assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.NoError(err)
assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1))
assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024))
assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20))
assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512))
assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true)
assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true)
assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
assert.Equal(sbConfig.HypervisorConfig.HugePages, true)
assert.Equal(sbConfig.HypervisorConfig.IOMMU, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring")
assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true)
assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true)
assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs")
assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false")
assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto")
assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512))
assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35")
assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw")
assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off")
assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)
assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864))
assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
// In case an absurdly large value is provided, the config value is not overridden
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
err := addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1"
err = addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1"
err = addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"

View File

@@ -90,7 +90,7 @@ func (a *Acrn) Capabilities(ctx context.Context) types.Capabilities {
span, _ := katatrace.Trace(ctx, a.Logger(), "Capabilities", acrnTracingTags, map[string]string{"sandbox_id": a.id})
defer span.End()
return a.arch.capabilities()
return a.arch.capabilities(a.config)
}
func (a *Acrn) HypervisorConfig() HypervisorConfig {

View File

@@ -33,7 +33,7 @@ type acrnArch interface {
kernelParameters(debug bool) []Param
//capabilities returns the capabilities supported by acrn
capabilities() types.Capabilities
capabilities(config HypervisorConfig) types.Capabilities
// memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb
memoryTopology(memMb uint64) Memory
@@ -361,7 +361,7 @@ func (a *acrnArchBase) memoryTopology(memoryMb uint64) Memory {
return memory
}
func (a *acrnArchBase) capabilities() types.Capabilities {
func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities {
var caps types.Capabilities
caps.SetBlockDeviceSupport()

View File

@@ -83,8 +83,9 @@ func TestAcrnArchBaseKernelParameters(t *testing.T) {
func TestAcrnArchBaseCapabilities(t *testing.T) {
assert := assert.New(t)
acrnArchBase := newAcrnArchBase()
config := HypervisorConfig{}
c := acrnArchBase.capabilities()
c := acrnArchBase.capabilities(config)
assert.True(c.IsBlockDeviceSupported())
assert.True(c.IsBlockDeviceHotplugSupported())
assert.False(c.IsFsSharingSupported())

View File

@@ -349,6 +349,10 @@ func (clh *cloudHypervisor) createVirtiofsDaemon(sharedPath string) (VirtiofsDae
}
func (clh *cloudHypervisor) setupVirtiofsDaemon(ctx context.Context) error {
if clh.config.SharedFS == config.NoSharedFS {
return nil
}
if clh.config.SharedFS == config.Virtio9P {
return errors.New("cloud-hypervisor only supports virtio based file sharing")
}
@@ -860,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
defer cancel()
// Create the clh device config via the constructor to ensure default values are properly assigned
clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
if err != nil {
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
}
clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
clh.devicesIds[device.ID] = pciInfo.GetId()
// clh doesn't use bridges, so the PCI path is simply the slot
// number of the device. This will break if clh starts using
@@ -882,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}
guestPciPath, err := types.PciPathFromString(tokens[0])
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
if device.Type == config.VFIOAPDeviceMediatedType {
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
}
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
device.GuestPciPath, err = types.PciPathFromString(tokens[0])
return err
}
@@ -933,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
case BlockDev:
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
case VfioDev:
deviceID = *devInfo.(config.VFIODev).GetID()
deviceID = devInfo.(*config.VFIODev).ID
default:
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")
@@ -1210,7 +1211,9 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities
clh.Logger().WithField("function", "Capabilities").Info("get Capabilities")
var caps types.Capabilities
caps.SetFsSharingSupport()
if clh.config.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
caps.SetBlockDeviceHotplugSupport()
return caps
}

View File

@@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
assert.NoError(err, "Hotplug remove block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
assert.NoError(err, "Hotplug remove vfio block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)
@@ -726,3 +726,30 @@ func TestClhSetConfig(t *testing.T) {
assert.Equal(clh.config, config)
}
func TestClhCapabilities(t *testing.T) {
assert := assert.New(t)
hConfig, err := newClhConfig()
assert.NoError(err)
clh := &cloudHypervisor{}
assert.Equal(clh.config, HypervisorConfig{})
hConfig.SharedFS = config.VirtioFS
err = clh.setConfig(&hConfig)
assert.NoError(err)
var ctx context.Context
c := clh.Capabilities(ctx)
assert.True(c.IsFsSharingSupported())
hConfig.SharedFS = config.NoSharedFS
err = clh.setConfig(&hConfig)
assert.NoError(err)
c = clh.Capabilities(ctx)
assert.False(c.IsFsSharingSupported())
}

View File

@@ -288,12 +288,12 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root port, switch, bridge or no port
HotPlugVFIO hv.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
// root port, switch, bridge or no port
ColdPlugVFIO hv.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM

View File

@@ -389,7 +389,6 @@ type HypervisorConfig struct {
Gid uint32
SEVGuestPolicy uint32
SNPGuestPolicy uint64
PCIeRootPort uint32
NumVCPUs uint32
RemoteHypervisorTimeout uint32
IOMMUPlatform bool
@@ -420,7 +419,10 @@ type HypervisorConfig struct {
DisableSeLinux bool
DisableGuestSeLinux bool
LegacySerial bool
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
VFIODevices []config.DeviceInfo
VhostUserBlkDevices []config.DeviceInfo
}
// vcpu mapping from vcpu number to thread number

View File

@@ -21,6 +21,7 @@ import (
"github.com/docker/go-units"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
@@ -1148,7 +1149,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
ContainerPath: dev.ContainerPath,
Type: kataVfioPciDevType,
Id: groupNum,
Options: nil,
Options: make([]string, len(devList)),
}
// We always pass the device information to the agent, since
@@ -1158,16 +1159,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
kataDevice.Type = kataVfioPciGuestKernelDevType
}
for i, dev := range devList {
if dev.Type == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = dev.APDevices
} else {
if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
} else {
kataDevice.Options = make([]string, len(devList))
for i, device := range devList {
pciDevice := (*device).(config.VFIOPCIDev)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
devBDF := drivers.GetBDF(dev.BDF)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath)
}
}
return kataDevice
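As a side note on the Options slice built above: each entry maps a host BDF to the guest PCI path using the "0000:%s=%s" template from the diff. A tiny sketch with made-up values follows; the exact output of drivers.GetBDF is an assumption here.

package main

import "fmt"

func main() {
	// Hypothetical values: GetBDF is assumed to return the BDF without the
	// PCI domain, and GuestPciPath is the slash-separated slot path in the guest.
	devBDF := "3b:00.0"
	guestPciPath := "01/02"

	option := fmt.Sprintf("0000:%s=%s", devBDF, guestPciPath)
	fmt.Println(option) // Output: 0000:3b:00.0=01/02
}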
@@ -1354,7 +1355,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
if _, err = k.sendReq(ctx, req); err != nil {
return nil, err
}
return buildProcessFromExecID(req.ExecId)
}

View File

@@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
HotPlugVFIO: hconf.HotPlugVFIO,
ColdPlugVFIO: hconf.ColdPlugVFIO,
PCIeRootPort: hconf.PCIeRootPort,
BootToBeTemplate: hconf.BootToBeTemplate,
BootFromTemplate: hconf.BootFromTemplate,
DisableVhostNet: hconf.DisableVhostNet,

View File

@@ -7,7 +7,7 @@
package persistapi
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runc/libcontainer/configs"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
@@ -131,10 +131,6 @@ type HypervisorConfig struct {
// Enable SGX. Hardware-based isolation and memory encryption.
SGXEPCSize int64
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32
@@ -199,9 +195,13 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root, switch, bridge or no-port
HotPlugVFIO config.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port or a switch or no-port
ColdPlugVFIO hv.PCIePort
// root, bridge, switch or no-port
ColdPlugVFIO config.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool

View File

@@ -143,9 +143,11 @@ const (
// root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"
// HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged.
HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio"
// EntropySource is a sandbox annotation to specify the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device)

View File

@@ -66,6 +66,11 @@ const romFile = ""
// Default value is false.
const defaultDisableModern = false
// A PCIe topology deeper than 5 levels is already not advisable; just for the
// sake of having enough buffer we limit ourselves to 10 and exit if we reach
// the root bus
const maxPCIeTopoDepth = 10
type qmpChannel struct {
qmp *govmmQemu.QMP
ctx context.Context
@@ -76,15 +81,15 @@ type qmpChannel struct {
// QemuState keeps Qemu's state
type QemuState struct {
UUID string
Bridges []types.Bridge
// HotpluggedCPUs is the list of CPUs that were hot-added
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
PCIeRootPort int
HotplugVFIOOnRootBus bool
ColdPlugVFIO hv.PCIePort
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -207,7 +212,7 @@ func (q *qemu) Capabilities(ctx context.Context) types.Capabilities {
span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
return q.arch.capabilities()
return q.arch.capabilities(q.config)
}
func (q *qemu) HypervisorConfig() HypervisorConfig {
@@ -278,10 +283,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.Logger().Debug("Creating UUID")
q.state.UUID = uuid.Generate().String()
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
q.state.HotPlugVFIO = q.config.HotPlugVFIO
// The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet.
@@ -727,27 +732,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
}
}
// Add PCIe Root Port devices to hypervisor
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
if hypervisorConfig.PCIeRootPort > 0 {
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
}
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
return err
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
q.qemuConfig = qemuConfig
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
@@ -773,6 +763,101 @@ func (q *qemu) checkBpfEnabled() {
}
}
// If a user uses 8 GPUs with 4 devices in each IOMMU Group, that means we need
// to hotplug 32 devices. We do not have enough PCIe root bus slots to
// accomplish this task; Kata will already use some slots for vfio-xxxx-pci
// devices.
// Max PCI slots per root bus is 32
// Max PCIe root ports is 16
// Max PCIe switch ports is 16
// There is only 64kB of IO memory; each root or switch port consumes 4kB, hence
// only 16 ports are possible.
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
// If no-port is set, just return; there is no need to add a PCIe Root Port or PCIe Switches
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
return nil
}
// Add PCIe Root Port or PCIe Switches to the hypervisor
// The pcie.0 bus does not support hot-plug, but PCIe devices can be hot-plugged
// into a PCIe Root Port or PCIe Switch.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
// Deduce the right values for mem-reserve and pref-64-reserve memory regions
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
// Get the number of hot(cold)-pluggable ports needed from the provided
// VFIO devices and VhostUserBlockDevices
var numOfPluggablePorts uint32 = 0
for _, dev := range hypervisorConfig.VFIODevices {
var err error
dev.HostPath, err = config.GetHostPath(dev, false, "")
if err != nil {
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
}
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
if err != nil {
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
}
for _, vfioDevice := range devicesPerIOMMUGroup {
if drivers.IsPCIeDevice(vfioDevice.BDF) {
numOfPluggablePorts = numOfPluggablePorts + 1
}
}
}
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
// If the number of PCIe root ports is > 16, bail out; otherwise we may
// use up all slots or IO memory on the root bus, and the vfio-XXX-pci devices
// that are crucial for Kata could not be added. Max slots on the root bus is 32;
// max slots on the complete PCI(e) topology is 256 in QEMU.
if vfioOnRootPort {
// On Arm the vhost-user-block device is a PCIe device; we need
// to account for it in the number of pluggable ports
if machineType == QemuVirt {
numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
}
if numOfPluggablePorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
return nil
}
if vfioOnSwitchPort {
// On Arm the vhost-user-block device is a PCIe device; we need
// to account for it in the number of pluggable ports
if machineType == QemuVirt {
numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
if numOfPluggableRootPorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
}
if numOfPluggablePorts > maxPCIeSwitchPort {
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
}
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
return nil
}
return nil
}
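A small worked example of the port-budget arithmetic from the comment above createPCIeTopology, using illustrative values only: roughly 64kB of IO space on the root bus divided by about 4kB per port yields the 16-port ceiling, so 8 GPUs with 4 VFIO devices per IOMMU group (32 pluggable devices) cannot each receive a dedicated port and such a request would be rejected.

package main

import "fmt"

func main() {
	const ioSpaceKB, perPortKB = 64, 4
	maxPorts := ioSpaceKB / perPortKB // 16, matching the maxPCIeRootPort/maxPCIeSwitchPort limits

	gpus, devsPerGroup := 8, 4
	needed := gpus * devsPerGroup // 32 hot-pluggable PCIe devices

	fmt.Printf("ports available: %d, ports needed: %d, fits: %v\n",
		maxPorts, needed, needed <= maxPorts) // fits: false
}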
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
}
@@ -1612,6 +1697,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
}
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
if err != nil {
return err
@@ -1629,18 +1715,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
switch machineType {
case QemuVirt:
if q.state.PCIeRootPort <= 0 {
return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
}
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
//Since the dev is the first and only one on this bus(root port), it should be 0.
addr := "00"
bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs))
drivers.AllPCIeDevs[devID] = true
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
config.PCIeDevices[config.RootPort][devID] = true
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId)
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
if err != nil {
return err
@@ -1656,7 +1738,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
return err
}
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil {
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
return err
}
@@ -1770,41 +1852,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
// XXX: For now we assume there's exactly one bridge, since
// that's always how we configure qemu from Kata for now. It
// would be good to generalize this to different PCI
// topologies
var slots []types.PciSlot
devSlot, err := q.qomGetSlot(qemuID)
if err != nil {
return types.PciPath{}, err
}
slots = append(slots, devSlot)
busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus")
// This only works for Q35 and Virt
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
var parentPath = qemuID
// We do not want to use a forever loop here; a PCIe topology deeper
// than 5 is already not advisable. Just for the sake of having enough
// buffer we limit ourselves to 10 and leave the loop early if we hit
// the root bus.
for i := 1; i <= maxPCIeTopoDepth; i++ {
parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
if err != nil {
return types.PciPath{}, err
}
busQOM, ok := parenBusQOM.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
}
// If we hit /machine/q35/pcie.0 we're done: this is the root bus and
// we have climbed the complete hierarchy
if r.Match([]byte(busQOM)) {
break
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI parent_bus which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(busQOM, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
}
parentBus := busQOM[:idx]
parentSlot, err := q.qomGetSlot(parentBus)
if err != nil {
return types.PciPath{}, err
}
// Prepend the slots, since we're climbing the hierarchy
slots = append([]types.PciSlot{parentSlot}, slots...)
parentPath = parentBus
}
return types.PciPathFromSlots(slots...)
}
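A minimal sketch (with assumed slot values) of what the climbing loop in qomGetPciPath produces: each level of the hierarchy contributes one slot, and parent slots are prepended while climbing towards pcie.0, so the resulting path reads from the port closest to the root bus down to the device.

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Device slot discovered first, then parents while climbing towards pcie.0.
	slots := []string{"02"}                       // device behind a downstream port
	for _, parent := range []string{"00", "01"} { // downstream port, then root port
		slots = append([]string{parent}, slots...) // prepend, as the loop above does
	}
	fmt.Println(strings.Join(slots, "/")) // Output: 01/00/02
}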
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
if err != nil {
return types.PciPath{}, err
return err
}
bus, ok := busq.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq)
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(device.ID)
}
}()
return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI bridge which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(bus, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus)
func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
switch device.Type {
case config.VFIOPCIDeviceNormalType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile)
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
bridge := bus[:idx]
}
bridgeSlot, err := q.qomGetSlot(bridge)
if err != nil {
return types.PciPath{}, err
func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
switch device.Type {
case config.VFIOPCIDeviceNormalType:
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile)
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
return types.PciPathFromSlots(bridgeSlot, devSlot)
}
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
@@ -1812,109 +1961,53 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
return err
}
devID := *(*device).GetID()
machineType := q.HypervisorConfig().HypervisorMachineType
if op == AddDevice {
buf, _ := json.Marshal(device)
q.Logger().WithFields(logrus.Fields{
"machine-type": machineType,
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
"pcie-root-port": q.state.PCIeRootPort,
"device-info": string(buf),
"machine-type": q.HypervisorConfig().HypervisorMachineType,
"hot-plug-vfio": q.state.HotPlugVFIO,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotplugVFIOOnRootBus {
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
switch machineType {
case QemuQ35:
if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
pciDevice.Bus = ""
}
default:
pciDevice.Bus = ""
}
*device = pciDevice
if pciDevice.Type == config.VFIOPCIDeviceNormalType {
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
} else {
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
}
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
}
// In case MachineType is q35, a PCIe device is hotplugged on
// a PCIe Root Port or alternatively on a PCIe Switch Port
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
device.Bus = ""
} else {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
var err error
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for the pc machine type instead of a bridge. This is useful for devices that require
// a large PCI BAR, which is currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(devID)
}
}()
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
case config.VFIOPCIDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
}
if err != nil {
return err
}
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
guestPciPath, err := q.qomGetPciPath(devID)
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
return err
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and/or the device. For simplicity, just
// query both of them back from qemu
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
return err
} else {
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(devID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID)
}
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
@@ -2612,7 +2705,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
for i := uint32(0); i < number; i++ {
devices = append(devices,
govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
Bus: bus,
Chassis: chassis,
Slot: strconv.FormatUint(uint64(i), 10),
@@ -2626,6 +2719,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
return devices
}
// golangci-lint enforces multi-line comments to be a block comment,
// not multiple single-line comments ...
/* pcie.0 bus
// -------------------------------------------------
// |
// -------------
// | Root Port |
// -------------
// -------------------------|------------------------
// | ----------------- |
// | PCI Express | Upstream Port | |
// | Switch ----------------- |
// | | | |
// | ------------------- ------------------- |
// | | Downstream Port | | Downstream Port | |
// | ------------------- ------------------- |
// -------------|-----------------------|------------
// ------------- --------------
// | GPU/ACCEL | | IB/ETH NIC |
// ------------- --------------
*/
// genericAppendPCIeSwitchPort adds a PCIe Switch
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
// Q35, Virt have the correct PCIe support,
// hence ignore all other machines
if machineType != QemuQ35 && machineType != QemuVirt {
return devices
}
// Use a dedicated ID for this root port so we do not clash with already
// existing root ports, adding an "s" prefix for switch
pcieRootPort := govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
Bus: defaultBridgeBus,
Chassis: "1",
Slot: strconv.FormatUint(uint64(0), 10),
Multifunction: false,
Addr: "0",
MemReserve: fmt.Sprintf("%dB", memSize32bit),
Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
}
devices = append(devices, pcieRootPort)
pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
Bus: pcieRootPort.ID,
}
devices = append(devices, pcieSwitchUpstreamPort)
currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
if err != nil {
return devices
}
nextChassis := currentChassis + 1
for i := uint32(0); i < number; i++ {
pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
Bus: pcieSwitchUpstreamPort.ID,
Chassis: fmt.Sprintf("%d", nextChassis),
Slot: strconv.FormatUint(uint64(i), 10),
// TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
}
devices = append(devices, pcieSwitchDownstreamPort)
}
return devices
}
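To make the wiring above concrete, a hedged sketch of the parent/child relationships genericAppendPCIeSwitchPort builds: one extra root port carries the switch upstream port, and every requested hot-pluggable port becomes a downstream port on that upstream port. The ID strings below are assumptions standing in for the config package prefix constants.

package main

import "fmt"

func main() {
	const (
		rootPortID       = "srp0"  // assumed: switch prefix + root port prefix + index
		upstreamID       = "swup0" // assumed upstream port prefix + index
		downstreamPrefix = "swdp"  // assumed downstream port prefix
	)

	fmt.Println("pcie.0 ->", rootPortID, "->", upstreamID)
	for i := 0; i < 3; i++ { // three requested pluggable ports
		fmt.Printf("%s -> %s%d\n", upstreamID, downstreamPrefix, i)
	}
}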
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
@@ -2801,7 +2967,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
s.PCIeRootPort = q.state.PCIeRootPort
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, hv.Bridge{
@@ -2825,7 +2990,6 @@ func (q *qemu) Load(s hv.HypervisorState) {
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
q.state.PCIeRootPort = s.PCIeRootPort
for _, bridge := range s.Bridges {
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))

View File

@@ -26,6 +26,7 @@ import (
"google.golang.org/grpc/credentials/insecure"
"github.com/intel-go/cpuid"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)
@@ -182,7 +183,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
return q, nil
}
func (q *qemuAmd64) capabilities() types.Capabilities {
func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
if q.qemuMachine.Type == QemuQ35 ||
@@ -191,7 +192,9 @@ func (q *qemuAmd64) capabilities() types.Capabilities {
}
caps.SetMultiQueueSupport()
caps.SetFsSharingSupport()
if hConfig.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
return caps
}
@@ -323,6 +326,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
ReducedPhysBits: 1,
}), "", nil
case noneProtection:
return devices, firmware, nil
default:

View File

@@ -42,13 +42,14 @@ func TestQemuAmd64BadMachineType(t *testing.T) {
func TestQemuAmd64Capabilities(t *testing.T) {
assert := assert.New(t)
config := HypervisorConfig{}
amd64 := newTestQemu(assert, QemuQ35)
caps := amd64.capabilities()
caps := amd64.capabilities(config)
assert.True(caps.IsBlockDeviceHotplugSupported())
amd64 = newTestQemu(assert, QemuMicrovm)
caps = amd64.capabilities()
caps = amd64.capabilities(config)
assert.False(caps.IsBlockDeviceHotplugSupported())
}

View File

@@ -61,7 +61,7 @@ type qemuArch interface {
kernelParameters(debug bool) []Param
//capabilities returns the capabilities supported by QEMU
capabilities() types.Capabilities
capabilities(config HypervisorConfig) types.Capabilities
// bridges sets the number bridges for the machine type
bridges(number uint32)
@@ -150,6 +150,9 @@ type qemuArch interface {
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
// appendPCIeSwitchPortDevice appends a PCIe Switch (an upstream port plus downstream ports behind a dedicated root port)
appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
// append vIOMMU device
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
@@ -204,7 +207,8 @@ const (
defaultBridgeBus = "pcie.0"
defaultPCBridgeBus = "pci.0"
maxDevIDSize = 31
pcieRootPortPrefix = "rp"
maxPCIeRootPort = 16 // Limitation from QEMU
maxPCIeSwitchPort = 16 // Limitation from QEMU
)
// This is the PCI start address assigned to the first bridge that
@@ -313,11 +317,13 @@ func (q *qemuArchBase) kernelParameters(debug bool) []Param {
return params
}
func (q *qemuArchBase) capabilities() types.Capabilities {
func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
caps.SetBlockDeviceHotplugSupport()
caps.SetMultiQueueSupport()
caps.SetFsSharingSupport()
if hConfig.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
return caps
}
@@ -708,17 +714,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
}
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
pciDevice := vfioDev.(config.VFIOPCIDev)
if pciDevice.BDF == "" {
if vfioDev.BDF == "" {
return devices
}
devices = append(devices,
govmmQemu.VFIODevice{
BDF: pciDevice.BDF,
VendorID: pciDevice.VendorID,
DeviceID: pciDevice.DeviceID,
Bus: pciDevice.Bus,
BDF: vfioDev.BDF,
VendorID: vfioDev.VendorID,
DeviceID: vfioDev.DeviceID,
Bus: vfioDev.Bus,
},
)
@@ -834,6 +840,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
// appendPCIeSwitchPortDevice appends a PCIe Switch with <number> ports
func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the
// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs.
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
pci := nvpci.New()

View File

@@ -117,9 +117,16 @@ func TestQemuArchBaseKernelParameters(t *testing.T) {
func TestQemuArchBaseCapabilities(t *testing.T) {
assert := assert.New(t)
qemuArchBase := newQemuArchBase()
hConfig := HypervisorConfig{}
hConfig.SharedFS = config.VirtioFS
c := qemuArchBase.capabilities()
c := qemuArchBase.capabilities(hConfig)
assert.True(c.IsBlockDeviceHotplugSupported())
assert.True(c.IsFsSharingSupported())
hConfig.SharedFS = config.NoSharedFS
c = qemuArchBase.capabilities(hConfig)
assert.False(c.IsFsSharingSupported())
}
func TestQemuArchBaseBridges(t *testing.T) {
@@ -463,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
},
}
vfDevice := config.VFIOPCIDev{
vfDevice := config.VFIODev{
BDF: bdf,
}
@@ -483,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
},
}
vfDevice := config.VFIOPCIDev{
vfDevice := config.VFIODev{
BDF: bdf,
VendorID: vendorID,
DeviceID: deviceID,

View File

@@ -11,6 +11,7 @@ import (
"fmt"
"time"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/sirupsen/logrus"
@@ -97,7 +98,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
return q, nil
}
func (q *qemuPPC64le) capabilities() types.Capabilities {
func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
// pseries machine type supports hotplugging drives
@@ -106,7 +107,9 @@ func (q *qemuPPC64le) capabilities() types.Capabilities {
}
caps.SetMultiQueueSupport()
caps.SetFsSharingSupport()
if hConfig.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
return caps
}

View File

@@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
config6 := newQemuConfig()
config6.DisableGuestSeLinux = false
config7 := newQemuConfig()
config7.PCIeRootPort = 1
config8 := newQemuConfig()
config8.EnableVhostUserStore = true
config8.HugePages = true
@@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
{config3, false, true},
{config5, false, true},
{config6, false, false},
{config7, false, true},
{config8, false, true},
{config9, true, false},
{config10, false, true},

View File

@@ -36,7 +36,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
@@ -106,15 +105,10 @@ type HypervisorPidKey struct{}
// SandboxStatus describes a sandbox status.
type SandboxStatus struct {
ContainersStatus []ContainerStatus
// Annotations allow clients to store arbitrary values,
// for example to add additional status values required
// to support particular specifications.
Annotations map[string]string
Annotations map[string]string
ID string
Hypervisor HypervisorType
ContainersStatus []ContainerStatus
State types.SandboxState
HypervisorConfig HypervisorConfig
}
@@ -530,6 +524,7 @@ func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Fac
return s, nil
}
//nolint:gocyclo
func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) {
span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
defer span.End()
@@ -630,22 +625,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
var devs []config.DeviceInfo
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
// Aggregate all the container devices for hot-plug and use them to deduce
// the correct number of ports to reserve for the hypervisor.
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
var vfioDevices []config.DeviceInfo
// vhost-user-block device is a PCIe device in Virt, keep track of it
// for correct number of PCIe root ports.
var vhostUserBlkDevices []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
if deviceManager.IsVhostUserBlk(device) {
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
continue
}
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
if hotPlugVFIO && isVFIO {
vfioDevices = append(vfioDevices, device)
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
}
if coldPlugVFIO && isVFIO {
device.ColdPlug = true
devs = append(devs, device)
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
infos := sandboxConfig.Containers[cnt].DeviceInfos
infos = append(infos[:dev], infos[dev+1:]...)
sandboxConfig.Containers[cnt].DeviceInfos = infos
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
var filteredDevices []config.DeviceInfo
for _, device := range containers.DeviceInfos {
if device.ID != "remove-we-are-cold-plugging" {
filteredDevices = append(filteredDevices, device)
}
}
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
}
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
@@ -660,7 +682,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
for _, dev := range devs {
for _, dev := range vfioDevices {
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
@@ -1723,7 +1745,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
defer span.End()
for i := range s.config.Containers {
c, err := newContainer(ctx, s, &s.config.Containers[i])
if err != nil {
return err
@@ -1742,7 +1763,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.updateResources(ctx); err != nil {
return err
}
if err := s.resourceControllerUpdate(ctx); err != nil {
return err
}
@@ -1754,7 +1774,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.storeSandbox(ctx); err != nil {
return err
}
return nil
}
@@ -1918,15 +1937,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
// adding a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
}).WithError(err).Error("failed to hotplug VFIO device")
return err
}
@@ -1941,6 +1956,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
return err
case config.VhostUserBlk:
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
if !ok {
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
}
@@ -1975,15 +1991,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
// remove a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().WithError(err).
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
}).Error("failed to hot unplug VFIO device")
return err
}

View File

@@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
_, err = os.Create(deviceFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
}()
dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil)

View File

@@ -240,3 +240,17 @@ restart_containerd_service() {
clean_env_ctr
return 0
}
# @path_results: path to the input metric-results folder
# @tarball_fname: path and filename to the output tarball
function compress_metrics_results_dir()
{
local path_results="${1:-results}"
local tarball_fname="${2:-}"
[ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect."
[ ! -d "${path_results}" ] && die "Missing path to the results folder."
cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd -
info "tarball generated: ${tarball_fname}"
}
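# Example usage, mirroring how the metrics runner packs up its results
# (the concrete paths below are illustrative, not values taken from CI):
#
#   compress_metrics_results_dir "${metrics_dir}/results" "/tmp/results-qemu.tar.gz"
#
# i.e. every *.json file under the results directory is packed into the given
# tarball and its location is reported via info().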

View File

@@ -31,13 +31,16 @@ function login_azure() {
}
function create_cluster() {
# First, ensure that the cluster didn't fail to get cleaned up from a previous run.
delete_cluster || true
az aks create \
-g "kataCI" \
-n "$(_print_cluster_name)" \
-s "Standard_D4s_v5" \
--node-count 1 \
--generate-ssh-keys \
$([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku mariner --workload-runtime KataMshvVmIsolation")
$([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku AzureLinux --workload-runtime KataMshvVmIsolation")
}
function install_bats() {
@@ -55,10 +58,28 @@ function get_cluster_credentials() {
-n "$(_print_cluster_name)"
}
function ensure_yq() {
: "${GOPATH:=${GITHUB_WORKSPACE}}"
export GOPATH
export PATH="${GOPATH}/bin:${PATH}"
INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh"
}
function run_tests() {
platform="${1}"
ensure_yq
# Ensure we're in the default namespace
kubectl config set-context --current --namespace=default
# Delete any spurious tests namespace that was left behind
kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then
yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS"
yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}"
fi
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image"
@@ -80,6 +101,10 @@ function run_tests() {
sleep 60s
fi
# Create a new namespace for the tests and switch to it
kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml
kubectl config set-context --current --namespace=kata-containers-k8s-tests
pushd "${integration_dir}/kubernetes"
bash setup.sh
bash run_kubernetes_tests.sh
@@ -89,6 +114,10 @@ function run_tests() {
function cleanup() {
platform="${1}"
# Switch back to the default namespace and delete the tests one
kubectl config set-context --current --namespace=default
kubectl delete namespace kata-containers-k8s-tests
if [ "${platform}" = "tdx" ]; then
deploy_spec="-k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s""
cleanup_spec="-k "${tools_dir}/packaging/kata-deploy/kata-cleanup/overlays/k3s""
@@ -115,11 +144,12 @@ function delete_cluster() {
az aks delete \
-g "kataCI" \
-n "$(_print_cluster_name)" \
--yes \
--no-wait
--yes
}
function main() {
export KATA_HOST_OS="${KATA_HOST_OS:-}"
action="${1:-}"
case "${action}" in

View File

@@ -14,13 +14,12 @@ setup() {
@test "Pod quota" {
resource_name="pod-quota"
deployment_name="deploymenttest"
namespace="test-quota-ns"
# Create the resourcequota
kubectl create -f "${pod_config_dir}/resource-quota.yaml"
# View information about resourcequota
kubectl get -n "$namespace" resourcequota "$resource_name" \
kubectl get resourcequota "$resource_name" \
--output=yaml | grep 'pods: "2"'
# Create deployment
@@ -28,10 +27,9 @@ setup() {
# View deployment
kubectl wait --for=condition=Available --timeout=$timeout \
-n "$namespace" deployment/${deployment_name}
deployment/${deployment_name}
}
teardown() {
kubectl delete -n "$namespace" deployment "$deployment_name"
kubectl delete -f "${pod_config_dir}/resource-quota.yaml"
}

View File

@@ -54,10 +54,6 @@ else
)
fi
if [ ${KATA_HOST_OS} == "cbl-mariner" ]; then
exit 0
fi
# we may need to skip a few test cases when running on non-x86_64 arch
arch_config_file="${kubernetes_dir}/filter_out_per_arch/${TARGET_ARCH}.yaml"
if [ -f "${arch_config_file}" ]; then

View File

@@ -6,7 +6,6 @@
apiVersion: v1
kind: Pod
metadata:
namespace: default
name: custom-dns-test
spec:
terminationGracePeriodSeconds: 0

View File

@@ -8,7 +8,6 @@ apiVersion: v1
kind: Pod
metadata:
name: pod-oom
namespace: default
spec:
runtimeClassName: kata
restartPolicy: Never

View File

@@ -7,7 +7,6 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: deploymenttest
namespace: test-quota-ns
spec:
selector:
matchLabels:

View File

@@ -14,7 +14,6 @@ items:
kind: ResourceQuota
metadata:
name: pod-quota
namespace: test-quota-ns
spec:
hard:
pods: "2"

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: kata-containers-k8s-tests

View File

@@ -13,8 +13,24 @@ set_runtime_class() {
sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml
}
set_kernel_path() {
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin"
find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \;
fi
}
set_initrd_path() {
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img"
find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \;
fi
}
main() {
set_runtime_class
set_kernel_path
set_initrd_path
}
main "$@"

View File

@@ -55,6 +55,8 @@ For further details see the [time tests documentation](time).
Tests that measure the size and overheads of the runtime. Generally this is looking at
memory footprint sizes, but could also cover disk space or even CPU consumption.
For further details see the [density tests documentation](density).
### Networking
Tests relating to networking. General items could include:

View File

@@ -0,0 +1,34 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# This file contains baseline expectations
# for checked results by checkmetrics tool.
#
# values set specifically for packet.com c1.small worker.
[[metric]]
name = "boot-times"
type = "json"
description = "measure container lifecycle timings"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
checktype = "mean"
midval = 0.42
minpercent = 20.0
maxpercent = 20.0
[[metric]]
name = "memory-footprint"
type = "json"
description = "measure memory usage"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
checktype = "mean"
midval = 2518364.00
minpercent = 20.0
maxpercent = 20.0
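# For reference (a sketch of how the window above is expected to be applied):
# with midval = 0.42 and min/max percent = 20.0, the accepted mean for
# "boot-times" falls in the inclusive range
#   [0.42 - 0.42*0.20, 0.42 + 0.42*0.20] = [0.336, 0.504] seconds,
# and for "memory-footprint" in [2014691.20, 3022036.80] KB.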

View File

@@ -0,0 +1,34 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# This file contains baseline expectations
# for checked results by checkmetrics tool.
#
# values set specifically for Equinix m3.small.x86.
[[metric]]
name = "boot-times"
type = "json"
description = "measure container lifecycle timings"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
checktype = "mean"
midval = 0.61
minpercent = 20.0
maxpercent = 20.0
[[metric]]
name = "memory-footprint"
type = "json"
description = "measure memory usage"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
checktype = "mean"
midval = 2435844.00
minpercent = 20.0
maxpercent = 20.0

View File

@@ -0,0 +1,53 @@
# Kata Containers density metrics tests
This directory contains a number of tests to help measure container
memory footprint. Some measures are based around the
[PSS](https://en.wikipedia.org/wiki/Proportional_set_size) of the runtime
components, and others look at the system level (`free` and `/proc/meminfo`
for instance) impact.
## `memory_usage`
This test measures the PSS footprint of the runtime components whilst
launching a number of small ([BusyBox](https://hub.docker.com/_/busybox/)) containers
using ctr.
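For a quick local run, the invocation below mirrors how the CI metrics runner
drives this test (the script path and the `auto` KSM-settle flag come from the
scripts in this tree; the container count and wait time are illustrative):
```
$ sudo -E bash tests/metrics/density/memory_usage.sh 20 300 auto
```
The first argument is the number of containers to launch, the second is the
settle/wait time in seconds, and the optional `auto` flag makes the test wait
for KSM `pages_shared` to stabilise instead of sleeping the full period
(`auto` requires KSM to be enabled on the host).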
## `fast_footprint`
This test takes system level resource measurements after launching a number of
containers in parallel and optionally waiting for KSM to settle its memory
compaction cycles.
The script is quite configurable via environment variables, including:
* Which container workload to run.
* How many containers to launch.
* How many containers are launched in parallel.
* How long to wait until taking the measures.
See the script itself for more details.
This test shares many config options with the `footprint_data` test. Thus, referring
to the [footprint test documentation](footprint_data.md) may be useful.
> *Note:* If this test finds KSM is enabled on the host, it will wait for KSM
> to "settle" before taking the final measurement. If your KSM is not configured
> to process all the allocated VM memory fast enough, the test will hit a timeout
> and proceed to take the final measurement anyway.
## `footprint_data`
Similar to the `fast_footprint` test, but this test launches the containers
sequentially and takes a system level measurement between each launch. Thus,
this test provides finer grained information on system scaling, but takes
significantly longer to run than the `fast_footprint` test. If you are only
interested in the final figure or the average impact, you may be better running
the `fast_footprint` test.
For more details see the [footprint test documentation](footprint_data.md).
## `memory_usage_inside_container`
Measures the memory statistics *inside* the container. This allows evaluation of
the overhead the VM kernel and rootfs are having on the memory that was requested
by the container co-ordination system, and thus supplied to the VM.

View File

@@ -0,0 +1,433 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# A script to gather memory 'footprint' information as we launch more
# and more containers
#
# The script gathers information about both user and kernel space consumption
# Output is into a .json file, named using some of the config component names
# (such as footprint-busybox.json)
# Pull in some common, useful, items
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
# Note that all vars that can be set from outside the script (that is,
# passed in the ENV), use the ':-' setting to allow being over-ridden
# Default sleep, in seconds, to let containers come up and finish their
# initialisation before we take the measures. Some of the larger
# containers can take a number of seconds to get running.
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
# How long, in seconds, do we wait for KSM to 'settle down', before we
# timeout and just continue anyway.
KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}"
# How long, in seconds, do we poll for ctr to complete launching all the
# containers?
CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}"
# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP
# nap
PARALLELISM="${PARALLELISM:-10}"
### The default config - run a small busybox image
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
###
# which RUNTIME we use is picked up from the env in
# common.bash. You can over-ride by setting RUNTIME in your env
###
# Define the cutoff checks for when we stop running the test
# Run up to this many containers
NUM_CONTAINERS="${NUM_CONTAINERS:-100}"
# Run until we have consumed this much memory (from MemFree)
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}"
# Run until we have this much MemFree left
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
# Tools we need to have installed in order to operate
REQUIRED_COMMANDS="smem awk"
# If we 'dump' the system caches before we measure then we get less
# noise in the results - they show more what our un-reclaimable footprint is
DUMP_CACHES="${DUMP_CACHES:-1}"
# Affects the name of the file to store the results in
TEST_NAME="${TEST_NAME:-fast-footprint-busybox}"
############# end of configurable items ###################
# vars to remember where we started so we can calc diffs
base_mem_avail=0
base_mem_free=0
# dump the kernel caches, so we get a more precise (or just different)
# view of what our footprint really is.
function dump_caches() {
sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
}
function init() {
restart_containerd_service
check_cmds $REQUIRED_COMMANDS
sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
# Modify the test name if running with KSM enabled
check_for_ksm
# Use the common init func to get to a known state
init_env
# Prepare to start storing results
metrics_json_init
# Store up baseline measures
base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
base_mem_free=$(get_memfree)
# Store our configuration for this run
save_config
}
save_config(){
metrics_json_start_array
local json="$(cat << EOF
{
"testname": "${TEST_NAME}",
"payload": "${PAYLOAD}",
"payload_args": "${PAYLOAD_ARGS}",
"payload_sleep": ${PAYLOAD_SLEEP},
"ksm_settle_time": ${KSM_WAIT_TIME},
"num_containers": ${NUM_CONTAINERS},
"parallelism": ${PARALLELISM},
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
"min_memory_free": "${MIN_MEMORY_FREE}",
"dump_caches": "${DUMP_CACHES}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function cleanup() {
# Finish storing the results
metrics_json_save
clean_env_ctr
}
# helper function to get USS of process in arg1
function get_proc_uss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
((item*=1024))
echo $item
}
# helper function to get PSS of process in arg1
function get_proc_pss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
((item*=1024))
echo $item
}
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system, for instance
# containerd grows as we launch containers, so we should account for that in our total
# memory breakdown
function grab_all_pss() {
item=$(sudo smem -t | tail -1 | awk '{print $5}')
((item*=1024))
local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_user_smem() {
# userspace
item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
((item*=1024))
local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_slab() {
# Grabbing slab total from meminfo is easier than doing the math
# on slabinfo
item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
((item*=1024))
local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function get_memfree() {
mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
((mem_free*=1024))
echo $mem_free
}
function grab_system() {
# avail memory, from 'free'
local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
local avail_decr=$((base_mem_avail-avail))
# cached memory, from 'free'
local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
# free memory from smem
local smem_free=$(get_memfree)
local free_decr=$((base_mem_free-smem_free))
# Anon pages
local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
((anon*=1024))
# Mapped pages
local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
((mapped*=1024))
# Cached
local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
((meminfo_cached*=1024))
local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_stats() {
# If configured, dump the caches so we get a more stable
# view of what our static footprint really is
if [[ "$DUMP_CACHES" ]] ; then
dump_caches
fi
# user space data
# PSS taken all userspace
grab_all_pss
# user as reported by smem
grab_user_smem
# System overview data
# System free and cached
grab_system
# kernel data
# The 'total kernel space taken' we can work out as:
# ktotal = ((free-avail)-user)
# So, we don't grab that number from smem, as that is what it does
# internally anyhow.
# Still try to grab any finer kernel details that we can though
# totals from slabinfo
grab_slab
metrics_json_close_array_element
}
function check_limits() {
mem_free=$(get_memfree)
if ((mem_free <= MIN_MEMORY_FREE)); then
echo 1
return
fi
mem_consumed=$((base_mem_avail-mem_free))
if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
echo 1
return
fi
echo 0
}
launch_containers() {
local parloops leftovers
(( parloops=${NUM_CONTAINERS}/${PARALLELISM} ))
(( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) ))
echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} etras"
containers=()
local iter n
for iter in $(seq 1 $parloops); do
echo "Launch iteration ${iter}"
for n in $(seq 1 $PARALLELISM); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
done
if [[ $PAYLOAD_SLEEP ]]; then
sleep $PAYLOAD_SLEEP
fi
# check if we have hit one of our limits and need to wrap up the tests
if (($(check_limits))); then
echo "Ran out of resources, check_limits failed"
return
fi
done
for n in $(seq 1 $leftovers); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
done
}
wait_containers() {
local t numcontainers
# nap 3s between checks
local step=3
for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do
numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l)
if (( numcontainers >= ${NUM_CONTAINERS} )); then
echo "All containers now launched (${t}s)"
return
else
echo "Waiting for containers to launch (${numcontainers} at ${t}s)"
fi
sleep ${step}
done
echo "Timed out waiting for containers to launch (${t}s)"
cleanup
die "Timed out waiting for containers to launch (${t}s)"
}
function go() {
# Init the json cycle for this save
metrics_json_start_array
# Grab the first set of stats before we run any containers.
grab_stats
launch_containers
wait_containers
if [ $ksm_on == "1" ]; then
echo "Wating for KSM to settle..."
wait_ksm_settle ${KSM_WAIT_TIME}
fi
grab_stats
# Wrap up the results array
metrics_json_end_array "Results"
}
function show_vars()
{
echo -e "\nEvironment variables:"
echo -e "\tName (default)"
echo -e "\t\tDescription"
echo -e "\tPAYLOAD (${PAYLOAD})"
echo -e "\t\tThe ctr image to run"
echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
echo -e "\t\tAny extra arguments passed into the docker 'run' command"
echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})"
echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure"
echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})"
echo -e "\t\tSeconds to poll for ctr to finish launching containers"
echo -e "\tPARALLELISM (${PARALLELISM})"
echo -e "\t\tNumber of containers we launch in parallel"
echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})"
echo -e "\t\tThe total number of containers to run"
echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
echo -e "\tTEST_NAME (${TEST_NAME})"
echo -e "\t\tCan be set to over-ride the default JSON results filename"
}
function help()
{
usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
echo "$usage"
show_vars
}
function main() {
local OPTIND
while getopts "h" opt;do
case ${opt} in
h)
help
exit 0;
;;
esac
done
shift $((OPTIND-1))
init
go
cleanup
}
main "$@"

View File

@@ -0,0 +1,87 @@
# Footprint data script details
The `footprint_data.sh` script runs a number of identical containers sequentially
via ctr and takes a number of memory related measurements after each
launch. The script is generally not used in a CI type environment, but is intended
to be run and analyzed manually.
You can configure the script by setting a number of environment variables.
The following sections list details of the configurable variables, along with a
small example invocation script.
## Variables
Environment variables can take effect in two ways.
Some variables affect how the payload is executed. The `CONTAINERD_RUNTIME` and `PAYLOAD`
variables directly affect the payload execution via the following line in
the script:
`$ ctr run --memory-limit $PAYLOAD_RUNTIME_ARGS --rm --runtime=$CONTAINERD_RUNTIME $PAYLOAD $NAME sh -c $PAYLOAD_ARGS`
Other settings affect how memory footprint is measured and the test termination
conditions.
| Variable | Function
| -------- | --------
| `PAYLOAD` | The ctr image to run
| `PAYLOAD_ARGS` | Any arguments passed into the ctr image
| `PAYLOAD_RUNTIME_ARGS` | Any extra arguments passed into the ctr `run` command
| `PAYLOAD_SLEEP` | Seconds to sleep between launch and measurement, to allow settling
| `MAX_NUM_CONTAINERS` | The maximum number of containers to run before terminating
| `MAX_MEMORY_CONSUMED` | The maximum amount of memory to be consumed before terminating
| `MIN_MEMORY_FREE` | The minimum amount of memory allowed to be free before terminating
| `DUMP_CACHES` | A flag to note if the system caches should be dumped before capturing stats
| `DATAFILE` | Can be set to over-ride the default JSON results filename
## Output files
The names of the JSON files generated by the test are dictated by some of the parameters
the test is utilising. The default filename is generated in the form of:
`footprint-${PAYLOAD}[-ksm].json`
## Measurements
The test measures, calculates, and stores a number of data items:
| Item | Description
| ---- | -----------
| `uss` | USS for all the VM runtime components
| `pss` | PSS for all the VM runtime components
| `all_pss` | PSS of all of userspace - to monitor if we had other impact on the system
| `user_smem` | `smem` "userspace" consumption value
| `avail` | "available" memory from `free`
| `avail_decr` | "available" memory decrease since start of test
| `cached` | "Cached" memory from `/proc/meminfo`
| `smem_free` | Free memory as reported by `smem`
| `free_decr` | Decrease in Free memory reported by `smem` since start of test
| `anon` | `AnonPages` as reported from `/proc/meminfo`
| `mapped` | Mapped pages as reported from `/proc/meminfo`
| `cached` | Cached pages as reported from `/proc/meminfo`
| `slab` | Slab as reported from `/proc/meminfo`
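These items end up in the JSON results file described above. A quick,
structure-agnostic way to pull one series out for inspection is shown below
(a sketch; it assumes only that the field names in the table are used as JSON
keys, not any particular nesting produced by the metrics JSON helpers):
```
$ jq '.. | .avail_decr? // empty' footprint-busybox.json
```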
## Example script
The following script is an example of how to configure the environment variables and
invoke the test script to run a number of different container tests.
```
#!/bin/bash
set -e
set -x
export MAX_NUM_CONTAINERS=10
export MAX_MEMORY_CONSUMED=6*1024*1024*1024
function run() {
###
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
export PAYLOAD="quay.io/prometheus/busybox:latest"
export PAYLOAD_ARGS="tail -f /dev/null"
export PAYLOAD_SLEEP=10
export PAYLOAD_RUNTIME_ARGS="5120"
sudo -E bash $(pwd)/density/footprint_data.sh
}
export CONTAINERD_RUNTIME=io.containerd.kata.v2
run
```

View File

@@ -0,0 +1,360 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# A script to gather memory 'footprint' information as we launch more
# and more containers
#
# The script gathers information about both user and kernel space consumption
# Output is into a .json file, named using some of the config component names
# (such as footprint-busybox.json)
# Pull in some common, useful, items
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
# Note that all vars that can be set from outside the script (that is,
# passed in the ENV), use the ':-' setting to allow being over-ridden
# Default sleep for 10s to let containers come up and finish their
# initialisation before we take the measures. Some of the larger
# containers can take a number of seconds to get running.
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
### The default config - run a small busybox image
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
###
# Define the cutoff checks for when we stop running the test
# Run up to this many containers
MAX_NUM_CONTAINERS="${MAX_NUM_CONTAINERS:-10}"
# Run until we have consumed this much memory (from MemFree)
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-6*1024*1024*1024}"
# Run until we have this much MemFree left
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
# Tools we need to have installed in order to operate
REQUIRED_COMMANDS="smem awk"
# If we 'dump' the system caches before we measure then we get less
# noise in the results - they show more what our un-reclaimable footprint is
DUMP_CACHES="${DUMP_CACHES:-1}"
# Affects the name of the file to store the results in
TEST_NAME="${TEST_NAME:-footprint-busybox}"
############# end of configurable items ###################
# vars to remember where we started so we can calc diffs
base_mem_avail=0
base_mem_free=0
# dump the kernel caches, so we get a more precise (or just different)
# view of what our footprint really is.
function dump_caches() {
sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
}
function init() {
restart_containerd_service
check_cmds $REQUIRED_COMMANDS
sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
# Modify the test name if running with KSM enabled
check_for_ksm
# Use the common init func to get to a known state
init_env
# Prepare to start storing results
metrics_json_init
# Store up baseline measures
base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
base_mem_free=$(get_memfree)
# Store our configuration for this run
save_config
}
save_config(){
metrics_json_start_array
local json="$(cat << EOF
{
"testname": "${TEST_NAME}",
"payload": "${PAYLOAD}",
"payload_args": "${PAYLOAD_ARGS}",
"payload_sleep": ${PAYLOAD_SLEEP},
"max_containers": ${MAX_NUM_CONTAINERS},
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
"min_memory_free": "${MIN_MEMORY_FREE}",
"dump_caches": "${DUMP_CACHES}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function cleanup() {
# Finish storing the results
metrics_json_save
clean_env_ctr
}
# helper function to get USS of process in arg1
function get_proc_uss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
((item*=1024))
echo $item
}
# helper function to get PSS of process in arg1
function get_proc_pss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
((item*=1024))
echo $item
}
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system, for instance
# containerd grows as we launch containers, so we should account for that in our total
# memory breakdown
function grab_all_pss() {
item=$(sudo smem -t | tail -1 | awk '{print $5}')
((item*=1024))
local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_user_smem() {
# userspace
item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
((item*=1024))
local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_slab() {
# Grabbing slab total from meminfo is easier than doing the math
# on slabinfo
item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
((item*=1024))
local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function get_memfree() {
mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
((mem_free*=1024))
echo $mem_free
}
function grab_system() {
# avail memory, from 'free'
local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
local avail_decr=$((base_mem_avail-avail))
# cached memory, from 'free'
local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
# free memory from smem
local smem_free=$(get_memfree)
local free_decr=$((base_mem_free-smem_free))
# Anon pages
local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
((anon*=1024))
# Mapped pages
local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
((mapped*=1024))
# Cached
local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
((meminfo_cached*=1024))
local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_stats() {
# If configured, dump the caches so we get a more stable
# view of what our static footprint really is
if [[ "$DUMP_CACHES" ]] ; then
dump_caches
fi
# user space data
# PSS taken all userspace
grab_all_pss
# user as reported by smem
grab_user_smem
# System overview data
# System free and cached
grab_system
# kernel data
# The 'total kernel space taken' we can work out as:
# ktotal = ((free-avail)-user)
# So, we don't grab that number from smem, as that is what it does
# internally anyhow.
# Still try to grab any finer kernel details that we can though
# totals from slabinfo
grab_slab
metrics_json_close_array_element
}
function check_limits() {
mem_free=$(get_memfree)
if ((mem_free <= MIN_MEMORY_FREE)); then
echo 1
return
fi
mem_consumed=$((base_mem_avail-mem_free))
if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
echo 1
return
fi
echo 0
}
function go() {
# Init the json cycle for this save
metrics_json_start_array
containers=()
for i in $(seq 1 $MAX_NUM_CONTAINERS); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run --rm --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS
if [[ $PAYLOAD_SLEEP ]]; then
sleep $PAYLOAD_SLEEP
fi
grab_stats
# check if we have hit one of our limits and need to wrap up the tests
if (($(check_limits))); then
# Wrap up the results array
metrics_json_end_array "Results"
return
fi
done
# Wrap up the results array
metrics_json_end_array "Results"
}
function show_vars()
{
echo -e "\nEvironment variables:"
echo -e "\tName (default)"
echo -e "\t\tDescription"
echo -e "\tPAYLOAD (${PAYLOAD})"
echo -e "\t\tThe ctr image to run"
echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
echo -e "\tMAX_NUM_CONTAINERS (${MAX_NUM_CONTAINERS})"
echo -e "\t\tThe maximum number of containers to run before terminating"
echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
echo -e "\t\tThe path to the ctr binary (for 'smem' measurements)"
echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
echo -e "\tTEST_NAME (${TEST_NAME})"
echo -e "\t\tCan be set to over-ride the default JSON results filename"
}
function help()
{
usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
echo "$usage"
show_vars
}
function main() {
local OPTIND
while getopts "h" opt;do
case ${opt} in
h)
help
exit 0;
;;
esac
done
shift $((OPTIND-1))
init
go
cleanup
}
main "$@"

View File

@@ -0,0 +1,383 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description of the test:
# This test launches a number of containers in idle mode.
# It will then sleep for a configurable period of time to allow
# any memory optimisations to 'settle', and then checks the
# amount of memory used by all the containers to come up with
# an average (using the PSS measurements)
# This test uses smem tool to get the memory used.
set -e
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
# Busybox image: Choose a small workload image, this is
# in order to measure the runtime footprint, not the workload
# footprint.
IMAGE='quay.io/prometheus/busybox:latest'
CMD='tail -f /dev/null'
NUM_CONTAINERS="$1"
WAIT_TIME="$2"
AUTO_MODE="$3"
TEST_NAME="memory footprint"
SMEM_BIN="smem"
KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX)
PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX)
function remove_tmp_file() {
rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}"
}
trap remove_tmp_file EXIT
# Show help about this script
function help(){
cat << EOF
Usage: $0 <count> <wait_time> [auto]
Description:
<count> : Number of containers to run.
<wait_time> : Time in seconds to wait before taking
metrics.
[auto] : Optional 'auto KSM settle' mode
waits for ksm pages_shared to settle down
EOF
}
function get_runc_pss_memory(){
ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2"
get_pss_memory "${ctr_runc_shim_path}"
}
function get_runc_individual_memory() {
runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g')
# Verify runc process result
if [ -z "${runc_process_result}" ];then
die "Runc process not found"
fi
read -r -a runc_values <<< "${runc_process_result}"
metrics_json_start_array
local json="$(cat << EOF
{
"runc individual results": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
printf '%s\n\t\t\t' "${runc_values[i]}"
done)
]
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Raw results"
}
# This function measures the PSS average
# memory of a process.
function get_pss_memory(){
ps="$1"
mem_amount=0
count=0
avg=0
if [ -z "${ps}" ]; then
die "No argument to get_pss_memory()"
fi
# Save all the process names
# This will help us to retrieve raw information
echo "${ps}" >> "${PS_TMP_FILE}"
data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//')
# Save all the smem results
# This will help us to retrieve raw information
echo "${data}" >> "${MEM_TMP_FILE}"
gral_data=$(echo "${data// /+}" | bc)
for i in "${gral_data}"; do
if (( $i > 0 ));then
mem_amount=$(( i + mem_amount ))
(( count++ ))
fi
done
if (( "${count}" > 0 ));then
avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
fi
echo "${avg}"
}
function ppid() {
local pid
pid=$(ps -p "${1:-nopid}" -o ppid=)
echo "${pid//[[:blank:]]/}"
}
# This function measures the PSS average
# memory of virtiofsd.
# It is a special case of get_pss_memory,
# virtiofsd forks itself, so smem sees the process
# twice; this function sums both PSS values:
# pss_virtiofsd=pss_fork + pss_parent
function get_pss_memory_virtiofsd() {
mem_amount=0
count=0
avg=0
virtiofsd_path=${1:-}
if [ -z "${virtiofsd_path}" ]; then
die "virtiofsd_path not provided"
fi
echo "${virtiofsd_path}" >> "${PS_TMP_FILE}"
virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}' | head -1)
data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss")
for p in "${virtiofsd_pids}"; do
parent_pid=$(ppid "${p}")
cmd="$(cat /proc/${p}/cmdline | tr -d '\0')"
cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')"
if [ "${cmd}" != "${cmd_parent}" ]; then
pss_parent=$(printf "%s" "${data}" | grep "\s^${p}" | awk '{print $2}')
fork=$(pgrep -P "${p}")
pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}')
pss_process=$((pss_fork + pss_parent))
# Save all the smem results
# This will help us to retrieve raw information
echo "${pss_process}" >>"${MEM_TMP_FILE}"
if ((pss_process > 0)); then
mem_amount=$((pss_process + mem_amount))
((count++))
fi
fi
done
if (( "${count}" > 0 ));then
avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
fi
echo "${avg}"
}
function get_individual_memory(){
# Getting all the individual container information
first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g')
second_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
second_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==2' | sed 's/ /, /g')
third_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
third_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==3' | sed 's/ /, /g')
read -r -a first_values <<< "${first_process_result}"
read -r -a second_values <<< "${second_process_result}"
read -r -a third_values <<< "${third_process_result}"
metrics_json_start_array
local json="$(cat << EOF
{
"${first_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${first_values[i]}" ] &&
printf '%s\n\t\t\t' "${first_values[i]}"
done)
],
"${second_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${second_values[i]}" ] &&
printf '%s\n\t\t\t' "${second_values[i]}"
done)
],
"${third_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${third_values[i]}" ] &&
printf '%s\n\t\t\t' "${third_values[i]}"
done)
]
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Raw results"
}
# Try to work out the 'average memory footprint' of a container.
function get_memory_usage(){
hypervisor_mem=0
virtiofsd_mem=0
shim_mem=0
memory_usage=0
containers=()
info "Creating ${NUM_CONTAINERS} containers"
for ((i=1; i<="${NUM_CONTAINERS}"; i++)); do
containers+=($(random_name))
sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" sh -c "${CMD}"
done
if [ "${AUTO_MODE}" == "auto" ]; then
if (( ksm_on != 1 )); then
die "KSM not enabled, cannot use auto mode"
fi
echo "Entering KSM settle auto detect mode..."
wait_ksm_settle "${WAIT_TIME}"
else
# If KSM is enabled, then you normally want to sleep long enough to
# let it do its work and for the numbers to 'settle'.
echo "napping ${WAIT_TIME} s"
sleep "${WAIT_TIME}"
fi
metrics_json_start_array
# Check the runtime in order to determine which processes will
# be measured for PSS
if [ "${RUNTIME}" == "runc" ]; then
runc_workload_mem="$(get_runc_pss_memory)"
memory_usage="${runc_workload_mem}"
local json="$(cat << EOF
{
"average": {
"Result": ${memory_usage},
"Units" : "KB"
},
"runc": {
"Result": ${runc_workload_mem},
"Units" : "KB"
}
}
EOF
)"
else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ]
# Get PSS memory of VM runtime components.
# And check that the smem search has found the process - we get a "0"
# back if that procedure fails (such as if a process has changed its name
# or is not running when expected to be so)
# As an added bonus - this script must be run as root.
# Now if you do not have enough rights
# the smem failure to read the stats will also be trapped.
hypervisor_mem="$(get_pss_memory ${HYPERVISOR_PATH})"
if [ "${hypervisor_mem}" == "0" ]; then
die "Failed to find PSS for ${HYPERVISOR_PATH}"
fi
virtiofsd_mem="$(get_pss_memory_virtiofsd ${VIRTIOFSD_PATH})"
if [ "${virtiofsd_mem}" == "0" ]; then
echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}"
fi
shim_mem="$(get_pss_memory ${SHIM_PATH})"
if [ "${shim_mem}" == "0" ]; then
die "Failed to find PSS for ${SHIM_PATH}"
fi
mem_usage="$(bc -l <<< "scale=2; ${hypervisor_mem} +${virtiofsd_mem} + ${shim_mem}")"
memory_usage="${mem_usage}"
local json="$(cat << EOF
{
"average": {
"Result": ${mem_usage},
"Units" : "KB"
},
"qemus": {
"Result": ${hypervisor_mem},
"Units" : "KB"
},
"virtiofsds": {
"Result": ${virtiofsd_mem},
"Units" : "KB"
},
"shims": {
"Result": ${shim_mem},
"Units" : "KB"
}
}
EOF
)"
fi
metrics_json_add_array_element "$json"
metrics_json_end_array "Results"
clean_env_ctr
}
function save_config(){
metrics_json_start_array
local json="$(cat << EOF
{
"containers": "${NUM_CONTAINERS}",
"ksm": "${ksm_on}",
"auto": "${AUTO_MODE}",
"waittime": "${WAIT_TIME}",
"image": "${IMAGE}",
"command": "${CMD}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function main(){
# Verify enough arguments
if [ $# != 2 ] && [ $# != 3 ];then
echo >&2 "error: Not enough arguments [$@]"
help
exit 1
fi
#Check for KSM before reporting test name, as it can modify it
check_for_ksm
init_env
check_cmds "${SMEM_BIN}" bc
check_images "${IMAGE}"
if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then
export RUNTIME="kata-runtime"
elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then
export RUNTIME="runc"
else
die "Unknown runtime ${CTR_RUNTIME}"
fi
metrics_json_init
save_config
get_memory_usage
if [ "$RUNTIME" == "runc" ]; then
get_runc_individual_memory
elif [ "$RUNTIME" == "kata-runtime" ]; then
get_individual_memory
fi
metrics_json_save
}
main "$@"

View File

@@ -0,0 +1,134 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description of the test:
# This test launches a busybox container and inside
# memory free, memory available and total memory
# is measured by using /proc/meminfo.
set -e
# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
TEST_NAME="memory footprint inside container"
VERSIONS_FILE="${SCRIPT_PATH}/../../versions.yaml"
IMAGE='quay.io/prometheus/busybox:latest'
CMD="sleep 10; cat /proc/meminfo"
# We specify here in 'k', as that then matches the results we get from the meminfo,
# which makes later direct comparison easier.
MEMSIZE=${MEMSIZE:-$((2048*1024))}
# this variable determines the number of attempts when a test
# result is considered not valid (a zero value or a negative value)
MAX_FAILED_ATTEMPTS=3
memtotalAvg=0
units_memtotal=""
memfreeAvg=0
units_memfree=""
memavailableAvg=0
units_memavailable=""
# count_iters: is the index of the current iteration
count_iters=0
# valid_result: if value stored is '1' the result is valid, '0' otherwise
valid_result=0
parse_results() {
local raw_results="${1}"
# Variables used to sum cumulative values in the case of two or more reps.
# and used to compute average results for 'json' output format.
local memtotal_acu="${2:-0}"
local memfree_acu="${3:-0}"
local memavailable_acu="${4:-0}"
local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}')
units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}')
local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}')
units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}')
local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}')
units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}')
# check results: if any result is zero or negative, it is considered as invalid, and the test will be repeated.
if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then
MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1))
valid_result=0
info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
return 0
fi
memtotalAvg=$((memtotal+memtotal_acu))
memfreeAvg=$((memfree+memfree_acu))
memavailableAvg=$((memavailable+memavailable_acu))
valid_result=1
info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
}
store_results_json() {
metrics_json_start_array
memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc)
memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc)
memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc)
local json="$(cat << EOF
{
"memrequest": {
"Result" : ${MEMSIZE},
"Units" : "Kb"
},
"memtotal": {
"Result" : ${memtotalAvg},
"Units" : "${units_memtotal}"
},
"memfree": {
"Result" : ${memfreeAvg},
"Units" : "${units_memfree}"
},
"memavailable": {
"Result" : ${memavailableAvg},
"Units" : "${units_memavailable}"
},
"repetitions": {
"Result" : ${count_iters}
}
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Results"
metrics_json_save
}
function main() {
# switch to select output format
local num_iterations=${1:-1}
info "Iterations: $num_iterations"
# Check tools/commands dependencies
cmds=("awk" "ctr")
init_env
check_cmds "${cmds[@]}"
check_images "${IMAGE}"
metrics_json_init
while [ $count_iters -lt $num_iterations ]; do
local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1)
parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}"
# quit if number of attempts exceeds the allowed value.
[ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded."
[ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1))
done
store_results_json
clean_env_ctr
}
# Parameters
# @1: num_iterations {integer}
main "$@"

View File

@@ -9,24 +9,28 @@ set -o errexit
set -o nounset
set -o pipefail
kata_tarball_dir=${2:-kata-artifacts}
kata_tarball_dir="${2:-kata-artifacts}"
metrics_dir="$(dirname "$(readlink -f "$0")")"
source "${metrics_dir}/../common.bash"
source "${metrics_dir}/lib/common.bash"
create_symbolic_links() {
hypervisor="${1:-qemu}"
declare -r results_dir="${metrics_dir}/results"
declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics"
declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker"
function create_symbolic_links() {
local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml"
local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml"
local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml"
if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then
die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name."
if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then
die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name."
fi
sudo ln -sf "${source_configuration_file}" "${link_configuration_file}"
}
# Configures containerd
overwrite_containerd_config() {
function overwrite_containerd_config() {
containerd_config="/etc/containerd/config.toml"
sudo rm "${containerd_config}"
sudo tee "${containerd_config}" << EOF
@@ -44,7 +48,7 @@ version = 2
EOF
}
install_kata() {
function install_kata() {
local kata_tarball="kata-static.tar.xz"
declare -r katadir="/opt/kata"
declare -r destdir="/"
@@ -53,7 +57,7 @@ install_kata() {
# Removing previous kata installation
sudo rm -rf "${katadir}"
pushd ${kata_tarball_dir}
pushd "${kata_tarball_dir}"
sudo tar -xvf "${kata_tarball}" -C "${destdir}"
popd
@@ -64,17 +68,26 @@ install_kata() {
check_containerd_config_for_kata
restart_containerd_service
install_checkmetrics
}
check_containerd_config_for_kata() {
function install_checkmetrics() {
# Ensure we have the latest checkmetrics
pushd "${checkmetrics_dir}"
make
sudo make install
popd
}
function check_containerd_config_for_kata() {
# check containerd config
declare -r line1="default_runtime_name = \"kata\""
declare -r line2="runtime_type = \"io.containerd.kata.v2\""
declare -r num_lines_containerd=2
declare -r containerd_path="/etc/containerd/config.toml"
local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path})
local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}")
if [ $count_matches = $num_lines_containerd ]; then
if [ "${count_matches}" = "${num_lines_containerd}" ]; then
info "containerd ok"
else
info "overwriting containerd configuration w/ a valid one"
@@ -82,21 +95,62 @@ check_containerd_config_for_kata() {
fi
}
function check_metrics() {
local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml"
checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}"
cm_result=$?
if [ "${cm_result}" != 0 ]; then
die "run-metrics-ci: checkmetrics FAILED (${cm_result})"
fi
}
function make_tarball_results() {
compress_metrics_results_dir "${metrics_dir}/results" "${GITHUB_WORKSPACE}/results-${KATA_HYPERVISOR}.tar.gz"
}
function run_test_launchtimes() {
hypervisor="${1}"
info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
info "Running Launch Time test using ${hypervisor} hypervisor"
create_symbolic_links "${hypervisor}"
create_symbolic_links
bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
}
function run_test_memory_usage() {
info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor"
create_symbolic_links
bash tests/metrics/density/memory_usage.sh 20 5
check_metrics
}
function run_test_memory_usage_inside_container() {
info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor"
# ToDo: remove the exit once the metrics workflow is stable
exit 0
create_symbolic_links
bash tests/metrics/density/memory_usage_inside_container.sh 5
}
function run_test_blogbench() {
info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor"
# ToDo: remove the exit once the metrics workflow is stable
exit 0
create_symbolic_links
bash tests/metrics/storage/blogbench.sh
}
function main() {
action="${1:-}"
case "${action}" in
install-kata) install_kata ;;
run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;;
run-test-launchtimes-clh) run_test_launchtimes "clh" ;;
make-tarball-results) make_tarball_results ;;
run-test-launchtimes) run_test_launchtimes ;;
run-test-memory-usage) run_test_memory_usage ;;
run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;;
run-test-blogbench) run_test_blogbench ;;
*) >&2 die "Invalid argument" ;;
esac
}
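# Typical sequence when driving these stages by hand (a sketch; the entry
# point's filename and the hypervisor value are assumptions, and install-kata
# expects a kata-static.tar.xz under ./kata-artifacts by default):
#
#   export KATA_HYPERVISOR=qemu
#   bash gha-run.sh install-kata
#   bash gha-run.sh run-test-launchtimes
#   bash gha-run.sh run-test-memory-usage
#   bash gha-run.sh make-tarball-results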

View File

@@ -47,14 +47,14 @@ quay.io/libpod"
#
# cmds=(“cmd1” “cmd2”)
# check_cmds "${cmds[@]}"
check_cmds()
function check_cmds()
{
local cmd req_cmds=( "$@" )
for cmd in "${req_cmds[@]}"; do
if ! command -v "$cmd" > /dev/null 2>&1; then
die "command $cmd not available"
fi
echo "command: $cmd: yes"
info "command: $cmd: yes"
done
}
@@ -68,19 +68,20 @@ check_cmds()
#
# images=(“img1” “img2”)
# check_imgs "${images[@]}"
check_images()
function check_images()
{
local img req_images=( "$@" )
for img in "${req_images[@]}"; do
echo "ctr pull'ing: $img"
info "ctr pull'ing: $img"
if ! sudo "${CTR_EXE}" image pull "$img"; then
die "Failed to pull image $img"
fi
echo "ctr pull'd: $img"
info "ctr pull'd: $img"
done
}
generate_build_dockerfile() {
function generate_build_dockerfile()
{
local dockerfile="$1"
local image="$2"
local map_key="$3"
@@ -99,14 +100,14 @@ generate_build_dockerfile() {
# This function performs a build on the image names
# passed in, to ensure that we have the latest changes from
# the dockerfiles
build_dockerfile_image()
function build_dockerfile_image()
{
local image="$1"
local dockerfile_path="$2"
local dockerfile_dir=${2%/*}
if [ -f "$dockerfile_path" ]; then
echo "docker building $image"
info "docker building $image"
if ! sudo "${DOCKER_EXE}" build --build-arg http_proxy="${http_proxy}" --build-arg https_proxy="${https_proxy}" --label "$image" --tag "${image}" -f "$dockerfile_path" "$dockerfile_dir"; then
die "Failed to docker build image $image"
fi
@@ -119,7 +120,7 @@ build_dockerfile_image()
# This function removes the ctr image, builds a new one using a dockerfile
# and imports the image from docker to ctr
check_ctr_images()
function check_ctr_images()
{
local ctr_image="$1"
local dockerfile_path="$2"
@@ -138,7 +139,7 @@ check_ctr_images()
# A one time (per uber test cycle) init that tries to get the
# system to a 'known state' as much as possible
metrics_onetime_init()
function metrics_onetime_init()
{
# The onetime init must be called once, and only once
if [ ! -z "$onetime_init_done" ]; then
@@ -155,14 +156,14 @@ metrics_onetime_init()
# Print a banner to the logs noting clearly which test
# we are about to run
test_banner()
function test_banner()
{
echo -e "\n===== starting test [$1] ====="
info -e "\n===== starting test [$1] ====="
}
# Initialization/verification environment. This function makes
# minimal steps for metrics/tests execution.
init_env()
function init_env()
{
test_banner "${TEST_NAME}"
@@ -183,7 +184,8 @@ init_env()
# This function checks if there are containers or
# shim/proxy/hypervisor processes up, if found, they are
# killed to start test with clean environment.
kill_processes_before_start() {
function kill_processes_before_start()
{
DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q)
[[ -n "${DOCKER_PROCS}" ]] && clean_env
@@ -195,26 +197,29 @@ kill_processes_before_start() {
# Generate a random name - generally used when creating containers, but can
# be used for any other appropriate purpose
random_name() {
function random_name()
{
mktemp -u kata-XXXXXX
}
show_system_ctr_state() {
echo "Showing system state:"
echo " --Check containers--"
function show_system_ctr_state()
{
info "Showing system state:"
info " --Check containers--"
sudo "${CTR_EXE}" c list
echo " --Check tasks--"
info " --Check tasks--"
sudo "${CTR_EXE}" task list
local processes="containerd-shim-kata-v2"
for p in ${processes}; do
echo " --pgrep ${p}--"
info " --pgrep ${p}--"
pgrep -a ${p}
done
}
common_init(){
function common_init()
{
if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then
extract_kata_env
else
@@ -225,17 +230,18 @@ common_init(){
fi
}
# Save the current KSM settings so we can restore them later
save_ksm_settings(){
echo "saving KSM settings"
function save_ksm_settings()
{
info "saving KSM settings"
ksm_stored_run=$(cat ${KSM_ENABLE_FILE})
ksm_stored_pages=$(cat ${KSM_ENABLE_FILE})
ksm_stored_sleep=$(cat ${KSM_ENABLE_FILE})
}
set_ksm_aggressive(){
echo "setting KSM to aggressive mode"
function set_ksm_aggressive()
{
info "setting KSM to aggressive mode"
# Flip the run off/on to ensure a restart/rescan
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
sudo bash -c "echo ${KSM_AGGRESIVE_PAGES} > ${KSM_PAGES_FILE}"
@@ -245,7 +251,7 @@ set_ksm_aggressive(){
if [ "${KATA_HYPERVISOR}" == "qemu" ]; then
# Disable virtio-fs and save whether it was enabled previously
set_virtio_out=$(sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-9p)
echo "${set_virtio_out}"
info "${set_virtio_out}"
grep -q "already" <<< "${set_virtio_out}" || was_virtio_fs=true;
fi
}
@@ -256,8 +262,9 @@ restore_virtio_fs(){
info "Not restoring virtio-fs since it wasn't enabled previously"
}
restore_ksm_settings(){
echo "restoring KSM settings"
function restore_ksm_settings()
{
info "restoring KSM settings"
# First turn off the run to ensure if we are then re-enabling
# that any changes take effect
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
@@ -267,15 +274,17 @@ restore_ksm_settings(){
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
}
disable_ksm(){
echo "disabling KSM"
function disable_ksm()
{
info "disabling KSM"
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
}
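# For reference, the KSM_* variables used by the helpers above are assumed
# (their definitions are not part of this diff) to point at the standard
# kernel sysfs controls:
#   /sys/kernel/mm/ksm/run             - 0 stops KSM, 1 starts scanning
#   /sys/kernel/mm/ksm/pages_to_scan   - pages scanned per scanner wake-up
#   /sys/kernel/mm/ksm/sleep_millisecs - pause between scan batches
#   /sys/kernel/mm/ksm/pages_shared    - read-only count of shared (merged) pages
#   /sys/kernel/mm/ksm/full_scans      - read-only count of completed scans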
# See if KSM is enabled.
# If so, amend the test name to reflect that
check_for_ksm(){
function check_for_ksm()
{
if [ ! -f ${KSM_ENABLE_FILE} ]; then
return
fi
@@ -294,7 +303,8 @@ check_for_ksm(){
# a full scan has managed to do few new merges)
#
# arg1 - timeout in seconds
wait_ksm_settle(){
function wait_ksm_settle()
{
[[ "$RUNTIME" == "runc" ]] || [[ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]] && return
local t pcnt
local oldscan=-1 newscan
@@ -305,7 +315,7 @@ wait_ksm_settle(){
# Wait some time for KSM to kick in to avoid early dismissal
for ((t=0; t<5; t++)); do
pages=$(cat "${KSM_PAGES_SHARED}")
[[ "$pages" -ne 0 ]] && echo "Discovered KSM activity" && break
[[ "$pages" -ne 0 ]] && info "Discovered KSM activity" && break
sleep 1
done
@@ -315,13 +325,13 @@ wait_ksm_settle(){
newscan=$(cat /sys/kernel/mm/ksm/full_scans)
newpages=$(cat "${KSM_PAGES_SHARED}")
[[ "$newpages" -eq 0 ]] && echo "No need to wait for KSM to settle" && return
[[ "$newpages" -eq 0 ]] && info "No need to wait for KSM to settle" && return
if (( newscan != oldscan )); then
echo -e "\nnew full_scan ($oldscan to $newscan)"
info -e "\nnew full_scan ($oldscan to $newscan)"
# Do we have a previous scan to compare with
echo "check pages $oldpages to $newpages"
info "check pages $oldpages to $newpages"
if (( oldpages != -1 )); then
# avoid divide by zero problems
@@ -330,14 +340,14 @@ wait_ksm_settle(){
# abs()
pcnt=$(( $pcnt * -1 ))
echo "$oldpages to $newpages is ${pcnt}%"
info "$oldpages to $newpages is ${pcnt}%"
if (( $pcnt <= 5 )); then
echo "KSM stabilised at ${t}s"
info "KSM stabilised at ${t}s"
return
fi
else
echo "$oldpages KSM pages... waiting"
info "$oldpages KSM pages... waiting"
fi
fi
oldscan=$newscan
@@ -347,7 +357,7 @@ wait_ksm_settle(){
fi
sleep 1
done
echo "Timed out after ${1}s waiting for KSM to settle"
info "Timed out after ${1}s waiting for KSM to settle"
}
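# Worked example of the settle criterion above, with hypothetical numbers
# and assuming pcnt is the relative change in pages_shared between scans:
#   oldpages=2000, newpages=1940
#   pcnt = (1940 - 2000) * 100 / 2000 = -3, abs() -> 3
#   3 <= 5, so KSM is considered stabilised at this scan.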
common_init

View File

@@ -0,0 +1,124 @@
#!/bin/bash
#
# Copyright (c) 2018-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Description of the test:
# This test runs 'blogbench' and extracts the 'scores' for reads
# and writes.
# Note - the scores are *not* normalised for the number of iterations run;
# they are total scores for all iterations (this is the blogbench default output).
set -e
# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
TEST_NAME="blogbench"
IMAGE="docker.io/library/local-blogbench:latest"
DOCKERFILE="${SCRIPT_PATH}/blogbench_dockerfile/Dockerfile"
# Number of iterations for blogbench to run. Note: the results are not
# scaled to the iteration count, so more iterations produce larger scores.
ITERATIONS="${ITERATIONS:-30}"
# Directory to run the test on
# This is run inside of the container
TESTDIR="${TESTDIR:-/tmp}"
CMD="blogbench -i ${ITERATIONS} -d ${TESTDIR}"
function main() {
# Check tools/commands dependencies
cmds=("awk" "docker")
init_env
check_cmds "${cmds[@]}"
check_ctr_images "${IMAGE}" "${DOCKERFILE}"
metrics_json_init
local output=$(sudo -E ${CTR_EXE} run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test ${CMD})
# Save configuration
metrics_json_start_array
local frequency=$(echo "${output}" | grep "Frequency" | cut -d "=" -f2 | cut -d ' ' -f2)
local iterations=$(echo "${output}" | grep -w "iterations" | cut -d ' ' -f3)
local spawning_writers=$(echo "${output}" | grep -w "writers" | cut -d ' ' -f2)
local spawning_rewriters=$(echo "${output}" | grep -w "rewriters" | cut -d ' ' -f2)
local spawning_commenters=$(echo "${output}" | grep -w "commenters" | cut -d ' ' -f2)
local spawning_readers=$(echo "${output}" | grep -w "readers" | cut -d ' ' -f2)
local json="$(cat << EOF
{
"Frequency" : ${frequency},
"Iterations" : ${iterations},
"Number of spawing writers" : ${spawing_writers},
"Number of spawing rewriters" : ${spawing_rewriters},
"Number of spawing commenters" : ${spawing_commenters},
"Number of spawing readers" : ${spawing_readers}
}
EOF
)"
metrics_json_add_array_element "${json}"
metrics_json_end_array "Config"
# Save results
metrics_json_start_array
local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}')
local reads=$(tail -1 <<< "${output}" | awk '{print $6}')
# Obtaining other Blogbench results
local -r data=$(echo "${output}" | tail -n +12 | head -n -3)
local nb_blogs=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $1} ' | tr '\t' ',' | sed '$ s/.$//')
local r_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $2} ' | tr '\t' ',' | sed '$ s/.$//')
local w_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $3} ' | tr '\t' ',' | sed '$ s/.$//')
local r_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $4} ' | tr '\t' ',' | sed '$ s/.$//')
local w_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $5} ' | tr '\t' ',' | sed '$ s/.$//')
local r_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $6} ' | tr '\t' ',' | sed '$ s/.$//')
local w_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $7} ' | tr '\t' ',' | sed '$ s/.$//')
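# Each pipeline above flattens one column of blogbench's per-iteration table
# into a comma-separated list: awk joins the column with tabs, tr turns the
# tabs into commas, and the final sed drops the trailing comma, e.g. a column
# holding 110, 171 and 210 becomes "110,171,210".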
local json="$(cat << EOF
{
"write": {
"Result" : "${writes}",
"Units" : "items"
},
"read": {
"Result" : "${reads}",
"Units" : "items"
},
"Nb blogs": {
"Result" : "${nb_blogs}"
},
"R articles": {
"Result" : "${r_articles}"
},
"W articles": {
"Result" : "${w_articles}"
},
"R pictures": {
"Result" : "${r_pictures}"
},
"W pictures": {
"Result" : "${w_pictures}"
},
"R comments": {
"Result" : "${r_comments}"
},
"W comments": {
"Result" : "${w_comments}"
}
}
EOF
)"
metrics_json_add_array_element "${json}"
metrics_json_end_array "Results"
metrics_json_save
clean_env_ctr
}
main "$@"

Some files were not shown because too many files have changed in this diff.