diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml
index b5c7584fe..f0f606850 100644
--- a/.github/workflows/build-kata-static-tarball-amd64.yaml
+++ b/.github/workflows/build-kata-static-tarball-amd64.yaml
@@ -13,14 +13,15 @@ on:
required: false
type: string
default: no
+ commit-hash:
+ required: false
+ type: string
jobs:
build-asset:
runs-on: ubuntu-latest
strategy:
matrix:
- stage:
- - ${{ inputs.stage }}
asset:
- cloud-hypervisor
- cloud-hypervisor-glibc
@@ -46,9 +47,11 @@ jobs:
- shim-v2
- tdvf
- virtiofsd
+ stage:
+ - ${{ inputs.stage }}
exclude:
- - stage: release
- asset: cloud-hypervisor-glibc
+ - asset: cloud-hypervisor-glibc
+ stage: release
steps:
- name: Login to Kata Containers quay.io
if: ${{ inputs.push-to-registry == 'yes' }}
@@ -60,7 +63,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
@@ -88,7 +91,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:
diff --git a/.github/workflows/build-kata-static-tarball-arm64.yaml b/.github/workflows/build-kata-static-tarball-arm64.yaml
index 1fc981733..2ad97a0ba 100644
--- a/.github/workflows/build-kata-static-tarball-arm64.yaml
+++ b/.github/workflows/build-kata-static-tarball-arm64.yaml
@@ -9,6 +9,9 @@ on:
required: false
type: string
default: no
+ commit-hash:
+ required: false
+ type: string
jobs:
build-asset:
@@ -41,7 +44,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
run: |
@@ -72,7 +75,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:
diff --git a/.github/workflows/build-kata-static-tarball-s390x.yaml b/.github/workflows/build-kata-static-tarball-s390x.yaml
index 58186ab8c..cf2831033 100644
--- a/.github/workflows/build-kata-static-tarball-s390x.yaml
+++ b/.github/workflows/build-kata-static-tarball-s390x.yaml
@@ -9,6 +9,9 @@ on:
required: false
type: string
default: no
+ commit-hash:
+ required: false
+ type: string
jobs:
build-asset:
@@ -37,7 +40,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
run: |
@@ -69,7 +72,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:
diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml
new file mode 100644
index 000000000..9a47ce0e4
--- /dev/null
+++ b/.github/workflows/ci-nightly.yaml
@@ -0,0 +1,14 @@
+name: Kata Containers Nightly CI
+on:
+ schedule:
+ - cron: '0 0 * * *'
+ workflow_dispatch:
+
+jobs:
+ kata-containers-ci-on-push:
+ uses: ./.github/workflows/ci.yaml
+ with:
+ commit-hash: ${{ github.sha }}
+ pr-number: "nightly"
+ tag: ${{ github.sha }}-nightly
+ secrets: inherit
diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml
index 9f7c82eaf..6d4cc7fc0 100644
--- a/.github/workflows/ci-on-push.yaml
+++ b/.github/workflows/ci-on-push.yaml
@@ -12,65 +12,14 @@ on:
- synchronize
- reopened
- labeled
-
+ paths-ignore:
+ - 'docs/**'
jobs:
- build-kata-static-tarball-amd64:
+ kata-containers-ci-on-push:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
+ uses: ./.github/workflows/ci.yaml
with:
- tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
-
- publish-kata-deploy-payload-amd64:
- if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- needs: build-kata-static-tarball-amd64
- uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
- with:
- tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
- registry: ghcr.io
- repo: ${{ github.repository_owner }}/kata-deploy-ci
- tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
+ commit-hash: ${{ github.event.pull_request.head.sha }}
+ pr-number: ${{ github.event.pull_request.number }}
+ tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}
secrets: inherit
-
- run-k8s-tests-on-aks:
- if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- needs: publish-kata-deploy-payload-amd64
- uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
- with:
- registry: ghcr.io
- repo: ${{ github.repository_owner }}/kata-deploy-ci
- tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
- secrets: inherit
-
- run-k8s-tests-on-sev:
- if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- needs: publish-kata-deploy-payload-amd64
- uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
- with:
- registry: ghcr.io
- repo: ${{ github.repository_owner }}/kata-deploy-ci
- tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
-
- run-k8s-tests-on-snp:
- if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- needs: publish-kata-deploy-payload-amd64
- uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
- with:
- registry: ghcr.io
- repo: ${{ github.repository_owner }}/kata-deploy-ci
- tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
-
- run-k8s-tests-on-tdx:
- if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- needs: publish-kata-deploy-payload-amd64
- uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
- with:
- registry: ghcr.io
- repo: ${{ github.repository_owner }}/kata-deploy-ci
- tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
-
- run-metrics-tests:
- if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
- needs: build-kata-static-tarball-amd64
- uses: ./.github/workflows/run-metrics.yaml
- with:
- tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 000000000..faec7fca4
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,76 @@
+name: Run the Kata Containers CI
+on:
+ workflow_call:
+ inputs:
+ commit-hash:
+ required: true
+ type: string
+ pr-number:
+ required: true
+ type: string
+ tag:
+ required: true
+ type: string
+
+jobs:
+ build-kata-static-tarball-amd64:
+ uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
+ with:
+ tarball-suffix: -${{ inputs.tag }}
+ commit-hash: ${{ inputs.commit-hash }}
+
+ publish-kata-deploy-payload-amd64:
+ needs: build-kata-static-tarball-amd64
+ uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
+ with:
+ tarball-suffix: -${{ inputs.tag }}
+ registry: ghcr.io
+ repo: ${{ github.repository_owner }}/kata-deploy-ci
+ tag: ${{ inputs.tag }}-amd64
+ commit-hash: ${{ inputs.commit-hash }}
+ secrets: inherit
+
+ run-k8s-tests-on-aks:
+ needs: publish-kata-deploy-payload-amd64
+ uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
+ with:
+ registry: ghcr.io
+ repo: ${{ github.repository_owner }}/kata-deploy-ci
+ tag: ${{ inputs.tag }}-amd64
+ commit-hash: ${{ inputs.commit-hash }}
+ pr-number: ${{ inputs.pr-number }}
+ secrets: inherit
+
+ run-k8s-tests-on-sev:
+ needs: publish-kata-deploy-payload-amd64
+ uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
+ with:
+ registry: ghcr.io
+ repo: ${{ github.repository_owner }}/kata-deploy-ci
+ tag: ${{ inputs.tag }}-amd64
+ commit-hash: ${{ inputs.commit-hash }}
+
+ run-k8s-tests-on-snp:
+ needs: publish-kata-deploy-payload-amd64
+ uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
+ with:
+ registry: ghcr.io
+ repo: ${{ github.repository_owner }}/kata-deploy-ci
+ tag: ${{ inputs.tag }}-amd64
+ commit-hash: ${{ inputs.commit-hash }}
+
+ run-k8s-tests-on-tdx:
+ needs: publish-kata-deploy-payload-amd64
+ uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
+ with:
+ registry: ghcr.io
+ repo: ${{ github.repository_owner }}/kata-deploy-ci
+ tag: ${{ inputs.tag }}-amd64
+ commit-hash: ${{ inputs.commit-hash }}
+
+ run-metrics-tests:
+ needs: build-kata-static-tarball-amd64
+ uses: ./.github/workflows/run-metrics.yaml
+ with:
+ tarball-suffix: -${{ inputs.tag }}
+ commit-hash: ${{ inputs.commit-hash }}
diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml
index 97bb309b1..871d73388 100644
--- a/.github/workflows/payload-after-push.yaml
+++ b/.github/workflows/payload-after-push.yaml
@@ -9,18 +9,21 @@ jobs:
build-assets-amd64:
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
with:
+ commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
build-assets-arm64:
uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml
with:
+ commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
build-assets-s390x:
uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml
with:
+ commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
@@ -28,6 +31,7 @@ jobs:
needs: build-assets-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
+ commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-amd64
@@ -37,6 +41,7 @@ jobs:
needs: build-assets-arm64
uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml
with:
+ commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-arm64
@@ -46,6 +51,7 @@ jobs:
needs: build-assets-s390x
uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml
with:
+ commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-s390x
diff --git a/.github/workflows/publish-kata-deploy-payload-amd64.yaml b/.github/workflows/publish-kata-deploy-payload-amd64.yaml
index 91c7a0612..b5ba900d8 100644
--- a/.github/workflows/publish-kata-deploy-payload-amd64.yaml
+++ b/.github/workflows/publish-kata-deploy-payload-amd64.yaml
@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
kata-payload:
@@ -21,7 +24,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3
diff --git a/.github/workflows/publish-kata-deploy-payload-arm64.yaml b/.github/workflows/publish-kata-deploy-payload-arm64.yaml
index c4fd32477..6c35ed8a3 100644
--- a/.github/workflows/publish-kata-deploy-payload-arm64.yaml
+++ b/.github/workflows/publish-kata-deploy-payload-arm64.yaml
@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
kata-payload:
@@ -25,7 +28,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3
diff --git a/.github/workflows/publish-kata-deploy-payload-s390x.yaml b/.github/workflows/publish-kata-deploy-payload-s390x.yaml
index 2a0ea8071..ee7fa3fd7 100644
--- a/.github/workflows/publish-kata-deploy-payload-s390x.yaml
+++ b/.github/workflows/publish-kata-deploy-payload-s390x.yaml
@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
kata-payload:
@@ -25,7 +28,7 @@ jobs:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index c553ca514..a50313fd0 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -72,8 +72,7 @@ jobs:
- uses: actions/checkout@v3
- name: install hub
run: |
- HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//')
- wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \
+ wget -q -O- https://github.com/mislav/hub/releases/download/v2.14.2/hub-linux-amd64-2.14.2.tgz | \
tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub
- name: download-artifacts-amd64
diff --git a/.github/workflows/run-k8s-tests-on-aks.yaml b/.github/workflows/run-k8s-tests-on-aks.yaml
index a39c2bbcd..d8658270a 100644
--- a/.github/workflows/run-k8s-tests-on-aks.yaml
+++ b/.github/workflows/run-k8s-tests-on-aks.yaml
@@ -11,6 +11,12 @@ on:
tag:
required: true
type: string
+ pr-number:
+ required: true
+ type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
run-k8s-tests:
@@ -31,13 +37,13 @@ jobs:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
- GH_PR_NUMBER: ${{ github.event.pull_request.number }}
+ GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HOST_OS: ${{ matrix.host_os }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: Download Azure CLI
run: bash tests/integration/gha-run.sh install-azure-cli
diff --git a/.github/workflows/run-k8s-tests-on-sev.yaml b/.github/workflows/run-k8s-tests-on-sev.yaml
index 52ab7f955..3fc4ca835 100644
--- a/.github/workflows/run-k8s-tests-on-sev.yaml
+++ b/.github/workflows/run-k8s-tests-on-sev.yaml
@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30
diff --git a/.github/workflows/run-k8s-tests-on-snp.yaml b/.github/workflows/run-k8s-tests-on-snp.yaml
index 535c6de6d..8aa1763d2 100644
--- a/.github/workflows/run-k8s-tests-on-snp.yaml
+++ b/.github/workflows/run-k8s-tests-on-snp.yaml
@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30
diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml
index 886b1c026..ccbc16db7 100644
--- a/.github/workflows/run-k8s-tests-on-tdx.yaml
+++ b/.github/workflows/run-k8s-tests-on-tdx.yaml
@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30
diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml
index 5e06c3074..92a5f8af9 100644
--- a/.github/workflows/run-metrics.yaml
+++ b/.github/workflows/run-metrics.yaml
@@ -5,16 +5,25 @@ on:
tarball-suffix:
required: false
type: string
+ commit-hash:
+ required: false
+ type: string
jobs:
run-metrics:
+    strategy:
+      fail-fast: true
+      matrix:
+        vmm: ['clh', 'qemu']
+      max-parallel: 1
runs-on: metrics
env:
GOPATH: ${{ github.workspace }}
+      KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
- ref: ${{ github.event.pull_request.head.sha }}
+ ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3
@@ -25,8 +34,25 @@ jobs:
- name: Install kata
run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
- - name: run launch times on qemu
- run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu
+ - name: run launch times test
+ run: bash tests/metrics/gha-run.sh run-test-launchtimes
- - name: run launch times on clh
- run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh
+ - name: run memory foot print test
+ run: bash tests/metrics/gha-run.sh run-test-memory-usage
+
+ - name: run memory usage inside container test
+ run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container
+
+ - name: run blogbench test
+ run: bash tests/metrics/gha-run.sh run-test-blogbench
+
+ - name: make metrics tarball ${{ matrix.vmm }}
+ run: bash tests/metrics/gha-run.sh make-tarball-results
+
+ - name: archive metrics results ${{ matrix.vmm }}
+ uses: actions/upload-artifact@v3
+ with:
+ name: metrics-artifacts-${{ matrix.vmm }}
+ path: results-${{ matrix.vmm }}.tar.gz
+ retention-days: 1
+ if-no-files-found: error
diff --git a/.github/workflows/static-checks-dragonball.yaml b/.github/workflows/static-checks-dragonball.yaml
index 9c8ae694b..61e3fe2c4 100644
--- a/.github/workflows/static-checks-dragonball.yaml
+++ b/.github/workflows/static-checks-dragonball.yaml
@@ -23,7 +23,7 @@ jobs:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |
./ci/install_rust.sh
- PATH=$PATH:"$HOME/.cargo/bin"
+ echo PATH="$HOME/.cargo/bin:$PATH" >> $GITHUB_ENV
- name: Run Unit Test
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |
diff --git a/README.md b/README.md
index 7ea956d9d..78a62179c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-[](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml)
+[](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) [](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml)
# Kata Containers
diff --git a/docs/design/README.md b/docs/design/README.md
index 2fe93b5f6..63d52c655 100644
--- a/docs/design/README.md
+++ b/docs/design/README.md
@@ -17,6 +17,7 @@ Kata Containers design documents:
- [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
- [Design for direct-assigned volume](direct-blk-device-assignment.md)
- [Design for core-scheduling](core-scheduling.md)
+- [Virtualization Reference Architecture](kata-vra.md)
---
- [Design proposals](proposals)
diff --git a/docs/design/kata-vra.md b/docs/design/kata-vra.md
new file mode 100644
index 000000000..ba53c3371
--- /dev/null
+++ b/docs/design/kata-vra.md
@@ -0,0 +1,501 @@
+# Virtualization Reference Architecture
+
+## Subject to Change | © 2022 by NVIDIA Corporation. All rights reserved. | For test and development only
+
+Before digging deeper into the virtualization reference architecture, let's
+first look at the various GPUDirect use cases in the following table. We’re
+distinguishing between two top-tier use cases where the devices are (1)
+passthrough and (2) virtualized, where a VM gets assigned a virtual function
+(VF) and not the physical function (PF). A combination of PF and VF would also
+be possible.
+
+| Device #1 (passthrough) | Device #2 (passthrough) | P2P Compatibility and Mode |
+| ------------------------- | ----------------------- | -------------------------------------------- |
+| GPU PF | GPU PF | GPUDirect P2P |
+| GPU PF | NIC PF | GPUDirect RDMA |
+| MIG-slice | MIG-slice | _No GPUDirect P2P_ |
+| MIG-slice | NIC PF | GPUDirect RDMA |
+| **Device #1 (virtualized)** | **Device #2 (virtualized)** | **P2P Compatibility and Mode** |
+| Time-slice vGPU VF | Time-slice vGPU VF | _No GPUDirect P2P but NVLINK P2P available_ |
+| Time-slice vGPU VF | NIC VF | GPUDirect RDMA |
+| MIG-slice vGPU | MIG-slice vGPU | _No GPUDirect P2P_ |
+| MIG-slice vGPU | NIC VF | GPUDirect RDMA |
+
+In a virtualized environment, several distinct features may prevent
+peer-to-peer (P2P) communication between two endpoints in a PCI Express
+topology. The IOMMU translates IO virtual addresses (IOVA) to physical
+addresses (PA). Each device behind an IOMMU has its own IOVA memory space;
+usually no two devices share the same IOVA space, but it is up to the
+hypervisor or OS how it chooses to map devices to IOVA spaces. Any PCI Express
+DMA transaction will use IOVAs, which the IOMMU must translate. By default, all
+the traffic is routed to the root complex and not issued directly to the peer
+device.
+
+An IOMMU can be used to isolate and protect devices even if virtualization is
+not used; since devices can only access memory regions that are mapped for
+them, a DMA from one device to another is not possible. DPDK uses the IOMMU for
+better isolation between devices; another benefit is that the IOVA space can be
+presented as contiguous memory even if the PA space is heavily scattered.
+
+In the case of virtualization, the IOMMU is responsible for isolating the device
+and memory between VMs for safe device assignment without compromising the host
+and other guest OSes. Without an IOMMU, any device can access the entire system
+and perform DMA transactions _anywhere_.
+
+The second feature is ACS (Access Control Services), which controls which
+devices are allowed to communicate with one another, and thus avoids improper
+routing of packets irrespective of whether the IOMMU is enabled.
+
+When the IOMMU is enabled, ACS is normally configured to force all PCI Express
+DMA through the root complex so the IOMMU can translate it, which impacts
+performance between peers with higher latency and reduced bandwidth.
+
+A way to avoid the performance hit is to enable Address Translation Services
+(ATS). ATS-capable endpoints can prefetch IOVA -> PA translations from the IOMMU
+and then perform DMA transactions directly to another endpoint. Hypervisors
+enable this by enabling ATS in such endpoints, configuring ACS to enable Direct
+Translated P2P, and configuring the IOMMU to allow Address Translation requests.
+
+Another important factor is that the NVIDIA driver stack will use the PCI
+Express topology of the system it is running on to determine whether the
+hardware is capable of supporting P2P. The driver stack qualifies specific
+chipsets and PCI Express switches for use with GPUDirect P2P. In virtual
+environments, the PCI Express topology is flattened and obfuscated to present a
+uniform environment to the software inside the VM, which breaks the GPUDirect
+P2P use case.
+
+On a bare metal machine, the driver stack groups GPUs into cliques that can
+perform GPUDirect P2P communication, excluding peer mappings where P2P
+communication is not possible, prominently if GPUs are attached to multiple CPU
+sockets.
+
+A CPU and its local memory bank are referred to as a NUMA node. In a two-socket
+server, each of the CPUs has a local memory bank, for a total of two NUMA nodes.
+Some servers provide the ability to configure additional NUMA nodes per CPU,
+which means a CPU socket can have two NUMA nodes (some servers support four
+NUMA nodes per socket) with local memory banks and L3 NUMA domains for improved
+performance.
+
+One of the current solutions is that the hypervisor provides additional topology
+information that the driver stack can pick up and enable GPUDirect P2P between
+GPUs, even if the virtualized environment does not directly expose it. The PCI
+Express virtual P2P approval capability structure in the PCI configuration space
+is entirely emulated by the hypervisor for passthrough GPU devices.
+
+A clique ID is provided where GPUs with the same clique ID belong to a group of
+GPUs capable of P2P communication.
+
+On vSphere, Azure, and other CSPs, the hypervisor lays down a `topologies.xml`
+which NCCL can pick up to deduce the right P2P level[^1]. NCCL leverages
+InfiniBand (IB) and/or Unified Communication X (UCX) for communication, and
+GPUDirect P2P and GPUDirect RDMA should just work in this case. The only caveat
+is that software or applications that do not use the XML file to deduce the
+topology will fail to enable GPUDirect (see
+[`nccl-p2p-level`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-p2p-level));
+a sketch of pinning the level explicitly follows.
+
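+For applications that cannot read the topology file, a workaround is to pin
+NCCL's P2P level explicitly. A minimal sketch, assuming a Kubernetes Pod runs
+the workload; `NCCL_P2P_LEVEL` is documented by NCCL, while the Pod layout and
+image name are purely illustrative:
+
+```yaml
+# Hypothetical Pod pinning NCCL's P2P level instead of relying on the
+# flattened in-VM topology; SYS (allow P2P across the whole system) is one
+# of the documented values.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: nccl-p2p-demo
+spec:
+  runtimeClassName: kata
+  containers:
+  - name: workload
+    image: registry.example.com/nccl-app:latest   # illustrative image
+    env:
+    - name: NCCL_P2P_LEVEL
+      value: "SYS"
+```
+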
+## Hypervisor PCI Express Topology
+
+To enable every part of the accelerator stack, we propose a virtualized
+reference architecture to enable GPUDirect P2P and GPUDirect RDMA for any
+hypervisor. The idea is split into two parts to enable the right PCI Express
+topology. The first part builds upon extending the PCI Express virtual P2P
+approval capability structure to every device that wants to do P2P in some way
+and groups devices by clique ID. The other part involves replicating a subset of
+the host topology so that applications running in the VM do not need to read
+additional information and enable the P2P capability like in the bare-metal use
+case described above. The driver stack can then deduce automatically if the
+topology presented in the VM is capable of P2P communication.
+
+We will work with the host topology below throughout the next sections. It is
+a system with two converged DPUs, each having an `A100X` GPU and two `ConnectX-6`
+network ports connected to the downstream ports of a PCI Express switch.
+
+```sh
++-00.0-[d8-df]----00.0-[d9-df]--+-00.0-[da-db]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+ | +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+ | \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
+ \-01.0-[dc-df]----00.0-[dd-df]----08.0-[de-df]----00.0 NVIDIA Corporation GA100 [A100X]
+
++-00.0-[3b-42]----00.0-[3c-42]--+-00.0-[3d-3e]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+ | +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+ | \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
+ \-01.0-[3f-42]----00.0-[40-42]----08.0-[41-42]----00.0 NVIDIA Corporation GA100 [A100X]
+```
+
+The path within one DPU shown above, through the shared PCI Express switch, is
+the optimal and preferred path for efficient P2P communication.
+
+## PCI Express Virtual P2P Approval Capability
+
+Most of the time, the PCI Express topology is flattened and obfuscated to ensure
+easy migration of the VM image between different physical hardware topologies.
+In Kata, we can configure the hypervisor to use PCI Express root ports to
+hotplug the VFIO devices one is passing through. A user can select how many PCI
+Express root ports to allocate depending on how many devices are passed through.
+A recent addition to Kata will detect the right amount of PCI Express devices
+that need hotplugging and bail out if the number of root ports is insufficient.
+In Kata, we do not automatically increase the number of root ports, we want the
+user to be in full control of the topology.
+
+```toml
+# /etc/kata-containers/configuration.toml
+
+# VFIO devices are hotplugged on a bridge by default.
+# Enable hot-plugging on the root bus. This may be required for devices with
+# a large PCI bar, as this is a current limitation with hot-plugging on
+# a bridge.
+# Default "bridge-port"
+hotplug_vfio = "root-port"
+
+# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
+# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
+# The value means the number of pcie_root_port
+# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
+# Default 0
+pcie_root_port = 8
+```
+
+VFIO devices are hotplugged on a PCIe-PCI bridge by default. Hotplug of PCI
+Express devices is only supported on PCI Express root or downstream ports. With
+this configuration set, if we start up a Kata container, we can inspect our
+topology and see the allocated PCI Express root ports and the hotplugged
+devices.
+
+```sh
+$ lspci -tv
+ -[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+ +-01.0 Red Hat, Inc. Virtio console
+ +-02.0 Red Hat, Inc. Virtio SCSI
+ +-03.0 Red Hat, Inc. Virtio RNG
+ +-04.0-[01]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+ +-05.0-[02]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+ +-06.0-[03]----00.0 NVIDIA Corporation Device 20b8
+ +-07.0-[04]----00.0 NVIDIA Corporation Device 20b8
+ +-08.0-[05]--
+ +-09.0-[06]--
+ +-0a.0-[07]--
+ +-0b.0-[08]--
+ +-0c.0 Red Hat, Inc. Virtio socket
+ +-0d.0 Red Hat, Inc. Virtio file system
+ +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+ +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
+ \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+For devices with huge BARs (Base Address Registers) like the GPU, we need to
+configure the PCI Express root port properly and allocate enough memory for
+mapping, so we added a heuristic to Kata that deduces the right settings and
+lets the BARs be mapped correctly. This functionality comes from
+[`nvidia/go-nvlib`](https://gitlab.com/nvidia/cloud-native/go-nvlib), which is
+part of Kata now.
+
+```sh
+$ sudo dmesg | grep BAR
+[ 0.179960] pci 0000:00:04.0: BAR 7: assigned [io 0x1000-0x1fff]
+[ 0.179962] pci 0000:00:05.0: BAR 7: assigned [io 0x2000-0x2fff]
+[ 0.179963] pci 0000:00:06.0: BAR 7: assigned [io 0x3000-0x3fff]
+[ 0.179964] pci 0000:00:07.0: BAR 7: assigned [io 0x4000-0x4fff]
+[ 0.179966] pci 0000:00:08.0: BAR 7: assigned [io 0x5000-0x5fff]
+[ 0.179967] pci 0000:00:09.0: BAR 7: assigned [io 0x6000-0x6fff]
+[ 0.179968] pci 0000:00:0a.0: BAR 7: assigned [io 0x7000-0x7fff]
+[ 0.179969] pci 0000:00:0b.0: BAR 7: assigned [io 0x8000-0x8fff]
+[ 2.115912] pci 0000:01:00.0: BAR 0: assigned [mem 0x13000000000-0x13001ffffff 64bit pref]
+[ 2.116203] pci 0000:01:00.0: BAR 2: assigned [mem 0x13002000000-0x130027fffff 64bit pref]
+[ 2.683132] pci 0000:02:00.0: BAR 0: assigned [mem 0x12000000000-0x12001ffffff 64bit pref]
+[ 2.683419] pci 0000:02:00.0: BAR 2: assigned [mem 0x12002000000-0x120027fffff 64bit pref]
+[ 2.959155] pci 0000:03:00.0: BAR 1: assigned [mem 0x11000000000-0x117ffffffff 64bit pref]
+[ 2.959345] pci 0000:03:00.0: BAR 3: assigned [mem 0x11800000000-0x11801ffffff 64bit pref]
+[ 2.959523] pci 0000:03:00.0: BAR 0: assigned [mem 0xf9000000-0xf9ffffff]
+[ 2.966119] pci 0000:04:00.0: BAR 1: assigned [mem 0x10000000000-0x107ffffffff 64bit pref]
+[ 2.966295] pci 0000:04:00.0: BAR 3: assigned [mem 0x10800000000-0x10801ffffff 64bit pref]
+[ 2.966472] pci 0000:04:00.0: BAR 0: assigned [mem 0xf7000000-0xf7ffffff]
+```
+
+The NVIDIA driver stack in this case would refuse to do P2P communication since
+(1) the topology is not what it expects and (2) we do not have a qualified
+chipset. Since our P2P devices are not connected to a PCI Express switch port,
+we need to provide additional information to support the P2P functionality. One
+way of providing such metadata would be to annotate the container; most of the
+settings in Kata's configuration file can be overridden via annotations, but
+this limits the flexibility, and a user would need to update all the containers
+that they want to run with Kata. The goal is to make such things as transparent
+as possible, so we also introduced
+[CDI](https://github.com/container-orchestrated-devices/container-device-interface)
+(Container Device Interface) to Kata. CDI is a
+[specification](https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md)
+for container runtimes to support third-party devices.
+
+As written before, we can provide a clique ID for the devices that belong
+together and are capable of doing P2P. This information is provided to the
+hypervisor, which will set up things in the VM accordingly. Suppose the user
+wanted to do GPUDirect RDMA with the first GPU and the NIC that reside on the
+same DPU; one could provide a specification telling the hypervisor that they
+belong to the same clique.
+
+```yaml
+# /etc/cdi/nvidia.yaml
+cdiVersion: 0.4.0
+kind: nvidia.com/gpu
+devices:
+- name: gpu0
+ annotations:
+ bdf: "41:00.0"
+ clique-id: "0"
+ containerEdits:
+ deviceNodes:
+ - path: "/dev/vfio/71"
+
+# /etc/cdi/mellanox.yaml
+cdiVersion: 0.4.0
+kind: mellanox.com/nic
+devices:
+- name: nic0
+ annotations:
+ bdf: "3d:00.0"
+ clique-id: "0"
+ attach-pci: "true"
+ containerEdits:
+ deviceNodes:
+ - path: "/dev/vfio/66"
+```
+
+Since this setting is bound to the device and not the container, we do not need
+to alter the container; we just allocate the right resources and GPUDirect RDMA
+is set up correctly. Rather than exposing the devices separately, an idea would
+be to expose a combined GPUDirect RDMA device via NFD (Node Feature Discovery);
+this way, we could make sure that the right pair is allocated and used, as
+sketched below (more on Kubernetes deployment in the next section).
+
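+A minimal sketch of how a container could request such a combined device; the
+`nvidia.com/gpudirect-rdma` resource name and the device plugin advertising it
+are assumptions for illustration, not an existing interface:
+
+```yaml
+# Hypothetical: one extended resource stands for a GPU + NIC pair from the
+# same clique, so a scheduler can never split a P2P-capable pairing.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpudirect-rdma-workload
+spec:
+  runtimeClassName: kata
+  containers:
+  - name: workload
+    image: registry.example.com/rdma-app:latest   # illustrative image
+    resources:
+      limits:
+        nvidia.com/gpudirect-rdma: "1"
+```
+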
+The GPU driver stack leverages the PCI Express virtual P2P approval
+capability, but the NIC stack does not use it yet. One of the action items is
+to enable MOFED to read the P2P approval capability and enable ATS and ACS
+settings as described above.
+
+This way, we could enable GPUDirect P2P and GPUDirect RDMA on any topology
+presented to the VM application. It is the responsibility of the administrator
+or infrastructure engineer to provide the right information either via
+annotations or a CDI specification.
+
+## Host Topology Replication
+
+The other way to represent the PCI Express topology in the VM is to replicate a
+subset of the topology needed to support the P2P use case inside the VM. Similar
+to the configuration for the root ports, we can easily configure the usage of
+PCI Express switch ports to hotplug the devices.
+
+```toml
+# /etc/kata-containers/configuration.toml
+
+# VFIO devices are hotplugged on a bridge by default.
+# Enable hot plugging on the root bus. This may be required for devices with
+# a large PCI bar, as this is a current limitation with hot plugging on
+# a bridge.
+# Default "bridge-port"
+hotplug_vfio = "switch-port"
+
+# Before hot plugging a PCIe device, you need to add pcie_switch_port devices.
+# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
+# The value means the number of pcie_switch_port
+# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
+# Default 0
+pcie_switch_port = 8
+```
+
+Each device that is passed through is attached to a PCI Express downstream port
+as illustrated below. We can even replicate the host's two-DPU topology with
+added metadata through the CDI. Most of the time, a container only needs one
+pair of GPU and NIC for GPUDirect RDMA; this is more of a showcase of what we
+can do with the power of Kata and CDI. One could even think of adding groups of
+devices that support P2P, even from different CPU sockets or NUMA nodes, into
+one container; in the listing below, the first group is NUMA node 0 and the
+second group is NUMA node 1. Since they are grouped correctly, P2P would be
+enabled naturally inside a group, i.e., a clique ID.
+
+```sh
+$ lspci -tv
+ -[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+ +-01.0 Red Hat, Inc. Virtio console
+ +-02.0 Red Hat, Inc. Virtio SCSI
+ +-03.0 Red Hat, Inc. Virtio RNG
+ +-04.0-[01-04]----00.0-[02-04]--+-00.0-[03]----00.0 NVIDIA Corporation Device 20b8
+ | \-01.0-[04]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
+ +-05.0-[05-08]----00.0-[06-08]--+-00.0-[07]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
+ | \-01.0-[08]----00.0 NVIDIA Corporation Device 20b8
+ +-06.0 Red Hat, Inc. Virtio socket
+ +-07.0 Red Hat, Inc. Virtio file system
+ +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+ +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
+ \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+The configuration of using either the root port or the switch port can be
+applied on a per-container or per-Pod basis, meaning we can switch PCI Express
+topologies on each run of an application, as the sketch below shows.
+
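+A minimal sketch of such a per-Pod override, assuming `hotplug_vfio` is listed
+in the runtime's `enable_annotations`; the annotation key follows Kata's
+`io.katacontainers.config.hypervisor.*` convention:
+
+```yaml
+# Hypothetical per-Pod topology switch: this run hotplugs its VFIO devices
+# behind PCI Express switch ports instead of the configuration.toml default.
+apiVersion: v1
+kind: Pod
+metadata:
+  name: p2p-topology-demo
+  annotations:
+    io.katacontainers.config.hypervisor.hotplug_vfio: "switch-port"
+spec:
+  runtimeClassName: kata
+  containers:
+  - name: workload
+    image: registry.example.com/rdma-app:latest   # illustrative image
+```
+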
+## Hypervisor Resource Limits
+
+Every hypervisor will have resource limits in terms of how many PCI Express root
+ports, switch ports, or bridge ports can be created, especially with devices
+that need to reserve a 4K IO range per the PCI specification. Each instance of a
+root or switch port consumes 4K of this very limited IO space, of which 64K is
+the maximum.
+
+Simple math (64K / 4K = 16) brings us to the conclusion that we can have a
+maximum of 16 PCI Express root ports or 16 PCI Express switch ports in QEMU if
+devices with IO BARs are used in the PCI Express hierarchy.
+
+Additionally, one can have 32 slots on the PCI root bus and a maximum of 256
+slots for the complete PCI(e) topology.
+
+Per default, QEMU will attach a multi-function device in the last slot on the
+PCI root bus:
+
+```sh
+ +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+ +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
+ \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+Kata will additionally add `virtio-xxx-pci` devices (5 slots) plus a
+PCIe-PCI bridge (1 slot) and a DRAM controller (1 slot), meaning per default we
+already have eight slots used. This leaves us 24 slots for adding other devices
+to the root bus.
+
+One use case that runs into these limits came from a customer using recent
+RTX GPUs with Kata. The user wanted to pass through eight of these GPUs into one
+container and ran into issues: those cards often consist of four individual
+device nodes, namely the GPU, an audio device, and two USB controller devices
+(some cards have a USB-C output).
+
+These devices are grouped into one IOMMU group. Since one needs to pass through
+the complete IOMMU group into the VM, we need to allocate 32 PCI Express root
+ports or 32 PCI Express switch ports, which is technically impossible due to the
+resource limits outlined above. Since all the devices appear as PCI Express
+devices, we need to hotplug those into a root or switch port.
+
+The solution to this problem is leveraging CDI. For each device, we add the
+information on whether it is hotplugged as a PCI Express or a PCI device, which
+results in either using a PCI Express root/switch port or an ordinary PCI
+bridge. PCI bridges are not affected by the limited IO range. This way, the GPU
+is attached as a PCI Express device to a root/switch port and the other three
+PCI devices to a PCI bridge, leaving enough resources to create the needed PCI
+Express root/switch ports. For example, we’re going to attach the GPUs to a PCI
+Express root port and the NICs to a PCI bridge.
+
+```yaml
+# /etc/cdi/mellanox.yaml
+cdiVersion: 0.4.0
+kind: mellanox.com/nic
+devices:
+- name: nic0
+ annotations:
+ bdf: "3d:00.0"
+ clique-id: "0"
+ attach-pci: "true"
+ containerEdits:
+ deviceNodes:
+ - path: "/dev/vfio/66"
+- name: nic1
+ annotations:
+ bdf: "3d:00.1"
+ clique-id: "1"
+ attach-pci: "true"
+ containerEdits:
+ deviceNodes:
+ - path: "/dev/vfio/67"
+```
+
+The configuration is set to use eight root ports for the GPUs and to attach the
+NICs to a PCI bridge, which is connected to a PCIe-PCI bridge; this is the
+preferred way of introducing a PCI topology in a PCI Express machine.
+
+```sh
+$ lspci -tv
+-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+ +-01.0 Red Hat, Inc. Virtio console
+ +-02.0 Red Hat, Inc. Virtio SCSI
+ +-03.0 Red Hat, Inc. Virtio RNG
+ +-04.0-[01]----00.0 NVIDIA Corporation Device 20b8
+ +-05.0-[02]----00.0 NVIDIA Corporation Device 20b8
+ +-06.0-[03]--
+ +-07.0-[04]--
+ +-08.0-[05]--
+ +-09.0-[06]--
+ +-0a.0-[07]--
+ +-0b.0-[08]--
+ +-0c.0-[09-0a]----00.0-[0a]--+-00.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
+ | \-01.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
+ +-0d.0 Red Hat, Inc. Virtio socket
+ +-0e.0 Red Hat, Inc. Virtio file system
+ +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+ +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
+ \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+The PCI devices consume only slots, of which we have 256 in the PCI(e) topology,
+leaving the scarce IO resources for the PCI Express devices that need them.
diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs
index 80c31b617..90cfe3c0a 100644
--- a/src/agent/rustjail/src/cgroups/fs/mod.rs
+++ b/src/agent/rustjail/src/cgroups/fs/mod.rs
@@ -39,11 +39,9 @@ use std::path::Path;
const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online";
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger().new(o!("subsystem" => "cgroups"))
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger().new(o!("subsystem" => "cgroups"))
}
macro_rules! get_controller_or_return_singular_none {
@@ -82,7 +80,7 @@ impl CgroupManager for Manager {
fn set(&self, r: &LinuxResources, update: bool) -> Result<()> {
info!(
- sl!(),
+ sl(),
"cgroup manager set resources for container. Resources input {:?}", r
);
@@ -120,7 +118,7 @@ impl CgroupManager for Manager {
// set devices resources
set_devices_resources(&self.cgroup, &r.devices, res);
- info!(sl!(), "resources after processed {:?}", res);
+ info!(sl(), "resources after processed {:?}", res);
// apply resources
self.cgroup.apply(res)?;
@@ -197,7 +195,7 @@ impl CgroupManager for Manager {
if guest_cpuset.is_empty() {
return Ok(());
}
- info!(sl!(), "update_cpuset_path to: {}", guest_cpuset);
+ info!(sl(), "update_cpuset_path to: {}", guest_cpuset);
let h = cgroups::hierarchies::auto();
let root_cg = h.root_control_group();
@@ -205,12 +203,12 @@ impl CgroupManager for Manager {
let root_cpuset_controller: &CpuSetController = root_cg.controller_of().unwrap();
let path = root_cpuset_controller.path();
let root_path = Path::new(path);
- info!(sl!(), "root cpuset path: {:?}", &path);
+ info!(sl(), "root cpuset path: {:?}", &path);
let container_cpuset_controller: &CpuSetController = self.cgroup.controller_of().unwrap();
let path = container_cpuset_controller.path();
let container_path = Path::new(path);
- info!(sl!(), "container cpuset path: {:?}", &path);
+ info!(sl(), "container cpuset path: {:?}", &path);
let mut paths = vec![];
for ancestor in container_path.ancestors() {
@@ -219,7 +217,7 @@ impl CgroupManager for Manager {
}
paths.push(ancestor);
}
- info!(sl!(), "parent paths to update cpuset: {:?}", &paths);
+ info!(sl(), "parent paths to update cpuset: {:?}", &paths);
let mut i = paths.len();
loop {
@@ -233,7 +231,7 @@ impl CgroupManager for Manager {
.to_str()
.unwrap()
.trim_start_matches(root_path.to_str().unwrap());
- info!(sl!(), "updating cpuset for parent path {:?}", &r_path);
+ info!(sl(), "updating cpuset for parent path {:?}", &r_path);
let cg = new_cgroup(cgroups::hierarchies::auto(), r_path)?;
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
cpuset_controller.set_cpus(guest_cpuset)?;
@@ -241,7 +239,7 @@ impl CgroupManager for Manager {
if !container_cpuset.is_empty() {
info!(
- sl!(),
+ sl(),
"updating cpuset for container path: {:?} cpuset: {}",
&container_path,
container_cpuset
@@ -276,7 +274,7 @@ fn set_network_resources(
network: &LinuxNetwork,
res: &mut cgroups::Resources,
) {
- info!(sl!(), "cgroup manager set network");
+ info!(sl(), "cgroup manager set network");
// set classid
// description can be found at https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/net_cls.html
@@ -303,7 +301,7 @@ fn set_devices_resources(
device_resources: &[LinuxDeviceCgroup],
res: &mut cgroups::Resources,
) {
- info!(sl!(), "cgroup manager set devices");
+ info!(sl(), "cgroup manager set devices");
let mut devices = vec![];
for d in device_resources.iter() {
@@ -332,7 +330,7 @@ fn set_hugepages_resources(
hugepage_limits: &[LinuxHugepageLimit],
res: &mut cgroups::Resources,
) {
- info!(sl!(), "cgroup manager set hugepage");
+ info!(sl(), "cgroup manager set hugepage");
let mut limits = vec![];
let hugetlb_controller = cg.controller_of::<HugeTlbController>();
@@ -346,7 +344,7 @@ fn set_hugepages_resources(
limits.push(hr);
} else {
warn!(
- sl!(),
+ sl(),
"{} page size support cannot be verified, dropping requested limit", l.page_size
);
}
@@ -359,7 +357,7 @@ fn set_block_io_resources(
blkio: &LinuxBlockIo,
res: &mut cgroups::Resources,
) {
- info!(sl!(), "cgroup manager set block io");
+ info!(sl(), "cgroup manager set block io");
res.blkio.weight = blkio.weight;
res.blkio.leaf_weight = blkio.leaf_weight;
@@ -387,13 +385,13 @@ fn set_block_io_resources(
}
fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
- info!(sl!(), "cgroup manager set cpu");
+ info!(sl(), "cgroup manager set cpu");
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
if !cpu.cpus.is_empty() {
if let Err(e) = cpuset_controller.set_cpus(&cpu.cpus) {
- warn!(sl!(), "write cpuset failed: {:?}", e);
+ warn!(sl(), "write cpuset failed: {:?}", e);
}
}
@@ -424,7 +422,7 @@ fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
}
fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool) -> Result<()> {
- info!(sl!(), "cgroup manager set memory");
+ info!(sl(), "cgroup manager set memory");
let mem_controller: &MemController = cg.controller_of().unwrap();
if !update {
@@ -493,7 +491,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool
}
fn set_pids_resources(cg: &cgroups::Cgroup, pids: &LinuxPids) -> Result<()> {
- info!(sl!(), "cgroup manager set pids");
+ info!(sl(), "cgroup manager set pids");
let pid_controller: &PidController = cg.controller_of().unwrap();
let v = if pids.limit > 0 {
MaxValue::Value(pids.limit)
@@ -965,7 +963,7 @@ pub fn get_paths() -> Result<HashMap<String, String>> {
for l in fs::read_to_string(PATHS)?.lines() {
let fl: Vec<&str> = l.split(':').collect();
if fl.len() != 3 {
- info!(sl!(), "Corrupted cgroup data!");
+ info!(sl(), "Corrupted cgroup data!");
continue;
}
@@ -986,7 +984,7 @@ fn get_mounts(paths: &HashMap<String, String>) -> Result<HashMap<String, String>> {
let post: Vec<&str> = p[1].split(' ').collect();
if post.len() != 3 {
- warn!(sl!(), "can't parse {} line {:?}", MOUNTS, l);
+ warn!(sl(), "can't parse {} line {:?}", MOUNTS, l);
continue;
}
diff --git a/src/agent/rustjail/src/cgroups/notifier.rs b/src/agent/rustjail/src/cgroups/notifier.rs
index 9f91b3584..5260a3d3f 100644
--- a/src/agent/rustjail/src/cgroups/notifier.rs
+++ b/src/agent/rustjail/src/cgroups/notifier.rs
@@ -16,11 +16,9 @@ use inotify::{Inotify, WatchMask};
use tokio::io::AsyncReadExt;
use tokio::sync::mpsc::{channel, Receiver};
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
}
pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
@@ -38,7 +36,7 @@ pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
fn get_value_from_cgroup(path: &Path, key: &str) -> Result<u64> {
let content = fs::read_to_string(path)?;
info!(
- sl!(),
+ sl(),
"get_value_from_cgroup file: {:?}, content: {}", &path, &content
);
@@ -67,11 +65,11 @@ async fn register_memory_event_v2(
let event_control_path = Path::new(&cg_dir).join(memory_event_name);
let cgroup_event_control_path = Path::new(&cg_dir).join(cgroup_event_name);
info!(
- sl!(),
+ sl(),
"register_memory_event_v2 event_control_path: {:?}", &event_control_path
);
info!(
- sl!(),
+ sl(),
"register_memory_event_v2 cgroup_event_control_path: {:?}", &cgroup_event_control_path
);
@@ -82,8 +80,8 @@ async fn register_memory_event_v2(
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
- info!(sl!(), "ev_wd: {:?}", ev_wd);
- info!(sl!(), "cg_wd: {:?}", cg_wd);
+ info!(sl(), "ev_wd: {:?}", ev_wd);
+ info!(sl(), "cg_wd: {:?}", cg_wd);
let (sender, receiver) = channel(100);
let containere_id = containere_id.to_string();
@@ -97,17 +95,17 @@ async fn register_memory_event_v2(
while let Some(event_or_error) = stream.next().await {
let event = event_or_error.unwrap();
info!(
- sl!(),
+ sl(),
"container[{}] get event for container: {:?}", &containere_id, &event
);
// info!("is1: {}", event.wd == wd1);
- info!(sl!(), "event.wd: {:?}", event.wd);
+ info!(sl(), "event.wd: {:?}", event.wd);
if event.wd == ev_wd {
let oom = get_value_from_cgroup(&event_control_path, "oom_kill");
if oom.unwrap_or(0) > 0 {
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
- error!(sl!(), "send containere_id failed, error: {:?}", e);
+ error!(sl(), "send containere_id failed, error: {:?}", e);
});
return;
}
@@ -171,13 +169,13 @@ async fn register_memory_event(
let mut buf = [0u8; 8];
match eventfd_stream.read(&mut buf).await {
Err(err) => {
- warn!(sl!(), "failed to read from eventfd: {:?}", err);
+ warn!(sl(), "failed to read from eventfd: {:?}", err);
return;
}
Ok(_) => {
let content = fs::read_to_string(path.clone());
info!(
- sl!(),
+ sl(),
"cgroup event for container: {}, path: {:?}, content: {:?}",
&containere_id,
&path,
@@ -193,7 +191,7 @@ async fn register_memory_event(
}
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
- error!(sl!(), "send containere_id failed, error: {:?}", e);
+ error!(sl(), "send containere_id failed, error: {:?}", e);
});
}
});
diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs
index b1d7499cd..2964ae377 100644
--- a/src/agent/rustjail/src/container.rs
+++ b/src/agent/rustjail/src/container.rs
@@ -1596,10 +1596,8 @@ mod tests {
use tempfile::tempdir;
use test_utils::skip_if_not_root;
- macro_rules! sl {
- () => {
- slog_scope::logger()
- };
+ fn sl() -> slog::Logger {
+ slog_scope::logger()
}
#[test]
@@ -1854,7 +1852,7 @@ mod tests {
let _ = new_linux_container_and_then(|mut c: LinuxContainer| {
c.processes.insert(
1,
- Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap(),
+ Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap(),
);
let p = c.get_process("123");
assert!(p.is_ok(), "Expecting Ok, Got {:?}", p);
@@ -1881,7 +1879,7 @@ mod tests {
let (c, _dir) = new_linux_container();
let ret = c
.unwrap()
- .start(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
+ .start(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
.await;
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}
@@ -1891,7 +1889,7 @@ mod tests {
let (c, _dir) = new_linux_container();
let ret = c
.unwrap()
- .run(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
+ .run(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
.await;
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}
diff --git a/src/agent/rustjail/src/process.rs b/src/agent/rustjail/src/process.rs
index cdecae130..0e7fe73ef 100644
--- a/src/agent/rustjail/src/process.rs
+++ b/src/agent/rustjail/src/process.rs
@@ -161,7 +161,7 @@ impl Process {
pub fn notify_term_close(&mut self) {
let notify = self.term_exit_notifier.clone();
- notify.notify_waiters();
+ notify.notify_one();
}
pub fn close_stdin(&mut self) {
diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs
index f292299dc..2b8af8229 100644
--- a/src/agent/src/device.rs
+++ b/src/agent/src/device.rs
@@ -26,11 +26,9 @@ use oci::{LinuxDeviceCgroup, LinuxResources, Spec};
use protocols::agent::Device;
use tracing::instrument;
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger().new(o!("subsystem" => "device"))
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger().new(o!("subsystem" => "device"))
}
const VM_ROOTFS: &str = "/";
@@ -78,7 +76,7 @@ where
{
let syspci = Path::new(&syspci);
let drv = drv.as_ref();
- info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv);
+ info!(sl(), "rebind_pci_driver: {} => {:?}", dev, drv);
let devpath = syspci.join("devices").join(dev.to_string());
let overridepath = &devpath.join("driver_override");
@@ -606,7 +604,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -> Result<()> {
let host_minor = specdev.minor;
info!(
- sl!(),
+ sl(),
"update_spec_devices() updating device";
"container_path" => &specdev.path,
"type" => &specdev.r#type,
@@ -659,7 +657,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -> Result<()> {
if let Some(update) = res_updates.get(&(host_type.as_str(), host_major, host_minor))
{
info!(
- sl!(),
+ sl(),
"update_spec_devices() updating resource";
"type" => &host_type,
"host_major" => host_major,
@@ -923,7 +921,7 @@ pub async fn add_devices(
#[instrument]
async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
// log before validation to help with debugging gRPC protocol version differences.
- info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
+ info!(sl(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
device.id, device.type_, device.vm_path, device.container_path, device.options);
if device.type_.is_empty() {
diff --git a/src/agent/src/image_rpc.rs b/src/agent/src/image_rpc.rs
index 571994a0d..d9cd45b84 100644
--- a/src/agent/src/image_rpc.rs
+++ b/src/agent/src/image_rpc.rs
@@ -38,11 +38,9 @@ const KATA_CC_IMAGE_WORK_DIR: &str = "/run/image/";
const KATA_CC_PAUSE_BUNDLE: &str = "/pause_bundle";
const CONFIG_JSON: &str = "config.json";
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger()
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger()
}
pub struct ImageService {
@@ -57,18 +55,17 @@ impl ImageService {
env::set_var("CC_IMAGE_WORK_DIR", KATA_CC_IMAGE_WORK_DIR);
let mut image_client = ImageClient::default();
- let image_policy_file = &AGENT_CONFIG.read().await.image_policy_file;
+ let image_policy_file = &AGENT_CONFIG.image_policy_file;
if !image_policy_file.is_empty() {
image_client.config.file_paths.sigstore_config = image_policy_file.clone();
}
- let simple_signing_sigstore_config =
- &AGENT_CONFIG.read().await.simple_signing_sigstore_config;
+ let simple_signing_sigstore_config = &AGENT_CONFIG.simple_signing_sigstore_config;
if !simple_signing_sigstore_config.is_empty() {
image_client.config.file_paths.sigstore_config = simple_signing_sigstore_config.clone();
}
- let image_registry_auth_file = &AGENT_CONFIG.read().await.image_registry_auth_file;
+ let image_registry_auth_file = &AGENT_CONFIG.image_registry_auth_file;
if !image_registry_auth_file.is_empty() {
image_client.config.file_paths.auth_file = image_registry_auth_file.clone();
}
@@ -88,7 +85,7 @@ impl ImageService {
return Err(anyhow!("Pause image not present in rootfs"));
}
- info!(sl!(), "use guest pause image cid {:?}", cid);
+ info!(sl(), "use guest pause image cid {:?}", cid);
let pause_bundle = Path::new(CONTAINER_BASE).join(cid);
let pause_rootfs = pause_bundle.join("rootfs");
let pause_config = pause_bundle.join(CONFIG_JSON);
@@ -159,12 +156,12 @@ impl ImageService {
async fn pull_image(&self, req: &image::PullImageRequest) -> Result<String> {
env::set_var("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH);
- let https_proxy = &AGENT_CONFIG.read().await.https_proxy;
+ let https_proxy = &AGENT_CONFIG.https_proxy;
if !https_proxy.is_empty() {
env::set_var("HTTPS_PROXY", https_proxy);
}
- let no_proxy = &AGENT_CONFIG.read().await.no_proxy;
+ let no_proxy = &AGENT_CONFIG.no_proxy;
if !no_proxy.is_empty() {
env::set_var("NO_PROXY", no_proxy);
}
@@ -179,7 +176,7 @@ impl ImageService {
return Ok(image.to_owned());
}
- let aa_kbc_params = &AGENT_CONFIG.read().await.aa_kbc_params;
+ let aa_kbc_params = &AGENT_CONFIG.aa_kbc_params;
if !aa_kbc_params.is_empty() {
match self.attestation_agent_started.compare_exchange_weak(
false,
@@ -188,22 +185,21 @@ impl ImageService {
Ordering::SeqCst,
) {
Ok(_) => Self::init_attestation_agent()?,
- Err(_) => info!(sl!(), "Attestation Agent already running"),
+ Err(_) => info!(sl(), "Attestation Agent already running"),
}
}
// If the attestation-agent is being used, then enable the authenticated credentials support
info!(
- sl!(),
+ sl(),
"image_client.config.auth set to: {}",
!aa_kbc_params.is_empty()
);
self.image_client.lock().await.config.auth = !aa_kbc_params.is_empty();
// Read enable signature verification from the agent config and set it in the image_client
- let enable_signature_verification =
- &AGENT_CONFIG.read().await.enable_signature_verification;
+ let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification;
info!(
- sl!(),
+ sl(),
"enable_signature_verification set to: {}", enable_signature_verification
);
self.image_client.lock().await.config.security_validate = *enable_signature_verification;
@@ -215,7 +211,7 @@ impl ImageService {
let decrypt_config = format!("provider:attestation-agent:{}", aa_kbc_params);
- info!(sl!(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
+ info!(sl(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
// Image layers will be stored at KATA_CC_IMAGE_WORK_DIR; generated bundles
// with rootfs and config.json will be stored under CONTAINER_BASE/cid.
let res = self
@@ -228,13 +224,13 @@ impl ImageService {
match res {
Ok(image) => {
info!(
- sl!(),
+ sl(),
"pull and unpack image {:?}, cid: {:?}, with image-rs succeed. ", image, cid
);
}
Err(e) => {
error!(
- sl!(),
+ sl(),
"pull and unpack image {:?}, cid: {:?}, with image-rs failed with {:?}. ",
image,
cid,
diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs
index 1b6b5fe42..a869e5afa 100644
--- a/src/agent/src/main.rs
+++ b/src/agent/src/main.rs
@@ -65,7 +65,7 @@ use tokio::{
io::AsyncWrite,
sync::{
watch::{channel, Receiver},
- Mutex, RwLock,
+ Mutex,
},
task::JoinHandle,
};
@@ -84,12 +84,11 @@ cfg_if! {
const NAME: &str = "kata-agent";
lazy_static! {
- static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new(
+ static ref AGENT_CONFIG: AgentConfig =
// Note: We can't call AgentOpts.parse() here to pass the processed arguments to AgentConfig:
// clap::Parser::parse() greedily processes all command line input, including cargo test
// parameters, so it should only be used inside main.
- AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap()
- ));
+ AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap();
}
#[derive(Parser)]
@@ -182,13 +181,13 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
lazy_static::initialize(&AGENT_CONFIG);
- init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?;
+ init_agent_as_init(&logger, AGENT_CONFIG.unified_cgroup_hierarchy)?;
drop(logger_async_guard);
} else {
lazy_static::initialize(&AGENT_CONFIG);
}
- let config = AGENT_CONFIG.read().await;
+ let config = &AGENT_CONFIG;
let log_vport = config.log_vport as u32;
let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone()));
@@ -201,7 +200,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let (logger, logger_async_guard) =
logging::create_logger(NAME, "agent", config.log_level, writer);
- announce(&logger, &config);
+ announce(&logger, config);
// This variable is required as it enables the global (and crucially static) logger,
// which is required to satisfy the lifetime constraints of the auto-generated gRPC code.
@@ -229,7 +228,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let span_guard = root_span.enter();
// Start the sandbox and wait for its ttRPC server to end
- start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
+ start_sandbox(&logger, config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
// Install a NOP logger for the remainder of the shutdown sequence
// to ensure any log calls made by local crates using the scope logger
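The `AGENT_CONFIG` change above is the core of this refactor: the agent never mutates its configuration after startup, so the `Arc<RwLock<...>>` wrapper, and every `.read().await` at the call sites, can be dropped. A minimal sketch of the resulting access pattern, using a hypothetical `Config` struct rather than the real `AgentConfig`:

```rust
use lazy_static::lazy_static;

struct Config {
    hotplug_timeout: u64,
}

lazy_static! {
    // Initialized on first access and never mutated afterwards, so
    // readers can borrow fields directly, with no lock or await point.
    static ref CONFIG: Config = Config { hotplug_timeout: 3 };
}

fn main() {
    // Before the refactor this read would have been:
    //   CONFIG.read().await.hotplug_timeout
    println!("timeout = {}", CONFIG.hotplug_timeout);
}
```

One knock-on effect is visible throughout the rest of the patch: functions that only read the config no longer need to be `async` for that purpose.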
diff --git a/src/agent/src/metrics.rs b/src/agent/src/metrics.rs
index a5522c0eb..d7fc4d12b 100644
--- a/src/agent/src/metrics.rs
+++ b/src/agent/src/metrics.rs
@@ -15,11 +15,9 @@ use tracing::instrument;
const NAMESPACE_KATA_AGENT: &str = "kata_agent";
const NAMESPACE_KATA_GUEST: &str = "kata_guest";
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger().new(o!("subsystem" => "metrics"))
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger().new(o!("subsystem" => "metrics"))
}
lazy_static! {
@@ -139,7 +137,7 @@ fn update_agent_metrics() -> Result<()> {
Ok(p) => p,
Err(e) => {
// FIXME: return Ok for all errors?
- warn!(sl!(), "failed to create process instance: {:?}", e);
+ warn!(sl(), "failed to create process instance: {:?}", e);
return Ok(());
}
@@ -160,7 +158,7 @@ fn update_agent_metrics() -> Result<()> {
// io
match me.io() {
Err(err) => {
- info!(sl!(), "failed to get process io stat: {:?}", err);
+ info!(sl(), "failed to get process io stat: {:?}", err);
}
Ok(io) => {
set_gauge_vec_proc_io(&AGENT_IO_STAT, &io);
@@ -169,7 +167,7 @@ fn update_agent_metrics() -> Result<()> {
match me.stat() {
Err(err) => {
- info!(sl!(), "failed to get process stat: {:?}", err);
+ info!(sl(), "failed to get process stat: {:?}", err);
}
Ok(stat) => {
set_gauge_vec_proc_stat(&AGENT_PROC_STAT, &stat);
@@ -177,7 +175,7 @@ fn update_agent_metrics() -> Result<()> {
}
match me.status() {
- Err(err) => error!(sl!(), "failed to get process status: {:?}", err),
+ Err(err) => error!(sl(), "failed to get process status: {:?}", err),
Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status),
}
@@ -189,7 +187,7 @@ fn update_guest_metrics() {
// try get load and task info
match procfs::LoadAverage::new() {
Err(err) => {
- info!(sl!(), "failed to get guest LoadAverage: {:?}", err);
+ info!(sl(), "failed to get guest LoadAverage: {:?}", err);
}
Ok(load) => {
GUEST_LOAD
@@ -209,7 +207,7 @@ fn update_guest_metrics() {
// try to get disk stats
match procfs::diskstats() {
Err(err) => {
- info!(sl!(), "failed to get guest diskstats: {:?}", err);
+ info!(sl(), "failed to get guest diskstats: {:?}", err);
}
Ok(diskstats) => {
for diskstat in diskstats {
@@ -221,7 +219,7 @@ fn update_guest_metrics() {
// try to get vm stats
match procfs::vmstat() {
Err(err) => {
- info!(sl!(), "failed to get guest vmstat: {:?}", err);
+ info!(sl(), "failed to get guest vmstat: {:?}", err);
}
Ok(vmstat) => {
for (k, v) in vmstat {
@@ -233,7 +231,7 @@ fn update_guest_metrics() {
// cpu stat
match procfs::KernelStats::new() {
Err(err) => {
- info!(sl!(), "failed to get guest KernelStats: {:?}", err);
+ info!(sl(), "failed to get guest KernelStats: {:?}", err);
}
Ok(kernel_stats) => {
set_gauge_vec_cpu_time(&GUEST_CPU_TIME, "total", &kernel_stats.total);
@@ -246,7 +244,7 @@ fn update_guest_metrics() {
// try to get net device stats
match procfs::net::dev_status() {
Err(err) => {
- info!(sl!(), "failed to get guest net::dev_status: {:?}", err);
+ info!(sl(), "failed to get guest net::dev_status: {:?}", err);
}
Ok(devs) => {
// netdev: map[string]procfs::net::DeviceStatus
@@ -259,7 +257,7 @@ fn update_guest_metrics() {
// get statistics about memory from /proc/meminfo
match procfs::Meminfo::new() {
Err(err) => {
- info!(sl!(), "failed to get guest Meminfo: {:?}", err);
+ info!(sl(), "failed to get guest Meminfo: {:?}", err);
}
Ok(meminfo) => {
set_gauge_vec_meminfo(&GUEST_MEMINFO, &meminfo);
diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs
index 9049cd5c2..4e7429e49 100644
--- a/src/agent/src/rpc.rs
+++ b/src/agent/src/rpc.rs
@@ -118,33 +118,25 @@ const ERR_NO_SANDBOX_PIDNS: &str = "Sandbox does not have sandbox_pidns";
// not available.
const IPTABLES_RESTORE_WAIT_SEC: u64 = 5;
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger()
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger()
}
-// Convenience macro to wrap an error and response to ttrpc client
-macro_rules! ttrpc_error {
- ($code:path, $err:expr $(,)?) => {
- get_rpc_status($code, format!("{:?}", $err))
- };
+// Convenience function to wrap an error and response to ttrpc client
+fn ttrpc_error(code: ttrpc::Code, err: impl std::fmt::Debug) -> ttrpc::Error {
+ get_rpc_status(code, format!("{:?}", err))
}
-macro_rules! is_allowed {
- ($req:ident) => {
- if !AGENT_CONFIG
- .read()
- .await
- .is_allowed_endpoint($req.descriptor_dyn().name())
- {
- return Err(ttrpc_error!(
- ttrpc::Code::UNIMPLEMENTED,
- format!("{} is blocked", $req.descriptor_dyn().name()),
- ));
- }
- };
+fn is_allowed(req: &impl MessageDyn) -> ttrpc::Result<()> {
+ if !AGENT_CONFIG.is_allowed_endpoint(req.descriptor_dyn().name()) {
+ Err(ttrpc_error(
+ ttrpc::Code::UNIMPLEMENTED,
+ format!("{} is blocked", req.descriptor_dyn().name()),
+ ))
+ } else {
+ Ok(())
+ }
}
#[derive(Clone, Debug)]
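Replacing `ttrpc_error!` and `is_allowed!` with functions trades textual macro expansion for type-checked arguments and ordinary `Result` composition. A self-contained sketch of the pattern with stand-in types (the real code uses `ttrpc::Code`, `ttrpc::Error`, and protobuf's `MessageDyn`):

```rust
#[derive(Debug)]
struct RpcError {
    code: u32,
    msg: String,
}

fn rpc_error(code: u32, err: impl std::fmt::Debug) -> RpcError {
    RpcError { code, msg: format!("{:?}", err) }
}

fn is_allowed(endpoint: &str, allowed: &[&str]) -> Result<(), RpcError> {
    if allowed.contains(&endpoint) {
        Ok(())
    } else {
        Err(rpc_error(12, format!("{} is blocked", endpoint)))
    }
}

fn handler(allowed: &[&str]) -> Result<(), RpcError> {
    // The old macro expanded to an early `return` inside the caller;
    // as a function the check is a plain Result, so `?` propagates it.
    is_allowed("CreateContainer", allowed)?;
    Ok(())
}

fn main() {
    assert!(handler(&["CreateContainer"]).is_ok());
    assert!(handler(&[]).is_err());
}
```

This is why every `is_allowed!(req)` call site below becomes `is_allowed(&req)?`: the early return moves from the macro body to an explicit `?`.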
@@ -207,14 +199,14 @@ impl AgentService {
let mut oci = match oci_spec.as_mut() {
Some(spec) => rustjail::grpc_to_oci(spec),
None => {
- error!(sl!(), "no oci spec in the create container request!");
+ error!(sl(), "no oci spec in the create container request!");
return Err(anyhow!(nix::Error::EINVAL));
}
};
- info!(sl!(), "receive createcontainer, spec: {:?}", &oci);
+ info!(sl(), "receive createcontainer, spec: {:?}", &oci);
info!(
- sl!(),
+ sl(),
"receive createcontainer, storages: {:?}", &req.storages
);
@@ -237,9 +229,9 @@ impl AgentService {
let dev_major_minor = format!("{}:{}", specdev.major, specdev.minor);
if specdev.path == TRUSTED_STORAGE_DEVICE {
- let data_integrity = AGENT_CONFIG.read().await.data_integrity;
+ let data_integrity = AGENT_CONFIG.data_integrity;
info!(
- sl!(),
+ sl(),
"trusted_store device major:min {}, enable data integrity {}",
dev_major_minor,
data_integrity.to_string()
@@ -260,7 +252,7 @@ impl AgentService {
// here, the agent will rely on rustjail (using the oci.Mounts
// list) to bind mount all of them inside the container.
let m = add_storages(
- sl!(),
+ sl(),
req.storages.to_vec(),
self.sandbox.clone(),
Some(req.container_id.clone()),
@@ -308,33 +300,33 @@ impl AgentService {
};
let mut ctr: LinuxContainer =
- LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl!())?;
+ LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl())?;
- let pipe_size = AGENT_CONFIG.read().await.container_pipe_size;
+ let pipe_size = AGENT_CONFIG.container_pipe_size;
let p = if let Some(p) = oci.process {
- Process::new(&sl!(), &p, cid.as_str(), true, pipe_size)?
+ Process::new(&sl(), &p, cid.as_str(), true, pipe_size)?
} else {
- info!(sl!(), "no process configurations!");
+ info!(sl(), "no process configurations!");
return Err(anyhow!(nix::Error::EINVAL));
};
// if starting container failed, we will do some rollback work
// to ensure no resources are leaked.
if let Err(err) = ctr.start(p).await {
- error!(sl!(), "failed to start container: {:?}", err);
+ error!(sl(), "failed to start container: {:?}", err);
if let Err(e) = ctr.destroy().await {
- error!(sl!(), "failed to destroy container: {:?}", e);
+ error!(sl(), "failed to destroy container: {:?}", e);
}
if let Err(e) = remove_container_resources(&mut s, &cid) {
- error!(sl!(), "failed to remove container resources: {:?}", e);
+ error!(sl(), "failed to remove container resources: {:?}", e);
}
return Err(err);
}
s.update_shared_pidns(&ctr)?;
s.add_container(ctr);
- info!(sl!(), "created container!");
+ info!(sl(), "created container!");
Ok(())
}
@@ -431,7 +423,7 @@ impl AgentService {
let cid = req.container_id.clone();
let exec_id = req.exec_id.clone();
- info!(sl!(), "do_exec_process cid: {} eid: {}", cid, exec_id);
+ info!(sl(), "do_exec_process cid: {} eid: {}", cid, exec_id);
let s = self.sandbox.clone();
let mut sandbox = s.lock().await;
@@ -444,9 +436,9 @@ impl AgentService {
// Apply any necessary corrections for PCI addresses
update_env_pci(&mut process.Env, &sandbox.pcimap)?;
- let pipe_size = AGENT_CONFIG.read().await.container_pipe_size;
+ let pipe_size = AGENT_CONFIG.container_pipe_size;
let ocip = rustjail::process_grpc_to_oci(&process);
- let p = Process::new(&sl!(), &ocip, exec_id.as_str(), false, pipe_size)?;
+ let p = Process::new(&sl(), &ocip, exec_id.as_str(), false, pipe_size)?;
let ctr = sandbox
.get_container(&cid)
@@ -464,7 +456,7 @@ impl AgentService {
let s = self.sandbox.clone();
info!(
- sl!(),
+ sl(),
"signal process";
"container-id" => cid.clone(),
"exec-id" => eid.clone(),
@@ -486,7 +478,7 @@ impl AgentService {
match p.signal(sig) {
Err(Errno::ESRCH) => {
info!(
- sl!(),
+ sl(),
"signal encounter ESRCH, continue";
"container-id" => cid.clone(),
"exec-id" => eid.clone(),
@@ -502,7 +494,7 @@ impl AgentService {
if eid.is_empty() {
// eid is empty, signal all the remaining processes in the container cgroup
info!(
- sl!(),
+ sl(),
"signal all the remaining processes";
"container-id" => cid.clone(),
"exec-id" => eid.clone(),
@@ -510,7 +502,7 @@ impl AgentService {
if let Err(err) = self.freeze_cgroup(&cid, FreezerState::Frozen).await {
warn!(
- sl!(),
+ sl(),
"freeze cgroup failed";
"container-id" => cid.clone(),
"exec-id" => eid.clone(),
@@ -523,7 +515,7 @@ impl AgentService {
let res = unsafe { libc::kill(*pid, sig) };
if let Err(err) = Errno::result(res).map(drop) {
warn!(
- sl!(),
+ sl(),
"signal failed";
"container-id" => cid.clone(),
"exec-id" => eid.clone(),
@@ -534,7 +526,7 @@ impl AgentService {
}
if let Err(err) = self.freeze_cgroup(&cid, FreezerState::Thawed).await {
warn!(
- sl!(),
+ sl(),
"unfreeze cgroup failed";
"container-id" => cid.clone(),
"exec-id" => eid.clone(),
@@ -579,7 +571,7 @@ impl AgentService {
let (exit_send, mut exit_recv) = tokio::sync::mpsc::channel(100);
info!(
- sl!(),
+ sl(),
"wait process";
"container-id" => cid.clone(),
"exec-id" => eid.clone()
@@ -596,9 +588,9 @@ impl AgentService {
};
if let Some(mut exit_rx) = exit_rx {
- info!(sl!(), "cid {} eid {} waiting for exit signal", &cid, &eid);
+ info!(sl(), "cid {} eid {} waiting for exit signal", &cid, &eid);
while exit_rx.changed().await.is_ok() {}
- info!(sl!(), "cid {} eid {} received exit signal", &cid, &eid);
+ info!(sl(), "cid {} eid {} received exit signal", &cid, &eid);
}
let mut sandbox = s.lock().await;
@@ -673,16 +665,15 @@ impl AgentService {
let cid = req.container_id;
let eid = req.exec_id;
- let term_exit_notifier;
+ let mut term_exit_notifier = Arc::new(tokio::sync::Notify::new());
let reader = {
let s = self.sandbox.clone();
let mut sandbox = s.lock().await;
let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?;
- term_exit_notifier = p.term_exit_notifier.clone();
-
if p.term_master.is_some() {
+ term_exit_notifier = p.term_exit_notifier.clone();
p.get_reader(StreamType::TermMaster)
} else if stdout {
if p.parent_stdout.is_some() {
@@ -727,7 +718,7 @@ impl AgentService {
.join(container_id)
.join(CONFIG_JSON);
debug!(
- sl!(),
+ sl(),
"Image bundle config path: {:?}", image_oci_config_path
);
@@ -772,9 +763,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::CreateContainerRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "create_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
match self.do_create_container(req).await {
- Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
Ok(_) => Ok(Empty::new()),
}
}
@@ -785,9 +776,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::StartContainerRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "start_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
match self.do_start_container(req).await {
- Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
Ok(_) => Ok(Empty::new()),
}
}
@@ -798,10 +789,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::RemoveContainerRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "remove_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
match self.do_remove_container(req).await {
- Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
Ok(_) => Ok(Empty::new()),
}
}
@@ -812,9 +803,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::ExecProcessRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "exec_process", req);
- is_allowed!(req);
+ is_allowed(&req)?;
match self.do_exec_process(req).await {
- Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
Ok(_) => Ok(Empty::new()),
}
}
@@ -825,9 +816,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::SignalProcessRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "signal_process", req);
- is_allowed!(req);
+ is_allowed(&req)?;
match self.do_signal_process(req).await {
- Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
Ok(_) => Ok(Empty::new()),
}
}
@@ -838,10 +829,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::WaitProcessRequest,
) -> ttrpc::Result<WaitProcessResponse> {
trace_rpc_call!(ctx, "wait_process", req);
- is_allowed!(req);
+ is_allowed(&req)?;
self.do_wait_process(req)
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))
}
async fn update_container(
@@ -850,7 +841,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::UpdateContainerRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "update_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let cid = req.container_id.clone();
let res = req.resources;
@@ -858,7 +849,7 @@ impl agent_ttrpc::AgentService for AgentService {
let mut sandbox = s.lock().await;
let ctr = sandbox.get_container(&cid).ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"invalid container id".to_string(),
)
@@ -870,7 +861,7 @@ impl agent_ttrpc::AgentService for AgentService {
let oci_res = rustjail::resources_grpc_to_oci(res);
match ctr.set(oci_res) {
Err(e) => {
- return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
+ return Err(ttrpc_error(ttrpc::Code::INTERNAL, e));
}
Ok(_) => return Ok(resp),
@@ -886,20 +877,20 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::StatsContainerRequest,
) -> ttrpc::Result<StatsContainerResponse> {
trace_rpc_call!(ctx, "stats_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let cid = req.container_id;
let s = Arc::clone(&self.sandbox);
let mut sandbox = s.lock().await;
let ctr = sandbox.get_container(&cid).ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"invalid container id".to_string(),
)
})?;
ctr.stats()
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))
}
async fn pause_container(
@@ -908,20 +899,20 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::PauseContainerRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "pause_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let cid = req.container_id();
let s = Arc::clone(&self.sandbox);
let mut sandbox = s.lock().await;
let ctr = sandbox.get_container(cid).ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"invalid container id".to_string(),
)
})?;
ctr.pause()
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -932,20 +923,20 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::ResumeContainerRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "resume_container", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let cid = req.container_id();
let s = Arc::clone(&self.sandbox);
let mut sandbox = s.lock().await;
let ctr = sandbox.get_container(cid).ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"invalid container id".to_string(),
)
})?;
ctr.resume()
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -956,16 +947,16 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::RemoveStaleVirtiofsShareMountsRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "remove_stale_virtiofs_share_mounts", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let mount_infos = parse_mount_table("/proc/self/mountinfo")
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
for m in &mount_infos {
if m.mount_point.starts_with(KATA_GUEST_SHARE_DIR) {
// stat the mount point; the virtiofs daemon will remove the stale cache and release the fds if the mount point no longer exists.
// More details in https://github.com/kata-containers/kata-containers/issues/6455#issuecomment-1477137277
match stat::stat(Path::new(&m.mount_point)) {
- Ok(_) => info!(sl!(), "stat {} success", m.mount_point),
- Err(e) => info!(sl!(), "stat {} failed: {}", m.mount_point, e),
+ Ok(_) => info!(sl(), "stat {} success", m.mount_point),
+ Err(e) => info!(sl(), "stat {} failed: {}", m.mount_point, e),
}
}
}
@@ -978,10 +969,10 @@ impl agent_ttrpc::AgentService for AgentService {
_ctx: &TtrpcContext,
req: protocols::agent::WriteStreamRequest,
) -> ttrpc::Result<WriteStreamResponse> {
- is_allowed!(req);
+ is_allowed(&req)?;
self.do_write_stream(req)
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))
}
async fn read_stdout(
@@ -989,10 +980,10 @@ impl agent_ttrpc::AgentService for AgentService {
_ctx: &TtrpcContext,
req: protocols::agent::ReadStreamRequest,
) -> ttrpc::Result<ReadStreamResponse> {
- is_allowed!(req);
+ is_allowed(&req)?;
self.do_read_stream(req, true)
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))
}
async fn read_stderr(
@@ -1000,10 +991,10 @@ impl agent_ttrpc::AgentService for AgentService {
_ctx: &TtrpcContext,
req: protocols::agent::ReadStreamRequest,
) -> ttrpc::Result<ReadStreamResponse> {
- is_allowed!(req);
+ is_allowed(&req)?;
self.do_read_stream(req, false)
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))
}
async fn close_stdin(
@@ -1012,7 +1003,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::CloseStdinRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "close_stdin", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let cid = req.container_id.clone();
let eid = req.exec_id;
@@ -1022,7 +1013,7 @@ impl agent_ttrpc::AgentService for AgentService {
let p = sandbox
.find_container_process(cid.as_str(), eid.as_str())
.map_err(|e| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
format!("invalid argument: {:?}", e),
)
@@ -1039,7 +1030,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::TtyWinResizeRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "tty_win_resize", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let cid = req.container_id.clone();
let eid = req.exec_id.clone();
@@ -1048,7 +1039,7 @@ impl agent_ttrpc::AgentService for AgentService {
let p = sandbox
.find_container_process(cid.as_str(), eid.as_str())
.map_err(|e| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::UNAVAILABLE,
format!("invalid argument: {:?}", e),
)
@@ -1065,11 +1056,11 @@ impl agent_ttrpc::AgentService for AgentService {
let err = libc::ioctl(fd, TIOCSWINSZ, &win);
Errno::result(err).map(drop).map_err(|e| {
- ttrpc_error!(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e))
+ ttrpc_error(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e))
})?;
}
} else {
- return Err(ttrpc_error!(ttrpc::Code::UNAVAILABLE, "no tty".to_string()));
+ return Err(ttrpc_error(ttrpc::Code::UNAVAILABLE, "no tty".to_string()));
}
Ok(Empty::new())
@@ -1081,10 +1072,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::UpdateInterfaceRequest,
) -> ttrpc::Result<Interface> {
trace_rpc_call!(ctx, "update_interface", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let interface = req.interface.into_option().ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"empty update interface request".to_string(),
)
@@ -1097,7 +1088,7 @@ impl agent_ttrpc::AgentService for AgentService {
.update_interface(&interface)
.await
.map_err(|e| {
- ttrpc_error!(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e))
+ ttrpc_error(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e))
})?;
Ok(interface)
@@ -1109,10 +1100,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::UpdateRoutesRequest,
) -> ttrpc::Result<Routes> {
trace_rpc_call!(ctx, "update_routes", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let new_routes = req.routes.into_option().map(|r| r.Routes).ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"empty update routes request".to_string(),
)
@@ -1121,14 +1112,14 @@ impl agent_ttrpc::AgentService for AgentService {
let mut sandbox = self.sandbox.lock().await;
sandbox.rtnl.update_routes(new_routes).await.map_err(|e| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INTERNAL,
format!("Failed to update routes: {:?}", e),
)
})?;
let list = sandbox.rtnl.list_routes().await.map_err(|e| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INTERNAL,
format!("Failed to list routes after update: {:?}", e),
)
@@ -1146,11 +1137,11 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::UpdateEphemeralMountsRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "update_mounts", req);
- is_allowed!(req);
+ is_allowed(&req)?;
- match update_ephemeral_mounts(sl!(), req.storages.to_vec(), self.sandbox.clone()).await {
+ match update_ephemeral_mounts(sl(), req.storages.to_vec(), self.sandbox.clone()).await {
Ok(_) => Ok(Empty::new()),
- Err(e) => Err(ttrpc_error!(
+ Err(e) => Err(ttrpc_error(
ttrpc::Code::INTERNAL,
format!("Failed to update mounts: {:?}", e),
)),
@@ -1163,9 +1154,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: GetIPTablesRequest,
) -> ttrpc::Result<GetIPTablesResponse> {
trace_rpc_call!(ctx, "get_iptables", req);
- is_allowed!(req);
+ is_allowed(&req)?;
- info!(sl!(), "get_ip_tables: request received");
+ info!(sl(), "get_ip_tables: request received");
// the binary could exist in either /usr/sbin or /sbin
// check both places and return the one that exists
@@ -1190,8 +1181,8 @@ impl agent_ttrpc::AgentService for AgentService {
..Default::default()
}),
Err(e) => {
- warn!(sl!(), "failed to run {}: {:?}", cmd, e.kind());
- return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
+ warn!(sl(), "failed to run {}: {:?}", cmd, e.kind());
+ return Err(ttrpc_error(ttrpc::Code::INTERNAL, e));
}
}
}
@@ -1202,9 +1193,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: SetIPTablesRequest,
) -> ttrpc::Result<SetIPTablesResponse> {
trace_rpc_call!(ctx, "set_iptables", req);
- is_allowed!(req);
+ is_allowed(&req)?;
- info!(sl!(), "set_ip_tables request received");
+ info!(sl(), "set_ip_tables request received");
// the binary could exist in both /usr/sbin and /sbin
// check both places and return the one that exists
@@ -1233,8 +1224,8 @@ impl agent_ttrpc::AgentService for AgentService {
{
Ok(child) => child,
Err(e) => {
- warn!(sl!(), "failure to spawn {}: {:?}", cmd, e.kind());
- return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
+ warn!(sl(), "failure to spawn {}: {:?}", cmd, e.kind());
+ return Err(ttrpc_error(ttrpc::Code::INTERNAL, e));
}
};
@@ -1242,9 +1233,9 @@ impl agent_ttrpc::AgentService for AgentService {
Some(si) => si,
None => {
println!("failed to get stdin from child");
- return Err(ttrpc_error!(
+ return Err(ttrpc_error(
ttrpc::Code::INTERNAL,
- "failed to take stdin from child".to_string()
+ "failed to take stdin from child".to_string(),
));
}
};
@@ -1254,12 +1245,12 @@ impl agent_ttrpc::AgentService for AgentService {
let _ = match stdin.write_all(&req.data) {
Ok(o) => o,
Err(e) => {
- warn!(sl!(), "error writing stdin: {:?}", e.kind());
+ warn!(sl(), "error writing stdin: {:?}", e.kind());
return;
}
};
if tx.send(1).is_err() {
- warn!(sl!(), "stdin writer thread receiver dropped");
+ warn!(sl(), "stdin writer thread receiver dropped");
};
});
@@ -1267,16 +1258,16 @@ impl agent_ttrpc::AgentService for AgentService {
.await
.is_err()
{
- return Err(ttrpc_error!(
+ return Err(ttrpc_error(
ttrpc::Code::INTERNAL,
- "timeout waiting for stdin writer to complete".to_string()
+ "timeout waiting for stdin writer to complete".to_string(),
));
}
if handle.await.is_err() {
- return Err(ttrpc_error!(
+ return Err(ttrpc_error(
ttrpc::Code::INTERNAL,
- "stdin writer thread failure".to_string()
+ "stdin writer thread failure".to_string(),
));
}
@@ -1284,24 +1275,24 @@ impl agent_ttrpc::AgentService for AgentService {
Ok(o) => o,
Err(e) => {
warn!(
- sl!(),
+ sl(),
"failure waiting for spawned {} to complete: {:?}",
cmd,
e.kind()
);
- return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
+ return Err(ttrpc_error(ttrpc::Code::INTERNAL, e));
}
};
if !output.status.success() {
- warn!(sl!(), "{} failed: {:?}", cmd, output.stderr);
- return Err(ttrpc_error!(
+ warn!(sl(), "{} failed: {:?}", cmd, output.stderr);
+ return Err(ttrpc_error(
ttrpc::Code::INTERNAL,
format!(
"{} failed: {:?}",
cmd,
String::from_utf8_lossy(&output.stderr)
- )
+ ),
));
}
@@ -1317,7 +1308,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::ListInterfacesRequest,
) -> ttrpc::Result<Interfaces> {
trace_rpc_call!(ctx, "list_interfaces", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let list = self
.sandbox
@@ -1327,7 +1318,7 @@ impl agent_ttrpc::AgentService for AgentService {
.list_interfaces()
.await
.map_err(|e| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INTERNAL,
format!("Failed to list interfaces: {:?}", e),
)
@@ -1345,7 +1336,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::ListRoutesRequest,
) -> ttrpc::Result<Routes> {
trace_rpc_call!(ctx, "list_routes", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let list = self
.sandbox
@@ -1354,7 +1345,7 @@ impl agent_ttrpc::AgentService for AgentService {
.rtnl
.list_routes()
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?;
Ok(protocols::agent::Routes {
Routes: list,
@@ -1368,7 +1359,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::CreateSandboxRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "create_sandbox", req);
- is_allowed!(req);
+ is_allowed(&req)?;
{
let sandbox = self.sandbox.clone();
@@ -1383,7 +1374,7 @@ impl agent_ttrpc::AgentService for AgentService {
if !req.guest_hook_path.is_empty() {
let _ = s.add_hooks(&req.guest_hook_path).map_err(|e| {
error!(
- sl!(),
+ sl(),
"add guest hook {} failed: {:?}", req.guest_hook_path, e
);
});
@@ -1394,24 +1385,24 @@ impl agent_ttrpc::AgentService for AgentService {
}
for m in req.kernel_modules.iter() {
- load_kernel_module(m).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ load_kernel_module(m).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
}
s.setup_shared_namespaces()
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
}
- match add_storages(sl!(), req.storages.to_vec(), self.sandbox.clone(), None).await {
+ match add_storages(sl(), req.storages.to_vec(), self.sandbox.clone(), None).await {
Ok(m) => {
let sandbox = self.sandbox.clone();
let mut s = sandbox.lock().await;
s.mounts = m
}
- Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
};
- match setup_guest_dns(sl!(), req.dns.to_vec()) {
+ match setup_guest_dns(sl(), req.dns.to_vec()) {
Ok(_) => {
let sandbox = self.sandbox.clone();
let mut s = sandbox.lock().await;
@@ -1421,7 +1412,7 @@ impl agent_ttrpc::AgentService for AgentService {
.iter()
.map(|dns| s.network.set_dns(dns.to_string()));
}
- Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
};
Ok(Empty::new())
@@ -1433,7 +1424,7 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::DestroySandboxRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "destroy_sandbox", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let s = Arc::clone(&self.sandbox);
let mut sandbox = s.lock().await;
@@ -1442,7 +1433,7 @@ impl agent_ttrpc::AgentService for AgentService {
sandbox
.destroy()
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
// Close get_oom_event connection,
// otherwise it will block the shutdown of ttrpc.
sandbox.event_tx.take();
@@ -1451,13 +1442,13 @@ impl agent_ttrpc::AgentService for AgentService {
.sender
.take()
.ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INTERNAL,
"failed to get sandbox sender channel".to_string(),
)
})?
.send(1)
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1468,14 +1459,14 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::AddARPNeighborsRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "add_arp_neighbors", req);
- is_allowed!(req);
+ is_allowed(&req)?;
let neighs = req
.neighbors
.into_option()
.map(|n| n.ARPNeighbors)
.ok_or_else(|| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INVALID_ARGUMENT,
"empty add arp neighbours request".to_string(),
)
@@ -1488,7 +1479,7 @@ impl agent_ttrpc::AgentService for AgentService {
.add_arp_neighbors(neighs)
.await
.map_err(|e| {
- ttrpc_error!(
+ ttrpc_error(
ttrpc::Code::INTERNAL,
format!("Failed to add ARP neighbours: {:?}", e),
)
@@ -1502,14 +1493,14 @@ impl agent_ttrpc::AgentService for AgentService {
ctx: &TtrpcContext,
req: protocols::agent::OnlineCPUMemRequest,
) -> ttrpc::Result<Empty> {
- is_allowed!(req);
+ is_allowed(&req)?;
let s = Arc::clone(&self.sandbox);
let sandbox = s.lock().await;
trace_rpc_call!(ctx, "online_cpu_mem", req);
sandbox
.online_cpu_memory(&req)
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1520,10 +1511,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::ReseedRandomDevRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "reseed_random_dev", req);
- is_allowed!(req);
+ is_allowed(&req)?;
random::reseed_rng(req.data.as_slice())
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1534,9 +1525,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::GuestDetailsRequest,
) -> ttrpc::Result<GuestDetailsResponse> {
trace_rpc_call!(ctx, "get_guest_details", req);
- is_allowed!(req);
+ is_allowed(&req)?;
- info!(sl!(), "get guest details!");
+ info!(sl(), "get guest details!");
let mut resp = GuestDetailsResponse::new();
// to get memory block size
match get_memory_info(
@@ -1550,8 +1541,8 @@ impl agent_ttrpc::AgentService for AgentService {
resp.support_mem_hotplug_probe = v;
}
Err(e) => {
- info!(sl!(), "fail to get memory info!");
- return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
+ info!(sl(), "fail to get memory info!");
+ return Err(ttrpc_error(ttrpc::Code::INTERNAL, e));
}
}
@@ -1568,10 +1559,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::MemHotplugByProbeRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "mem_hotplug_by_probe", req);
- is_allowed!(req);
+ is_allowed(&req)?;
do_mem_hotplug_by_probe(&req.memHotplugProbeAddr)
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1582,10 +1573,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::SetGuestDateTimeRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "set_guest_date_time", req);
- is_allowed!(req);
+ is_allowed(&req)?;
do_set_guest_date_time(req.Sec, req.Usec)
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1596,9 +1587,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::CopyFileRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "copy_file", req);
- is_allowed!(req);
+ is_allowed(&req)?;
- do_copy_file(&req).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ do_copy_file(&req).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1609,10 +1600,10 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::GetMetricsRequest,
) -> ttrpc::Result<Metrics> {
trace_rpc_call!(ctx, "get_metrics", req);
- is_allowed!(req);
+ is_allowed(&req)?;
match get_metrics(&req) {
- Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)),
+ Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)),
Ok(s) => {
let mut metrics = Metrics::new();
metrics.set_metrics(s);
@@ -1626,7 +1617,7 @@ impl agent_ttrpc::AgentService for AgentService {
_ctx: &TtrpcContext,
req: protocols::agent::GetOOMEventRequest,
) -> ttrpc::Result<OOMEvent> {
- is_allowed!(req);
+ is_allowed(&req)?;
let sandbox = self.sandbox.clone();
let s = sandbox.lock().await;
let event_rx = &s.event_rx.clone();
@@ -1635,7 +1626,7 @@ impl agent_ttrpc::AgentService for AgentService {
drop(sandbox);
if let Some(container_id) = event_rx.recv().await {
- info!(sl!(), "get_oom_event return {}", &container_id);
+ info!(sl(), "get_oom_event return {}", &container_id);
let mut resp = OOMEvent::new();
resp.container_id = container_id;
@@ -1643,7 +1634,7 @@ impl agent_ttrpc::AgentService for AgentService {
return Ok(resp);
}
- Err(ttrpc_error!(ttrpc::Code::INTERNAL, ""))
+ Err(ttrpc_error(ttrpc::Code::INTERNAL, ""))
}
async fn get_volume_stats(
@@ -1652,9 +1643,9 @@ impl agent_ttrpc::AgentService for AgentService {
req: VolumeStatsRequest,
) -> ttrpc::Result<VolumeStatsResponse> {
trace_rpc_call!(ctx, "get_volume_stats", req);
- is_allowed!(req);
+ is_allowed(&req)?;
- info!(sl!(), "get volume stats!");
+ info!(sl(), "get volume stats!");
let mut resp = VolumeStatsResponse::new();
let mut condition = VolumeCondition::new();
@@ -1665,8 +1656,8 @@ impl agent_ttrpc::AgentService for AgentService {
condition.message = String::from("OK");
}
Err(e) => {
- info!(sl!(), "failed to open the volume");
- return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e));
+ info!(sl(), "failed to open the volume");
+ return Err(ttrpc_error(ttrpc::Code::INTERNAL, e));
}
};
@@ -1675,12 +1666,12 @@ impl agent_ttrpc::AgentService for AgentService {
// to get volume capacity stats
get_volume_capacity_stats(&req.volume_guest_path)
.map(|u| usage_vec.push(u))
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
// to get volume inode stats
get_volume_inode_stats(&req.volume_guest_path)
.map(|u| usage_vec.push(u))
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
resp.usage = usage_vec;
resp.volume_condition = MessageField::some(condition);
@@ -1693,11 +1684,11 @@ impl agent_ttrpc::AgentService for AgentService {
req: protocols::agent::AddSwapRequest,
) -> ttrpc::Result<Empty> {
trace_rpc_call!(ctx, "add_swap", req);
- is_allowed!(req);
+ is_allowed(&req)?;
do_add_swap(&self.sandbox, &req)
.await
- .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?;
+ .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?;
Ok(Empty::new())
}
@@ -1724,7 +1715,7 @@ impl health_ttrpc::Health for HealthService {
_ctx: &TtrpcContext,
req: protocols::health::CheckRequest,
) -> ttrpc::Result<VersionCheckResponse> {
- info!(sl!(), "version {:?}", req);
+ info!(sl(), "version {:?}", req);
let mut rep = protocols::health::VersionCheckResponse::new();
rep.agent_version = AGENT_VERSION.to_string();
rep.grpc_version = API_VERSION.to_string();
@@ -1745,17 +1736,17 @@ fn get_memory_info(
match fs::read_to_string(block_size_path) {
Ok(v) => {
if v.is_empty() {
- warn!(sl!(), "file {} is empty", block_size_path);
+ warn!(sl(), "file {} is empty", block_size_path);
return Err(anyhow!(ERR_INVALID_BLOCK_SIZE));
}
size = u64::from_str_radix(v.trim(), 16).map_err(|_| {
- warn!(sl!(), "failed to parse the str {} to hex", size);
+ warn!(sl(), "failed to parse the str {} to hex", size);
anyhow!(ERR_INVALID_BLOCK_SIZE)
})?;
}
Err(e) => {
- warn!(sl!(), "memory block size error: {:?}", e.kind());
+ warn!(sl(), "memory block size error: {:?}", e.kind());
if e.kind() != std::io::ErrorKind::NotFound {
return Err(anyhow!(e));
}
@@ -1767,7 +1758,7 @@ fn get_memory_info(
match stat::stat(hotplug_probe_path) {
Ok(_) => plug = true,
Err(e) => {
- warn!(sl!(), "hotplug memory error: {:?}", e);
+ warn!(sl(), "hotplug memory error: {:?}", e);
match e {
nix::Error::ENOENT => plug = false,
_ => return Err(anyhow!(e)),
@@ -1873,7 +1864,7 @@ pub async fn start(
.register_service(hservice)
.register_service(iservice);
- info!(sl!(), "ttRPC server started"; "address" => server_address);
+ info!(sl(), "ttRPC server started"; "address" => server_address);
Ok(server)
}
@@ -1948,7 +1939,7 @@ fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result<()> {
for m in cmounts.iter() {
if let Err(err) = sandbox.unset_and_remove_sandbox_storage(m) {
error!(
- sl!(),
+ sl(),
"failed to unset_and_remove_sandbox_storage for container {}, error: {:?}",
cid,
err
@@ -1984,7 +1975,7 @@ fn is_signal_handled(proc_status_file: &str, signum: u32) -> bool {
return fs::metadata(proc_status_file).is_ok();
} else if signum > 64 {
// Ensure invalid signum won't break bit shift logic
- warn!(sl!(), "received invalid signum {}", signum);
+ warn!(sl(), "received invalid signum {}", signum);
return false;
} else {
(signum - 1).into()
@@ -1994,7 +1985,7 @@ fn is_signal_handled(proc_status_file: &str, signum: u32) -> bool {
let file = match File::open(proc_status_file) {
Ok(f) => f,
Err(_) => {
- warn!(sl!(), "failed to open file {}", proc_status_file);
+ warn!(sl(), "failed to open file {}", proc_status_file);
return false;
}
};
@@ -2174,7 +2165,7 @@ pub fn setup_bundle(cid: &str, spec: &mut Spec) -> Result<PathBuf> {
let rootfs_exists = Path::new(&rootfs_path).exists();
info!(
- sl!(),
+ &sl(),
"The rootfs_path is {:?} and exists: {}", rootfs_path, rootfs_exists
);
@@ -2186,7 +2177,7 @@ pub fn setup_bundle(cid: &str, spec: &mut Spec) -> Result<PathBuf> {
"bind",
MsFlags::MS_BIND,
"",
- &sl!(),
+ &sl(),
)?;
}
@@ -2222,7 +2213,7 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> {
}
info!(
- sl!(),
+ sl(),
"load_kernel_module {}: {:?}", module.name, module.parameters
);
@@ -3008,7 +2999,7 @@ OtherField:other
for cmd in iptables_cmd_list {
if !check_command(cmd) {
warn!(
- sl!(),
+ sl(),
"one or more commands for ip tables test are missing, skip it"
);
return;
diff --git a/src/agent/src/tracer.rs b/src/agent/src/tracer.rs
index bad4a6f50..1199b601c 100644
--- a/src/agent/src/tracer.rs
+++ b/src/agent/src/tracer.rs
@@ -69,7 +69,7 @@ macro_rules! trace_rpc_call {
propagator.extract(&extract_carrier_from_ttrpc($ctx))
});
- info!(sl!(), "rpc call from shim to agent: {:?}", $name);
+ info!(sl(), "rpc call from shim to agent: {:?}", $name);
// generate tracing span
let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req);
diff --git a/src/agent/src/uevent.rs b/src/agent/src/uevent.rs
index 5d1f55494..53b7c103d 100644
--- a/src/agent/src/uevent.rs
+++ b/src/agent/src/uevent.rs
@@ -19,11 +19,9 @@ use tokio::sync::watch::Receiver;
use tokio::sync::Mutex;
use tracing::instrument;
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
- () => {
- slog_scope::logger().new(o!("subsystem" => "uevent"))
- };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+ slog_scope::logger().new(o!("subsystem" => "uevent"))
}
#[derive(Debug, Default, Clone, PartialEq, Eq)]
@@ -120,11 +118,11 @@ pub async fn wait_for_uevent(
) -> Result<Uevent> {
let logprefix = format!("Waiting for {:?}", &matcher);
- info!(sl!(), "{}", logprefix);
+ info!(sl(), "{}", logprefix);
let mut sb = sandbox.lock().await;
for uev in sb.uevent_map.values() {
if matcher.is_match(uev) {
- info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev);
+ info!(sl(), "{}: found {:?} in uevent map", logprefix, &uev);
return Ok(uev.clone());
}
}
@@ -139,9 +137,9 @@ pub async fn wait_for_uevent(
sb.uevent_watchers.push(Some((Box::new(matcher), tx)));
drop(sb); // unlock
- info!(sl!(), "{}: waiting on channel", logprefix);
+ info!(sl(), "{}: waiting on channel", logprefix);
- let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout;
+ let hotplug_timeout = AGENT_CONFIG.hotplug_timeout;
let uev = match tokio::time::timeout(hotplug_timeout, rx).await {
Ok(v) => v?,
@@ -157,7 +155,7 @@ pub async fn wait_for_uevent(
}
};
- info!(sl!(), "{}: found {:?} on channel", logprefix, &uev);
+ info!(sl(), "{}: found {:?} on channel", logprefix, &uev);
Ok(uev)
}
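The timeout path above follows the standard tokio pattern of racing a channel receiver against `tokio::time::timeout`. A self-contained sketch (the uevent string and the 3-second timeout are made up; the real code sends a `Uevent` struct through a watcher channel and uses the configured `hotplug_timeout`):

```rust
use tokio::sync::oneshot;
use tokio::time::{timeout, Duration};

#[tokio::main]
async fn main() {
    let (tx, rx) = oneshot::channel::<String>();

    // Stands in for a uevent watcher registered on the sandbox.
    tokio::spawn(async move {
        let _ = tx.send("add@/devices/virtual/block/vda".to_string());
    });

    // Mirrors `tokio::time::timeout(hotplug_timeout, rx).await` above.
    match timeout(Duration::from_secs(3), rx).await {
        Ok(Ok(uev)) => println!("found {} on channel", uev),
        Ok(Err(_)) => eprintln!("watcher dropped without sending"),
        Err(_) => eprintln!("timed out waiting for uevent"),
    }
}
```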
diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs
index c7c9cb082..91f9406b8 100644
--- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs
+++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs
@@ -341,7 +341,10 @@ impl DragonballInner {
// cannot exceed maximum value
if new_vcpus > self.config.cpu_info.default_maxvcpus {
- return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus"));
+ warn!(
+ sl!(),
+ "Cannot allocate more vcpus than the max allowed number of vcpus. The maximum allowed amount of vcpus will be used instead.");
+ return Ok((current_vcpus, self.config.cpu_info.default_maxvcpus));
}
Ok((current_vcpus, new_vcpus))
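The hunk above changes failure semantics: a resize request beyond `default_maxvcpus` no longer errors out but degrades to the maximum. The decision logic in isolation, as a simplified sketch (the real method reads the limit from `self.config.cpu_info`):

```rust
// Returns (current, target) vcpu counts, clamping the target to `max`.
fn resize_vcpus(current: u32, requested: u32, max: u32) -> (u32, u32) {
    if requested > max {
        // Old behavior: return an error and fail the whole resize.
        // New behavior: warn and fall back to the maximum.
        eprintln!("warn: {} vcpus requested, clamping to max {}", requested, max);
        return (current, max);
    }
    (current, requested)
}

fn main() {
    assert_eq!(resize_vcpus(2, 16, 8), (2, 8)); // clamped to the maximum
    assert_eq!(resize_vcpus(2, 4, 8), (2, 4)); // within the limit
}
```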
diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs
index 53eccc52b..2a8b6e600 100644
--- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs
+++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs
@@ -105,7 +105,12 @@ impl InitialSizeManager {
hv.cpu_info.default_vcpus = self.resource.vcpu as i32
}
if self.resource.mem_mb > 0 {
- hv.memory_info.default_memory = self.resource.mem_mb;
+ // The memory overhead introduced by the kata-agent and system components
+ // reduces the memory actually available to the user, so we add the
+ // requested amount to default_memory here instead of overriding it.
+ // (If we overrode default_memory and user applications still used as
+ // much memory as they originally expected, they could easily hit OOM.)
+ hv.memory_info.default_memory += self.resource.mem_mb;
}
Ok(())
}
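The one-character change from `=` to `+=` above is load-bearing: the pod's memory request is added on top of the configured default instead of replacing it, preserving the headroom reserved for the kata-agent and other guest system components. A toy illustration of the sizing arithmetic (the numbers are invented):

```rust
// Guest memory = configured default (covers agent/system overhead)
//              + what the workload actually requested.
fn guest_memory_mb(default_memory: u32, workload_request: u32) -> u32 {
    // Before: `default_memory = workload_request` handed the guest only
    // what the workload asked for, leaving no room for overhead.
    default_memory + workload_request
}

fn main() {
    // 2048 MiB configured default + 1024 MiB requested by the pod spec
    assert_eq!(guest_memory_mb(2048, 1024), 3072);
}
```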
diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go
index f17480aba..f74cb9f89 100644
--- a/src/runtime/cmd/kata-runtime/kata-env.go
+++ b/src/runtime/cmd/kata-runtime/kata-env.go
@@ -17,7 +17,7 @@ import (
"github.com/prometheus/procfs"
"github.com/urfave/cli"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
@@ -113,8 +113,8 @@ type HypervisorInfo struct {
SocketPath string
Msize9p uint32
MemorySlots uint32
- PCIeRootPort uint32
- ColdPlugVFIO hv.PCIePort
+ HotPlugVFIO config.PCIePort
+ ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
Debug bool
}
@@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
+ HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
- PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
SocketPath: socketPath,
}, nil
}
diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go
index 3760104d0..c8d4d0ea9 100644
--- a/src/runtime/cmd/kata-runtime/kata-env_test.go
+++ b/src/runtime/cmd/kata-runtime/kata-env_test.go
@@ -19,12 +19,12 @@ import (
"testing"
"github.com/BurntSushi/toml"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/urfave/cli"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
@@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error {
return nil
}
-func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
- var coldPlugVFIO hv.PCIePort
+func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) {
+ var hotPlugVFIO config.PCIePort
+ var coldPlugVFIO config.PCIePort
const logPath = "/log/path"
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
kernelPath := filepath.Join(prefixDir, "kernel")
@@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
blockStorageDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
- pcieRootPort := uint32(2)
- coldPlugVFIO = hv.NoPort
+ hotPlugVFIO = config.BridgePort
+ coldPlugVFIO = config.NoPort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
@@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
+ HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
- PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: hypConfig.NumVCPUs,
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
@@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
return "", oci.RuntimeConfig{}, err
}
- _, config, err = katautils.LoadConfiguration(configFile, true)
+ _, ociConfig, err = katautils.LoadConfiguration(configFile, true)
if err != nil {
return "", oci.RuntimeConfig{}, err
}
- return configFile, config, nil
+ return configFile, ociConfig, nil
}
func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) {
@@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
- PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
+ HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
}
diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in
index e7c115f92..1cc8b54dc 100644
--- a/src/runtime/config/configuration-clh.toml.in
+++ b/src/runtime/config/configuration-clh.toml.in
@@ -131,6 +131,11 @@ default_maxmemory = @DEFMAXMEMSZ@
# Shared file system type:
# - virtio-fs (default)
# - virtio-fs-nydus
+# - none
+# WARNING: "none" should be carefully used, and only used in very few specific cases, as
+# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
+# issues with rotation of secrets, certs, or configurations via kubernetes objects like
+# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@"
# Path to vhost-user-fs daemon.
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
index 33574b17d..4861cb1ed 100644
--- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
+++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in
@@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
+# - none
+# WARNING: "none" should be carefully used, and only used in very few specific cases, as
+# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
+# issues with rotation of secrets, certs, or configurations via kubernetes objects like
+# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
# Path to vhost-user-fs daemon.
diff --git a/src/runtime/config/configuration-qemu-sev.toml.in b/src/runtime/config/configuration-qemu-sev.toml.in
index 7571cc6d0..db373e6cc 100644
--- a/src/runtime/config/configuration-qemu-sev.toml.in
+++ b/src/runtime/config/configuration-qemu-sev.toml.in
@@ -186,6 +186,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
+# - none
+# WARNING: "none" should be carefully used, and only used in very few specific cases, as
+# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
+# issues with rotation of secrets, certs, or configurations via kubernetes objects like
+# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_SEV_VIRTIOFS@"
# Path to vhost-user-fs daemon.
@@ -669,4 +674,4 @@ service_offload = @DEFSERVICEOFFLOAD@
#
# Keys can be remotely provisioned. The Kata agent fetches them from e.g.
# a HTTPS URL:
-#provision=https://my-key-broker.foo/tenant/
\ No newline at end of file
+#provision=https://my-key-broker.foo/tenant/
diff --git a/src/runtime/config/configuration-qemu-snp.toml.in b/src/runtime/config/configuration-qemu-snp.toml.in
index d4fd77a88..f7aa09678 100644
--- a/src/runtime/config/configuration-qemu-snp.toml.in
+++ b/src/runtime/config/configuration-qemu-snp.toml.in
@@ -184,6 +184,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
+# - none
+# WARNING: "none" should be carefully used, and only used in very few specific cases, as
+# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
+# issues with rotation of secrets, certs, or configurations via kubernetes objects like
+# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_SNP_VIRTIOFS@"
# Path to vhost-user-fs daemon.
diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in
index 384bea041..e679677c8 100644
--- a/src/runtime/config/configuration-qemu-tdx.toml.in
+++ b/src/runtime/config/configuration-qemu-tdx.toml.in
@@ -172,6 +172,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
+# - none
+# WARNING: "none" should be carefully used, and only used in very few specific cases, as
+# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
+# issues with rotation of secrets, certs, or configurations via kubernetes objects like
+# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@"
# Path to vhost-user-fs daemon.
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in
index bc0dcb736..8d34386a3 100644
--- a/src/runtime/config/configuration-qemu.toml.in
+++ b/src/runtime/config/configuration-qemu.toml.in
@@ -206,6 +206,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
+# - none
+# WARNING: "none" should be carefully used, and only used in very few specific cases, as
+# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
+# issues with rotation of secrets, certs, or configurations via kubernetes objects like
+# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
# Path to vhost-user-fs daemon.
@@ -380,8 +385,15 @@ pflashes = []
# Default false
#hotplug_vfio_on_root_bus = true
+# Enable hot-plugging of VFIO devices to a bridge-port,
+# root-port or switch-port.
+# The default setting is "no-port"
+#hot_plug_vfio = "root-port"
+
# In a confidential compute environment hot-plugging can compromise
-# security. Enable cold-plugging of VFIO devices to a root-port.
+# security.
+# Enable cold-plugging of VFIO devices to a bridge-port,
+# root-port or switch-port.
# The default setting is "no-port", which means disabled.
#cold_plug_vfio = "root-port"
diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go
index e3e8e9369..c24e3ced3 100644
--- a/src/runtime/pkg/containerd-shim-v2/create_test.go
+++ b/src/runtime/pkg/containerd-shim-v2/create_test.go
@@ -20,7 +20,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
@@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) {
assert.Error(err)
}
-func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
- var coldPlugVFIO hv.PCIePort
+func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) {
+ var hotPlugVFIO config.PCIePort
+ var coldPlugVFIO config.PCIePort
if dir == "" {
return "", fmt.Errorf("BUG: need directory")
}
@@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
blockDeviceDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
- pcieRootPort := uint32(2)
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
- coldPlugVFIO = hv.RootPort
+ hotPlugVFIO = config.BridgePort
+ coldPlugVFIO = config.RootPort
configFileOptions := ktu.RuntimeConfigOptions{
Hypervisor: "qemu",
@@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
- PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
+ HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
}
diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go
index dee6291ed..ef2a5c4b0 100644
--- a/src/runtime/pkg/device/config/config.go
+++ b/src/runtime/pkg/device/config/config.go
@@ -81,6 +81,17 @@ const (
// VirtioFSNydus means use nydus for the shared file system
VirtioFSNydus = "virtio-fs-nydus"
+
+ // NoSharedFS means *no* shared file system solution will be used
+ // and files will be copied into the guest system.
+ //
+ // WARNING: This should be used carefully, and only in very few
+ // specific cases, as any update to the mount will *NOT* be reflected
+ // during the lifecycle of the pod, causing issues with rotation of
+ // secrets, certs, or configurations via kubernetes objects like
+ // configMaps or secrets, as those will be copied into the guest at
+ // *pod* *creation* *time*.
+ NoSharedFS = "none"
)
const (
@@ -114,14 +125,117 @@ const (
// SysDevPrefix is static string of /sys/dev
var SysDevPrefix = "/sys/dev"
-// SysIOMMUPath is static string of /sys/kernel/iommu_groups
-var SysIOMMUPath = "/sys/kernel/iommu_groups"
+// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
+var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
var getSysDevPath = getSysDevPathImpl
+// PCIePortBusPrefix gives us the correct bus naming depending on the port
+// used to hot(cold)-plug the device
+type PCIePortBusPrefix string
+
+const (
+ PCIeRootPortPrefix PCIePortBusPrefix = "rp"
+ PCIeSwitchPortPrefix PCIePortBusPrefix = "sw"
+ PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup"
+ PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
+ PCIBridgePortPrefix PCIePortBusPrefix = "bp"
+)
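+
+// For example, the first device hot-plugged to a root-port is placed on
+// bus "rp0", the second on "rp1"; a device behind a switch lands on a
+// downstream-port bus such as "swdp0".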
+
+func (p PCIePortBusPrefix) String() string {
+ switch p {
+ case PCIeRootPortPrefix:
+ fallthrough
+ case PCIeSwitchPortPrefix:
+ fallthrough
+ case PCIeSwitchUpstreamPortPrefix:
+ fallthrough
+ case PCIeSwitchhDownstreamPortPrefix:
+ fallthrough
+ case PCIBridgePortPrefix:
+ return string(p)
+ }
+ return fmt.Sprintf("", string(p))
+}
+
+// PCIePort distinguishes between root-port, switch-port, bridge-port and no-port
+type PCIePort string
+
+const (
+ // RootPort attach VFIO devices to a root-port
+ RootPort PCIePort = "root-port"
+ // SwitchPort attach VFIO devices to a switch-port
+ SwitchPort = "switch-port"
+ // BridgePort is the default
+ BridgePort = "bridge-port"
+ // NoPort is for disabling VFIO hotplug/coldplug
+ NoPort = "no-port"
+ // InvalidPort is for invalid port
+ InvalidPort = "invalid-port"
+)
+
+func (p PCIePort) String() string {
+ switch p {
+ case RootPort:
+ fallthrough
+ case SwitchPort:
+ fallthrough
+ case BridgePort:
+ fallthrough
+ case NoPort:
+ fallthrough
+ case InvalidPort:
+ return string(p)
+ }
+ return fmt.Sprintf("", string(p))
+}
+
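+// PCIePortPrefixMapping maps a PCIePort to the bus prefix used when
+// naming the buses that devices are attached to. A switch-port maps to
+// the downstream-port prefix, since devices are plugged into a switch's
+// downstream ports.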
+var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
+ RootPort: PCIeRootPortPrefix,
+ SwitchPort: PCIeSwitchhDownstreamPortPrefix,
+ BridgePort: PCIBridgePortPrefix,
+}
+
+func (p PCIePort) Invalid() bool {
+ switch p {
+ case RootPort:
+ fallthrough
+ case SwitchPort:
+ fallthrough
+ case BridgePort:
+ fallthrough
+ case NoPort:
+ return false
+ }
+ return true
+}
+
+func (p PCIePort) Valid() bool {
+ switch p {
+ case RootPort:
+ fallthrough
+ case SwitchPort:
+ fallthrough
+ case BridgePort:
+ fallthrough
+ case NoPort:
+ return true
+ }
+ return false
+}
+
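+// PCIePortMapping tracks, per device BDF, whether the device has
+// already been attached to a port of the given type.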
+type PCIePortMapping map[string]bool
+
+var (
+ // Each of these structures keeps track of the devices attached to the
+ // different types of PCIe ports. We can deduce the bus number from it
+ // and eliminate duplicate assignments.
+ PCIeDevices = map[PCIePort]PCIePortMapping{}
+)
+
// DeviceInfo is an embedded type that contains device data common to all types of devices.
type DeviceInfo struct {
// DriverOptions is specific options for each device driver
@@ -167,6 +281,9 @@ type DeviceInfo struct {
// ColdPlug specifies whether the device must be cold plugged (true)
// or hot plugged (false).
ColdPlug bool
+
+ // Specifies the PCIe port type to which the device is attached
+ Port PCIePort
}
// BlockDrive represents a block storage drive which may be used in case the storage
@@ -268,14 +385,8 @@ const (
VFIOAPDeviceMediatedType
)
-type VFIODev interface {
- GetID() *string
- GetType() VFIODeviceType
- GetSysfsDev() *string
-}
-
-// VFIOPCIDev represents a VFIO PCI device used for hotplugging
-type VFIOPCIDev struct {
+// VFIODev represents a VFIO PCI device used for hotplugging
+type VFIODev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
@@ -305,44 +416,15 @@ type VFIOPCIDev struct {
// IsPCIe specifies device is PCIe or PCI
IsPCIe bool
-}
-
-func (d VFIOPCIDev) GetID() *string {
- return &d.ID
-}
-
-func (d VFIOPCIDev) GetType() VFIODeviceType {
- return d.Type
-}
-
-func (d VFIOPCIDev) GetSysfsDev() *string {
- return &d.SysfsDev
-}
-
-type VFIOAPDev struct {
- // ID is used to identify this drive in the hypervisor options.
- ID string
-
- // sysfsdev of VFIO mediated device
- SysfsDev string
// APDevices are the Adjunct Processor devices assigned to the mdev
APDevices []string
- // Type of VFIO device
- Type VFIODeviceType
-}
+ // Rank identifies a device in an IOMMU group
+ Rank int
-func (d VFIOAPDev) GetID() *string {
- return &d.ID
-}
-
-func (d VFIOAPDev) GetType() VFIODeviceType {
- return d.Type
-}
-
-func (d VFIOAPDev) GetSysfsDev() *string {
- return &d.SysfsDev
+ // Port is the PCIe port type to which the device is attached
+ Port PCIePort
}
// RNGDev represents a random number generator device
diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go
index bfffa31a2..8c9055ae2 100644
--- a/src/runtime/pkg/device/drivers/utils.go
+++ b/src/runtime/pkg/device/drivers/utils.go
@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
return api.DeviceLogger()
}
-// Identify PCIe device by reading the size of the PCI config space
+// IsPCIeDevice identifies PCIe device by reading the size of the PCI config space
// Plain PCI device have 256 bytes of config space where PCIe devices have 4K
-func isPCIeDevice(bdf string) bool {
+func IsPCIeDevice(bdf string) bool {
if len(strings.Split(bdf, ":")) == 2 {
bdf = PCIDomain + ":" + bdf
}
@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
// We can reuse this function at various levels, sandbox, container.
-// Only the VFIO module is allowed to do bus assignments, all other modules need to
-// ignore it if used as helper function to get VFIO information.
-func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
+func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
vfioDevs := []*config.VFIODev{}
vfioGroup := filepath.Base(device.HostPath)
- iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
+ iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
if err != nil {
@@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
- deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
+ deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return nil, err
}
@@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
- isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
- vfioPCI := config.VFIOPCIDev{
+ vfio = config.VFIODev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
- IsPCIe: isPCIe,
+ IsPCIe: IsPCIeDevice(deviceBDF),
Class: pciClass,
+ Rank: -1,
+ Port: device.Port,
}
- if isPCIe && !ignoreBusAssignment {
- vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
- AllPCIeDevs[deviceBDF] = true
- }
- vfio = vfioPCI
+
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return nil, err
}
- vfio = config.VFIOAPDev{
+ vfio = config.VFIODev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go
index 106220dcf..801b1a81f 100644
--- a/src/runtime/pkg/device/drivers/vfio.go
+++ b/src/runtime/pkg/device/drivers/vfio.go
@@ -28,14 +28,9 @@ const (
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
vfioDevPath = "/dev/vfio/%s"
- pcieRootPortPrefix = "rp"
vfioAPSysfsDir = "/sys/devices/vfio_ap"
)
-var (
- AllPCIeDevs = map[string]bool{}
-)
-
// VFIODevice is a vfio device meant to be passed to the hypervisor
// to be used by the Virtual Machine.
type VFIODevice struct {
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
}
}()
- device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
+ device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
if err != nil {
return err
}
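+ // Derive a unique bus name for each PCIe device from its port's bus
+ // prefix and the number of devices already tracked for that port type,
+ // e.g. the first root-port device is assigned bus "rp0".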
+ for _, vfio := range device.VfioDevs {
+ if vfio.IsPCIe {
+ busIndex := len(config.PCIeDevices[vfio.Port])
+ vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
+ config.PCIeDevices[vfio.Port][vfio.BDF] = true
+ }
+ }
coldPlug := device.DeviceInfo.ColdPlug
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
for _, dev := range ds.VFIODevs {
var vfio config.VFIODev
- vfioDeviceType := (*device.VfioDevs[0]).GetType()
- switch vfioDeviceType {
+ switch dev.Type {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
- bdf := ""
- if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
- bdf = pciDev.BDF
- }
- vfio = config.VFIOPCIDev{
- ID: *(*dev).GetID(),
- Type: config.VFIODeviceType((*dev).GetType()),
- BDF: bdf,
- SysfsDev: *(*dev).GetSysfsDev(),
+ vfio = config.VFIODev{
+ ID: dev.ID,
+ Type: config.VFIODeviceType(dev.Type),
+ BDF: dev.BDF,
+ SysfsDev: dev.SysfsDev,
}
case config.VFIOAPDeviceMediatedType:
- vfio = config.VFIOAPDev{
- ID: *(*dev).GetID(),
- SysfsDev: *(*dev).GetSysfsDev(),
+ vfio = config.VFIODev{
+ ID: dev.ID,
+ SysfsDev: dev.SysfsDev,
}
default:
deviceLogger().WithError(
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
// here it shares function from *GenericDevice so we don't need duplicate codes
-func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
+func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
if err != nil {
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:
// Get bdf of device eg. 0000:00:1c.0
- deviceBDF = getBDF(deviceFileName)
+ // The old implementation (deviceBDF = getBDF(deviceFileName)) did not
+ // consider the case where VFIO devices are located on different root
+ // buses. The kata-agent handles that case now, so use the full PCI
+ // address here.
+ deviceBDF = deviceFileName
// Get sysfs path used by cloud-hypervisor
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
case config.VFIOPCIDeviceMediatedType:
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
- deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
+ deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
case config.VFIOAPDeviceMediatedType:
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {
// getBDF returns the BDF of pci device
// Expected input string format is []:[][].[] eg. 0000:02:10.0
-func getBDF(deviceSysStr string) string {
+func GetBDF(deviceSysStr string) string {
tokens := strings.SplitN(deviceSysStr, ":", 2)
if len(tokens) == 1 {
return ""
diff --git a/src/runtime/pkg/device/drivers/vfio_test.go b/src/runtime/pkg/device/drivers/vfio_test.go
index 6a1ab61eb..9a03fa030 100644
--- a/src/runtime/pkg/device/drivers/vfio_test.go
+++ b/src/runtime/pkg/device/drivers/vfio_test.go
@@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) {
}
data := []testData{
- {"0000:02:10.0", "02:10.0"},
+ {"0000:02:10.0", "0000:02:10.0"},
{"0000:0210.0", ""},
{"f79944e4-5a3d-11e8-99ce-", ""},
{"f79944e4-5a3d-11e8-99ce", ""},
@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
}
for _, d := range data {
- deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
+ deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:
diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go
index baf1209a7..735061d9e 100644
--- a/src/runtime/pkg/device/manager/manager.go
+++ b/src/runtime/pkg/device/manager/manager.go
@@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
dm.blockDriver = config.VirtioSCSI
}
- drivers.AllPCIeDevs = make(map[string]bool)
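+ // Reset the per-port-type tracking of assigned PCIe devices.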
+ config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
+
+ config.PCIeDevices[config.RootPort] = make(map[string]bool)
+ config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
+ config.PCIeDevices[config.BridgePort] = make(map[string]bool)
for _, dev := range devices {
dm.devices[dev.DeviceID()] = dev
@@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
}
if IsVFIO(devInfo.HostPath) {
return drivers.NewVFIODevice(&devInfo), nil
- } else if isVhostUserBlk(devInfo) {
+ } else if IsVhostUserBlk(devInfo) {
if devInfo.DriverOptions == nil {
devInfo.DriverOptions = make(map[string]string)
}
diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go
index 49e339f60..70c76b67d 100644
--- a/src/runtime/pkg/device/manager/manager_test.go
+++ b/src/runtime/pkg/device/manager/manager_test.go
@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
_, err = os.Create(deviceConfigFile)
assert.Nil(t, err)
- savedIOMMUPath := config.SysIOMMUPath
- config.SysIOMMUPath = tmpDir
+ savedIOMMUPath := config.SysIOMMUGroupPath
+ config.SysIOMMUGroupPath = tmpDir
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
config.SysBusPciDevicesPath = devicesDir
defer func() {
- config.SysIOMMUPath = savedIOMMUPath
+ config.SysIOMMUGroupPath = savedIOMMUPath
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
}()
diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go
index e78205d0c..a9e4ee8c6 100644
--- a/src/runtime/pkg/device/manager/utils.go
+++ b/src/runtime/pkg/device/manager/utils.go
@@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
}
// isVhostUserBlk checks if the device is a VhostUserBlk device.
-func isVhostUserBlk(devInfo config.DeviceInfo) bool {
+func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
}
diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go
index b57992b3d..6752719dd 100644
--- a/src/runtime/pkg/device/manager/utils_test.go
+++ b/src/runtime/pkg/device/manager/utils_test.go
@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
}
for _, d := range data {
- isVhostUserBlk := isVhostUserBlk(
+ isVhostUserBlk := IsVhostUserBlk(
config.DeviceInfo{
DevType: d.devType,
Major: d.major,
diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go
index 5ff258aed..73b6aba44 100644
--- a/src/runtime/pkg/govmm/qemu/qemu.go
+++ b/src/runtime/pkg/govmm/qemu/qemu.go
@@ -123,6 +123,14 @@ const (
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
PCIeRootPort DeviceDriver = "pcie-root-port"
+ // PCIeSwitchUpstreamPort is a PCIe switch upstream port
+ // An upstream port connects to a PCIe Root Port
+ PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
+
+ // PCIeSwitchDownstreamPort is a PCIe switch downstream port
+ // PCIe devices can be hot-plugged to the downstream port.
+ PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
+
// Loader is the Loader device driver.
Loader DeviceDriver = "loader"
@@ -236,6 +244,7 @@ const (
// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
SecExecGuest ObjectType = "s390-pv-guest"
+
// PEFGuest represent ppc64le PEF(Protected Execution Facility) object.
PEFGuest ObjectType = "pef-guest"
)
@@ -410,7 +419,6 @@ func (object Object) QemuParams(config *Config) []string {
deviceParams = append(deviceParams, string(object.Driver))
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
-
}
if len(deviceParams) > 0 {
@@ -1722,6 +1730,106 @@ func (b PCIeRootPortDevice) Valid() bool {
return true
}
+// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
+type PCIeSwitchUpstreamPortDevice struct {
+ ID string // format: sup{n}, n>=0
+ Bus string // default is rp0
+}
+
+// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
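+// For example, ID "sup0" on bus "rp0" produces:
+// -device x3130-upstream,id=sup0,bus=rp0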
+func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
+ var qemuParams []string
+ var deviceParams []string
+
+ driver := PCIeSwitchUpstreamPort
+
+ deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
+ deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
+
+ qemuParams = append(qemuParams, "-device")
+ qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
+ return qemuParams
+}
+
+// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
+func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
+ if b.ID == "" {
+ return false
+ }
+ if b.Bus == "" {
+ return false
+ }
+ return true
+}
+
+// PCIeSwitchDownstreamPortDevice is the port connecting to the switch upstream port
+type PCIeSwitchDownstreamPortDevice struct {
+ ID string // format: swdp{n}, n>=0
+ Bus string // default is sup0
+ Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
+ Slot string // >=0, default is 0x00
+ // For this to work, patches to QEMU are needed
+ BusReserve string
+ // Pref64 and Pref32 are not allowed to be set simultaneously
+ Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
+ Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
+ MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only*
+ IOReserve string // IO reservation
+}
+
+// QemuParams returns the qemu parameters built out of the PCIeSwitchDownstreamPortDevice.
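+// For example, ID "swdp0" on bus "sup0" with chassis "0x00" and slot
+// "0x00" produces:
+// -device xio3130-downstream,id=swdp0,bus=sup0,chassis=0x00,slot=0x00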
+func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
+ var qemuParams []string
+ var deviceParams []string
+ driver := PCIeSwitchDownstreamPort
+
+ deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
+ deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
+ deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
+ deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
+ if b.BusReserve != "" {
+ deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
+ }
+
+ if b.Pref64Reserve != "" {
+ deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
+ }
+
+ if b.Pref32Reserve != "" {
+ deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
+ }
+
+ if b.MemReserve != "" {
+ deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
+ }
+
+ if b.IOReserve != "" {
+ deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
+ }
+
+ qemuParams = append(qemuParams, "-device")
+ qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
+ return qemuParams
+}
+
+// Valid returns true if the PCIeSwitchDownstreamPortDevice structure is valid and complete.
+func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
+ if b.ID == "" {
+ return false
+ }
+ if b.Bus == "" {
+ return false
+ }
+ if b.Chassis == "" {
+ return false
+ }
+ if b.Slot == "" {
+ return false
+ }
+ return true
+}
+
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
type VFIODevice struct {
// Bus-Device-Function of device
diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go
index 482b7e9e2..f0ba941de 100644
--- a/src/runtime/pkg/hypervisors/hypervisor_state.go
+++ b/src/runtime/pkg/hypervisors/hypervisor_state.go
@@ -5,7 +5,7 @@
package hypervisors
-import "fmt"
+import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
// Bridge is a bridge where devices can be hot plugged
type Bridge struct {
@@ -28,37 +28,8 @@ type CPUDevice struct {
ID string
}
-// PCIePort distinguish only between root and switch port
-type PCIePort string
-
-const (
- // RootPort attach VFIO devices to a root-port
- RootPort PCIePort = "root-port"
- // SwitchPort attach VFIO devices to a switch-port
- SwitchPort = "switch-port"
- // BridgePort is the default
- BridgePort = "bridge-port"
- // NoPort is for disabling VFIO hotplug/coldplug
- NoPort = "no-port"
-)
-
-func (p PCIePort) String() string {
- switch p {
- case RootPort:
- return "root-port"
- case SwitchPort:
- return "switch-port"
- case BridgePort:
- return "bridge-port"
- case NoPort:
- return "no-port"
- }
- return fmt.Sprintf("", string(p))
-}
-
type HypervisorState struct {
BlockIndexMap map[int]struct{}
-
// Type of hypervisor, E.g. qemu/firecracker/acrn.
Type string
UUID string
@@ -74,7 +45,7 @@ type HypervisorState struct {
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
- PCIeRootPort int
- ColdPlugVFIO PCIePort
+ HotPlugVFIO config.PCIePort
+ ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
}
diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go
index 4c8257a40..ec1d85c3a 100644
--- a/src/runtime/pkg/katatestutils/utils.go
+++ b/src/runtime/pkg/katatestutils/utils.go
@@ -14,7 +14,7 @@ import (
"strconv"
"testing"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
)
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
JaegerUser string
JaegerPassword string
PFlash []string
- PCIeRootPort uint32
- ColdPlugVFIO hv.PCIePort
+ HotPlugVFIO config.PCIePort
+ ColdPlugVFIO config.PCIePort
DefaultVCPUCount uint32
DefaultMaxVCPUCount uint32
DefaultMemSize uint32
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
- pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in
index a59ce8a17..f76986876 100644
--- a/src/runtime/pkg/katautils/config-settings.go.in
+++ b/src/runtime/pkg/katautils/config-settings.go.in
@@ -10,7 +10,7 @@
package katautils
import (
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+ config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)
// name is the name of the runtime
@@ -82,7 +82,6 @@ const defaultEnableDebug bool = false
const defaultDisableNestingChecks bool = false
const defaultMsize9p uint32 = 8192
const defaultHotplugVFIOOnRootBus bool = false
-const defaultPCIeRootPort = 0
const defaultEntropySource = "/dev/urandom"
const defaultGuestHookPath string = ""
const defaultVirtioFSCacheMode = "never"
@@ -115,4 +114,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
// Default config file used by stateless systems.
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
-const defaultColdPlugVFIO = hv.NoPort
+const defaultHotPlugVFIO = config.NoPort
+const defaultColdPlugVFIO = config.NoPort
diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
index 6bff49cfe..1397954aa 100644
--- a/src/runtime/pkg/katautils/config.go
+++ b/src/runtime/pkg/katautils/config.go
@@ -20,7 +20,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -79,98 +78,98 @@ type factory struct {
}
type hypervisor struct {
- Path string `toml:"path"`
- JailerPath string `toml:"jailer_path"`
- Kernel string `toml:"kernel"`
- CtlPath string `toml:"ctlpath"`
- Initrd string `toml:"initrd"`
- Image string `toml:"image"`
- RootfsType string `toml:"rootfs_type"`
- Firmware string `toml:"firmware"`
- FirmwareVolume string `toml:"firmware_volume"`
- MachineAccelerators string `toml:"machine_accelerators"`
- CPUFeatures string `toml:"cpu_features"`
- KernelParams string `toml:"kernel_params"`
- MachineType string `toml:"machine_type"`
- BlockDeviceDriver string `toml:"block_device_driver"`
- EntropySource string `toml:"entropy_source"`
- SharedFS string `toml:"shared_fs"`
- VirtioFSDaemon string `toml:"virtio_fs_daemon"`
- VirtioFSCache string `toml:"virtio_fs_cache"`
- VhostUserStorePath string `toml:"vhost_user_store_path"`
- FileBackedMemRootDir string `toml:"file_mem_backend"`
- GuestHookPath string `toml:"guest_hook_path"`
- GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
- SeccompSandbox string `toml:"seccompsandbox"`
- GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
- GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
- GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
- SEVCertChainPath string `toml:"sev_cert_chain"`
- BlockDeviceAIO string `toml:"block_device_aio"`
- RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
- HypervisorPathList []string `toml:"valid_hypervisor_paths"`
- JailerPathList []string `toml:"valid_jailer_paths"`
- CtlPathList []string `toml:"valid_ctlpaths"`
- VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
- VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
- PFlashList []string `toml:"pflashes"`
- VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
- FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
- EntropySourceList []string `toml:"valid_entropy_sources"`
- EnableAnnotations []string `toml:"enable_annotations"`
- RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
- TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
- MemOffset uint64 `toml:"memory_offset"`
- DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
- DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
- DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
- DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
- DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
- NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
- NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
- NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
- NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
- VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
- VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
- DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
- MemorySize uint32 `toml:"default_memory"`
- MemSlots uint32 `toml:"memory_slots"`
- DefaultBridges uint32 `toml:"default_bridges"`
- Msize9p uint32 `toml:"msize_9p"`
- PCIeRootPort uint32 `toml:"pcie_root_port"`
- GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
- SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
- SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
- RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
- NumVCPUs int32 `toml:"default_vcpus"`
- BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
- BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
- BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
- EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
- VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
- DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
- MemPrealloc bool `toml:"enable_mem_prealloc"`
- HugePages bool `toml:"enable_hugepages"`
- VirtioMem bool `toml:"enable_virtio_mem"`
- IOMMU bool `toml:"enable_iommu"`
- IOMMUPlatform bool `toml:"enable_iommu_platform"`
- Debug bool `toml:"enable_debug"`
- DisableNestingChecks bool `toml:"disable_nesting_checks"`
- EnableIOThreads bool `toml:"enable_iothreads"`
- DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
- HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
- ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
- DisableVhostNet bool `toml:"disable_vhost_net"`
- GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
- ConfidentialGuest bool `toml:"confidential_guest"`
- SevSnpGuest bool `toml:"sev_snp_guest"`
- GuestSwap bool `toml:"enable_guest_swap"`
- Rootless bool `toml:"rootless"`
- DisableSeccomp bool `toml:"disable_seccomp"`
- DisableSeLinux bool `toml:"disable_selinux"`
- DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
- LegacySerial bool `toml:"use_legacy_serial"`
- GuestPreAttestation bool `toml:"guest_pre_attestation"`
+ Path string `toml:"path"`
+ JailerPath string `toml:"jailer_path"`
+ Kernel string `toml:"kernel"`
+ CtlPath string `toml:"ctlpath"`
+ Initrd string `toml:"initrd"`
+ Image string `toml:"image"`
+ RootfsType string `toml:"rootfs_type"`
+ Firmware string `toml:"firmware"`
+ FirmwareVolume string `toml:"firmware_volume"`
+ MachineAccelerators string `toml:"machine_accelerators"`
+ CPUFeatures string `toml:"cpu_features"`
+ KernelParams string `toml:"kernel_params"`
+ MachineType string `toml:"machine_type"`
+ BlockDeviceDriver string `toml:"block_device_driver"`
+ EntropySource string `toml:"entropy_source"`
+ SharedFS string `toml:"shared_fs"`
+ VirtioFSDaemon string `toml:"virtio_fs_daemon"`
+ VirtioFSCache string `toml:"virtio_fs_cache"`
+ VhostUserStorePath string `toml:"vhost_user_store_path"`
+ FileBackedMemRootDir string `toml:"file_mem_backend"`
+ GuestHookPath string `toml:"guest_hook_path"`
+ GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
+ SeccompSandbox string `toml:"seccompsandbox"`
+ GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
+ GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
+ GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
+ SEVCertChainPath string `toml:"sev_cert_chain"`
+ BlockDeviceAIO string `toml:"block_device_aio"`
+ RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
+ HypervisorPathList []string `toml:"valid_hypervisor_paths"`
+ JailerPathList []string `toml:"valid_jailer_paths"`
+ CtlPathList []string `toml:"valid_ctlpaths"`
+ VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
+ VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
+ PFlashList []string `toml:"pflashes"`
+ VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
+ FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
+ EntropySourceList []string `toml:"valid_entropy_sources"`
+ EnableAnnotations []string `toml:"enable_annotations"`
+ RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
+ TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
+ MemOffset uint64 `toml:"memory_offset"`
+ DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
+ DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
+ DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
+ DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
+ DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
+ NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
+ NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
+ NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
+ NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
+ VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
+ VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
+ DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
+ MemorySize uint32 `toml:"default_memory"`
+ MemSlots uint32 `toml:"memory_slots"`
+ DefaultBridges uint32 `toml:"default_bridges"`
+ Msize9p uint32 `toml:"msize_9p"`
+ GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
+ SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
+ SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
+ RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
+ NumVCPUs int32 `toml:"default_vcpus"`
+ BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
+ BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
+ BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
+ EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
+ VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
+ DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
+ MemPrealloc bool `toml:"enable_mem_prealloc"`
+ HugePages bool `toml:"enable_hugepages"`
+ VirtioMem bool `toml:"enable_virtio_mem"`
+ IOMMU bool `toml:"enable_iommu"`
+ IOMMUPlatform bool `toml:"enable_iommu_platform"`
+ Debug bool `toml:"enable_debug"`
+ DisableNestingChecks bool `toml:"disable_nesting_checks"`
+ EnableIOThreads bool `toml:"enable_iothreads"`
+ DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
+ HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
+ HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
+ ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
+ DisableVhostNet bool `toml:"disable_vhost_net"`
+ GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
+ ConfidentialGuest bool `toml:"confidential_guest"`
+ SevSnpGuest bool `toml:"sev_snp_guest"`
+ GuestSwap bool `toml:"enable_guest_swap"`
+ Rootless bool `toml:"rootless"`
+ DisableSeccomp bool `toml:"disable_seccomp"`
+ DisableSeLinux bool `toml:"disable_selinux"`
+ DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
+ LegacySerial bool `toml:"use_legacy_serial"`
+ GuestPreAttestation bool `toml:"guest_pre_attestation"`
}
type runtime struct {
@@ -298,12 +297,18 @@ func (h hypervisor) firmware() (string, error) {
return ResolvePath(p)
}
-func (h hypervisor) coldPlugVFIO() hv.PCIePort {
+func (h hypervisor) coldPlugVFIO() config.PCIePort {
if h.ColdPlugVFIO == "" {
return defaultColdPlugVFIO
}
return h.ColdPlugVFIO
}
+func (h hypervisor) hotPlugVFIO() config.PCIePort {
+ if h.HotPlugVFIO == "" {
+ return defaultHotPlugVFIO
+ }
+ return h.HotPlugVFIO
+}
func (h hypervisor) firmwareVolume() (string, error) {
p := h.FirmwareVolume
@@ -523,7 +528,7 @@ func (h hypervisor) blockDeviceAIO() (string, error) {
}
func (h hypervisor) sharedFS() (string, error) {
- supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus}
+ supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS}
if h.SharedFS == "" {
return config.VirtioFS, nil
@@ -838,6 +843,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
KernelPath: kernel,
InitrdPath: initrd,
ImagePath: image,
+ RootfsType: rootfsType,
FirmwarePath: firmware,
FirmwareVolumePath: firmwareVolume,
PFlash: pflashes,
@@ -880,8 +886,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
+ HotPlugVFIO: h.hotPlugVFIO(),
ColdPlugVFIO: h.coldPlugVFIO(),
- PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: h.DisableVhostNet,
EnableVhostUserStore: h.EnableVhostUserStore,
VhostUserStorePath: h.vhostUserStorePath(),
@@ -907,7 +913,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
SNPGuestPolicy: h.getSnpGuestPolicy(),
SEVCertChainPath: h.SEVCertChainPath,
DisableGuestSeLinux: h.DisableGuestSeLinux,
- RootfsType: rootfsType,
}, nil
}
@@ -1034,11 +1039,12 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
return vc.HypervisorConfig{}, err
}
- if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus {
- return vc.HypervisorConfig{}, errors.New("clh only support virtio-fs or virtio-fs-nydus")
+ if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS {
+ return vc.HypervisorConfig{},
+ fmt.Errorf("Cloud Hypervisor does not support %s shared filesystem option", sharedFS)
}
- if h.VirtioFSDaemon == "" {
+ if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" {
return vc.HypervisorConfig{},
fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
}
@@ -1084,7 +1090,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
- PCIeRootPort: h.PCIeRootPort,
+ HotPlugVFIO: h.hotPlugVFIO(),
DisableVhostNet: true,
GuestHookPath: h.guestHookPath(),
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
@@ -1302,6 +1308,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
KernelPath: defaultKernelPath,
ImagePath: defaultImagePath,
InitrdPath: defaultInitrdPath,
+ RootfsType: defaultRootfsType,
FirmwarePath: defaultFirmwarePath,
FirmwareVolumePath: defaultFirmwareVolumePath,
MachineAccelerators: defaultMachineAccelerators,
@@ -1330,9 +1337,10 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
- PCIeRootPort: defaultPCIeRootPort,
+ HotPlugVFIO: defaultHotPlugVFIO,
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,
+ VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
VirtioFSCache: defaultVirtioFSCacheMode,
DisableImageNvdimm: defaultDisableImageNvdimm,
RxRateLimiterMaxRate: defaultRxRateLimiterMaxRate,
@@ -1352,8 +1360,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
SEVGuestPolicy: defaultSEVGuestPolicy,
SNPGuestPolicy: defaultSNPGuestPolicy,
SEVCertChainPath: defaultSEVCertChainPath,
- VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
- RootfsType: defaultRootfsType,
}
}
@@ -1711,9 +1717,10 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
+ hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
- if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
+ if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
return err
}
@@ -1723,18 +1730,32 @@ func checkConfig(config oci.RuntimeConfig) error {
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for cold-plug:
// no-port, root-port, switch-port
-func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
+func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
// Currently only QEMU q35 supports advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" {
return nil
}
- if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
+
+ if coldPlug != config.NoPort && hotPlug != config.NoPort {
+ return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug)
+ }
+ if coldPlug == config.NoPort && hotPlug == config.NoPort {
+ return nil
+ }
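+ // At this point exactly one of cold-plug and hot-plug is set;
+ // validate whichever one it is.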
+ var port config.PCIePort
+ if coldPlug != config.NoPort {
+ port = coldPlug
+ }
+ if hotPlug != config.NoPort {
+ port = hotPlug
+ }
+ if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
return nil
}
- return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
- vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
+ return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
+ port, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
}
// checkNetNsConfig performs sanity checks on disable_new_netns config.
diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go
index 97f134579..b786ce25f 100644
--- a/src/runtime/pkg/katautils/config_test.go
+++ b/src/runtime/pkg/katautils/config_test.go
@@ -18,8 +18,8 @@ import (
"syscall"
"testing"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error {
// createAllRuntimeConfigFiles creates all files necessary to call
// loadConfiguration().
-func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) {
+func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) {
if dir == "" {
- return config, fmt.Errorf("BUG: need directory")
+ return testConfig, fmt.Errorf("BUG: need directory")
}
if hypervisor == "" {
- return config, fmt.Errorf("BUG: need hypervisor")
+ return testConfig, fmt.Errorf("BUG: need hypervisor")
}
- var coldPlugVFIO hv.PCIePort
+ var hotPlugVFIO config.PCIePort
+ var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
kernelParams := "foo=bar xyz"
@@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
blockDeviceAIO := "io_uring"
enableIOThreads := true
hotplugVFIOOnRootBus := true
- pcieRootPort := uint32(2)
- coldPlugVFIO = hv.RootPort
+ hotPlugVFIO = config.NoPort
+ coldPlugVFIO = config.BridgePort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
@@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
BlockDeviceAIO: blockDeviceAIO,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
- PCIeRootPort: pcieRootPort,
+ HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: defaultVCPUCount,
@@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
configPath := path.Join(dir, "runtime.toml")
err = createConfig(configPath, runtimeConfigFileData)
if err != nil {
- return config, err
+ return testConfig, err
}
configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml")
@@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create a link to the config file
err = syscall.Symlink(configPath, configPathLink)
if err != nil {
- return config, err
+ return testConfig, err
}
files := []string{hypervisorPath, kernelPath, imagePath}
@@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create the resource (which must be >0 bytes)
err := WriteFile(file, "foo", testFileMode)
if err != nil {
- return config, err
+ return testConfig, err
}
}
@@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
DefaultBridges: defaultBridgesCount,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
- PCIeRootPort: pcieRootPort,
+ HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
Msize9p: defaultMsize9p,
MemSlots: defaultMemSlots,
@@ -217,10 +218,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
err = SetKernelParams(&runtimeConfig)
if err != nil {
- return config, err
+ return testConfig, err
}
- config = testRuntimeConfig{
+ testConfig = testRuntimeConfig{
RuntimeConfig: runtimeConfig,
RuntimeConfigFile: configPath,
ConfigPath: configPath,
@@ -229,7 +230,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
LogPath: logPath,
}
- return config, nil
+ return testConfig, nil
}
// testLoadConfiguration accepts an optional function that can be used
@@ -570,6 +571,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
SNPGuestPolicy: defaultSNPGuestPolicy,
+ HotPlugVFIO: defaultHotPlugVFIO,
ColdPlugVFIO: defaultColdPlugVFIO,
}
@@ -604,7 +606,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
func TestNewQemuHypervisorConfig(t *testing.T) {
dir := t.TempDir()
- var coldPlugVFIO hv.PCIePort
+ var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
imagePath := path.Join(dir, "image")
@@ -612,8 +614,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
- pcieRootPort := uint32(2)
- coldPlugVFIO = hv.RootPort
+ coldPlugVFIO = config.BridgePort
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
blockDeviceAIO := "io_uring"
defer func() {
@@ -632,7 +633,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
- PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
TxRateLimiterMaxRate: txRateLimiterMaxRate,
@@ -688,10 +688,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
}
- if config.PCIeRootPort != pcieRootPort {
- t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
- }
-
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
}
@@ -814,7 +810,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
- pcieRootPort := uint32(2)
hypervisor := hypervisor{
Path: hypervisorPath,
@@ -825,7 +820,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
- PCIeRootPort: pcieRootPort,
}
_, err := newQemuHypervisorConfig(hypervisor)
diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
index b443a3f88..7881227a1 100644
--- a/src/runtime/pkg/oci/utils.go
+++ b/src/runtime/pkg/oci/utils.go
@@ -460,6 +460,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
+ if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil {
+ return err
+ }
+
if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok {
if value != "" {
config.HypervisorConfig.HypervisorMachineType = value
@@ -515,12 +519,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
- if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) {
- config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
- }); err != nil {
- return err
- }
-
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) {
return fmt.Errorf("entropy source %v required from annotation is not valid", value)
@@ -583,6 +581,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
return nil
}
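+
+// addHypervisorPCIePortOverride parses a PCIe port annotation value,
+// returning NoPort when the annotation is empty and an error when the
+// value is not a known port type.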
+func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) {
+ if value == "" {
+ return config.NoPort, nil
+ }
+ port := config.PCIePort(value)
+ if port.Invalid() {
+ return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value)
+ }
+ return port, nil
+}
+
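+// addHypervisorHotColdPlugVfioOverrides applies the hot-plug and
+// cold-plug VFIO annotations; setting one of them disables the other,
+// since only a single plug mode can be active at a time.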
+func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
+ var err error
+ if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok {
+ if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
+ return err
+ }
+ // If hot-plug is specified, disable cold-plug and vice versa
+ sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort
+ }
+ if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok {
+ if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
+ return err
+ }
+ // If cold-plug is specified, disable hot-plug and vice versa
+ sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort
+ }
+ return nil
+}
+
func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error {
diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go
index ebc7f019c..b27ba7f8e 100644
--- a/src/runtime/pkg/oci/utils_test.go
+++ b/src/runtime/pkg/oci/utils_test.go
@@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) {
func TestAddHypervisorAnnotations(t *testing.T) {
assert := assert.New(t)
- config := vc.SandboxConfig{
+ sbConfig := vc.SandboxConfig{
Annotations: make(map[string]string),
}
@@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"}
ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on"
- addHypervisorConfigOverrides(ocispec, &config, runtimeConfig)
- assert.Exactly(expectedHyperConfig, config.HypervisorConfig)
+ addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig)
+ assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
@@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
- ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
+ ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
+ ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi"
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true"
@@ -668,55 +669,58 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
- addAnnotations(ocispec, &config, runtimeConfig)
- assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1))
- assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
- assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024))
- assert.Equal(config.HypervisorConfig.MemSlots, uint32(20))
- assert.Equal(config.HypervisorConfig.MemOffset, uint64(512))
- assert.Equal(config.HypervisorConfig.VirtioMem, true)
- assert.Equal(config.HypervisorConfig.MemPrealloc, true)
- assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
- assert.Equal(config.HypervisorConfig.HugePages, true)
- assert.Equal(config.HypervisorConfig.IOMMU, true)
- assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
- assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring")
- assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true)
- assert.Equal(config.HypervisorConfig.EnableIOThreads, true)
- assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true)
- assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true)
- assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true)
- assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs")
- assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false")
- assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto")
- assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
- assert.Equal(config.HypervisorConfig.Msize9p, uint32(512))
- assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35")
- assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw")
- assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off")
- assert.Equal(config.HypervisorConfig.DisableVhostNet, true)
- assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/")
- assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
- assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
- assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
- assert.Equal(config.HypervisorConfig.IOMMUPlatform, true)
- assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864))
- assert.Equal(config.HypervisorConfig.LegacySerial, true)
- assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
- assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
+ err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
+ assert.NoError(err)
+
+ assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1))
+ assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
+ assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024))
+ assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20))
+ assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512))
+ assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true)
+ assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true)
+ assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
+ assert.Equal(sbConfig.HypervisorConfig.HugePages, true)
+ assert.Equal(sbConfig.HypervisorConfig.IOMMU, true)
+ assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
+ assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring")
+ assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true)
+ assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true)
+ assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true)
+ assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true)
+ assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true)
+ assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs")
+ assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false")
+ assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto")
+ assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
+ assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512))
+ assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35")
+ assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw")
+ assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off")
+ assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
+ assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
+ assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
+ assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
+ assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
+ assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
+ assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)
+ assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864))
+ assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
+ assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
+ assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
// In case an absurd large value is provided, the config value if not over-ridden
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
- err := addAnnotations(ocispec, &config, runtimeConfig)
+ err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1"
- err = addAnnotations(ocispec, &config, runtimeConfig)
+ err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1"
- err = addAnnotations(ocispec, &config, runtimeConfig)
+ err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go
index ad54a0477..837cbaf54 100644
--- a/src/runtime/virtcontainers/acrn.go
+++ b/src/runtime/virtcontainers/acrn.go
@@ -90,7 +90,7 @@ func (a *Acrn) Capabilities(ctx context.Context) types.Capabilities {
span, _ := katatrace.Trace(ctx, a.Logger(), "Capabilities", acrnTracingTags, map[string]string{"sandbox_id": a.id})
defer span.End()
- return a.arch.capabilities()
+ return a.arch.capabilities(a.config)
}
func (a *Acrn) HypervisorConfig() HypervisorConfig {
diff --git a/src/runtime/virtcontainers/acrn_arch_base.go b/src/runtime/virtcontainers/acrn_arch_base.go
index 77fb8e9e5..0b9ee53cc 100644
--- a/src/runtime/virtcontainers/acrn_arch_base.go
+++ b/src/runtime/virtcontainers/acrn_arch_base.go
@@ -33,7 +33,7 @@ type acrnArch interface {
kernelParameters(debug bool) []Param
//capabilities returns the capabilities supported by acrn
- capabilities() types.Capabilities
+ capabilities(config HypervisorConfig) types.Capabilities
// memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb
memoryTopology(memMb uint64) Memory
@@ -361,7 +361,7 @@ func (a *acrnArchBase) memoryTopology(memoryMb uint64) Memory {
return memory
}
-func (a *acrnArchBase) capabilities() types.Capabilities {
+func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities {
var caps types.Capabilities
caps.SetBlockDeviceSupport()
diff --git a/src/runtime/virtcontainers/acrn_arch_base_test.go b/src/runtime/virtcontainers/acrn_arch_base_test.go
index 61db47416..c34974e69 100644
--- a/src/runtime/virtcontainers/acrn_arch_base_test.go
+++ b/src/runtime/virtcontainers/acrn_arch_base_test.go
@@ -83,8 +83,9 @@ func TestAcrnArchBaseKernelParameters(t *testing.T) {
func TestAcrnArchBaseCapabilities(t *testing.T) {
assert := assert.New(t)
acrnArchBase := newAcrnArchBase()
+ config := HypervisorConfig{}
- c := acrnArchBase.capabilities()
+ c := acrnArchBase.capabilities(config)
assert.True(c.IsBlockDeviceSupported())
assert.True(c.IsBlockDeviceHotplugSupported())
assert.False(c.IsFsSharingSupported())
diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go
index cfd924099..d658e1f77 100644
--- a/src/runtime/virtcontainers/clh.go
+++ b/src/runtime/virtcontainers/clh.go
@@ -349,6 +349,10 @@ func (clh *cloudHypervisor) createVirtiofsDaemon(sharedPath string) (VirtiofsDae
}
func (clh *cloudHypervisor) setupVirtiofsDaemon(ctx context.Context) error {
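+ // When no shared FS is configured there is no virtiofs daemon to set up.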
+ if clh.config.SharedFS == config.NoSharedFS {
+ return nil
+ }
+
if clh.config.SharedFS == config.Virtio9P {
return errors.New("cloud-hypervisor only supports virtio based file sharing")
}
@@ -860,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
defer cancel()
// Create the clh device config via the constructor to ensure default values are properly assigned
- clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
+ clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
if err != nil {
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
}
- clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
+ clh.devicesIds[device.ID] = pciInfo.GetId()
// clh doesn't use bridges, so the PCI path is simply the slot
// number of the device. This will break if clh starts using
@@ -882,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}
- guestPciPath, err := types.PciPathFromString(tokens[0])
-
- pciDevice, ok := (*device).(config.VFIOPCIDev)
- if !ok {
+ if device.Type == config.VFIOAPDeviceMediatedType {
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
}
- pciDevice.GuestPciPath = guestPciPath
- *device = pciDevice
+
+ device.GuestPciPath, err = types.PciPathFromString(tokens[0])
return err
}
@@ -933,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
case BlockDev:
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
case VfioDev:
- deviceID = *devInfo.(config.VFIODev).GetID()
+ deviceID = devInfo.(*config.VFIODev).ID
default:
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")
@@ -1210,7 +1211,9 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities
clh.Logger().WithField("function", "Capabilities").Info("get Capabilities")
var caps types.Capabilities
- caps.SetFsSharingSupport()
+ if clh.config.SharedFS != config.NoSharedFS {
+ caps.SetFsSharingSupport()
+ }
caps.SetBlockDeviceHotplugSupport()
return caps
}
diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go
index b5c800e95..d617ab4e1 100644
--- a/src/runtime/virtcontainers/clh_test.go
+++ b/src/runtime/virtcontainers/clh_test.go
@@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
assert.NoError(err, "Hotplug remove block device expected no error")
- _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
+ _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
assert.NoError(err, "Hotplug remove vfio block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)
@@ -726,3 +726,30 @@ func TestClhSetConfig(t *testing.T) {
assert.Equal(clh.config, config)
}
+
+func TestClhCapabilities(t *testing.T) {
+ assert := assert.New(t)
+
+ hConfig, err := newClhConfig()
+ assert.NoError(err)
+
+ clh := &cloudHypervisor{}
+ assert.Equal(clh.config, HypervisorConfig{})
+
+ hConfig.SharedFS = config.VirtioFS
+
+ err = clh.setConfig(&hConfig)
+ assert.NoError(err)
+
+ var ctx context.Context
+ c := clh.Capabilities(ctx)
+ assert.True(c.IsFsSharingSupported())
+
+ hConfig.SharedFS = config.NoSharedFS
+
+ err = clh.setConfig(&hConfig)
+ assert.NoError(err)
+
+ c = clh.Capabilities(ctx)
+ assert.False(c.IsFsSharingSupported())
+}
diff --git a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md
index d3071a86f..ca5cb4a1a 100644
--- a/src/runtime/virtcontainers/documentation/api/1.0/api.md
+++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md
@@ -288,12 +288,12 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
- // PCIeRootPort is used to indicate the number of PCIe Root Port devices
- // The PCIe Root Port device is used to hot-plug the PCIe device
- PCIeRootPort uint32
+ // HotPlugVFIO is used to indicate if devices need to be hotplugged on the
+ // root port, switch, bridge or no port
+ HotPlugVFIO hv.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
- // root port, switch or no port
+ // root port, switch, bridge or no port
ColdPlugVFIO hv.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
index fe0b96fac..f2c86c6a6 100644
--- a/src/runtime/virtcontainers/hypervisor.go
+++ b/src/runtime/virtcontainers/hypervisor.go
@@ -389,7 +389,6 @@ type HypervisorConfig struct {
Gid uint32
SEVGuestPolicy uint32
SNPGuestPolicy uint64
- PCIeRootPort uint32
NumVCPUs uint32
RemoteHypervisorTimeout uint32
IOMMUPlatform bool
@@ -420,7 +419,10 @@ type HypervisorConfig struct {
DisableSeLinux bool
DisableGuestSeLinux bool
LegacySerial bool
- ColdPlugVFIO hv.PCIePort
+ HotPlugVFIO config.PCIePort
+ ColdPlugVFIO config.PCIePort
+ VFIODevices []config.DeviceInfo
+ VhostUserBlkDevices []config.DeviceInfo
}
// vcpu mapping from vcpu number to thread number
diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go
index b54ad9fe8..3afb2fceb 100644
--- a/src/runtime/virtcontainers/kata_agent.go
+++ b/src/runtime/virtcontainers/kata_agent.go
@@ -21,6 +21,7 @@ import (
"github.com/docker/go-units"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
@@ -1148,7 +1149,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
ContainerPath: dev.ContainerPath,
Type: kataVfioPciDevType,
Id: groupNum,
- Options: nil,
+ Options: make([]string, len(devList)),
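+ // sized to one entry per device in the IOMMU group, populated below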
}
// We always pass the device information to the agent, since
@@ -1158,16 +1159,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
kataDevice.Type = kataVfioPciGuestKernelDevType
}
+ for i, dev := range devList {
+ if dev.Type == config.VFIOAPDeviceMediatedType {
+ kataDevice.Type = kataVfioApDevType
+ kataDevice.Options = dev.APDevices
+ } else {
- if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
- kataDevice.Type = kataVfioApDevType
- kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
- } else {
- kataDevice.Options = make([]string, len(devList))
- for i, device := range devList {
- pciDevice := (*device).(config.VFIOPCIDev)
- kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
+ devBDF := drivers.GetBDF(dev.BDF)
+ kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath)
}
}
return kataDevice
@@ -1354,7 +1355,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
if _, err = k.sendReq(ctx, req); err != nil {
return nil, err
}
-
return buildProcessFromExecID(req.ExecId)
}
diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go
index cbba44e60..91ab51ebf 100644
--- a/src/runtime/virtcontainers/persist.go
+++ b/src/runtime/virtcontainers/persist.go
@@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
- PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
+ HotPlugVFIO: hconf.HotPlugVFIO,
ColdPlugVFIO: hconf.ColdPlugVFIO,
- PCIeRootPort: hconf.PCIeRootPort,
BootToBeTemplate: hconf.BootToBeTemplate,
BootFromTemplate: hconf.BootFromTemplate,
DisableVhostNet: hconf.DisableVhostNet,
diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go
index e4facc6b9..6ca5ee690 100644
--- a/src/runtime/virtcontainers/persist/api/config.go
+++ b/src/runtime/virtcontainers/persist/api/config.go
@@ -7,7 +7,7 @@
package persistapi
import (
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runc/libcontainer/configs"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
@@ -131,10 +131,6 @@ type HypervisorConfig struct {
// Enable SGX. Hardware-based isolation and memory encryption.
SGXEPCSize int64
- // PCIeRootPort is used to indicate the number of PCIe Root Port devices
- // The PCIe Root Port device is used to hot-plug the PCIe device
- PCIeRootPort uint32
-
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32
@@ -199,9 +195,13 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
+ // HotPlugVFIO is used to indicate if devices need to be hotplugged on the
+ // root, switch, bridge or no-port
+ HotPlugVFIO config.PCIePort
+
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
- // root port or a switch or no-port
- ColdPlugVFIO hv.PCIePort
+ // root, bridge, switch or no-port
+ ColdPlugVFIO config.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool
diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
index a8bd4c1b2..b9a8dfc10 100644
--- a/src/runtime/virtcontainers/pkg/annotations/annotations.go
+++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -143,9 +143,11 @@ const (
// root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
- // PCIeRootPort is used to indicate the number of PCIe Root Port devices
- // The PCIe Root Port device is used to hot-plug the PCIe device
- PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
+ // ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
+ ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"
+
+ // HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged.
+ HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio"
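+
+ // Both annotations take the string form of a config.PCIePort value,
+ // i.e. config.NoPort, config.BridgePort, config.RootPort or
+ // config.SwitchPort.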
// EntropySource is a sandbox annotation to specify the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device)
diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go
index 6ded5cbe0..43cb78f21 100644
--- a/src/runtime/virtcontainers/qemu.go
+++ b/src/runtime/virtcontainers/qemu.go
@@ -66,6 +66,11 @@ const romFile = ""
// Default value is false.
const defaultDisableModern = false
+// A PCIe topology deeper than 5 levels is already inadvisable; just to
+// have enough buffer we limit ourselves to 10 and exit if we reach
+// the root bus.
+const maxPCIeTopoDepth = 10
+
type qmpChannel struct {
qmp *govmmQemu.QMP
ctx context.Context
@@ -76,15 +81,15 @@ type qmpChannel struct {
// QemuState keeps Qemu's state
type QemuState struct {
- UUID string
- Bridges []types.Bridge
- // HotpluggedCPUs is the list of CPUs that were hot-added
+ UUID string
+ HotPlugVFIO config.PCIePort
+ Bridges []types.Bridge
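+ // HotpluggedVCPUs is the list of vCPUs that were hot-added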
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
- PCIeRootPort int
HotplugVFIOOnRootBus bool
- ColdPlugVFIO hv.PCIePort
+ ColdPlugVFIO config.PCIePort
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -207,7 +212,7 @@ func (q *qemu) Capabilities(ctx context.Context) types.Capabilities {
span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
- return q.arch.capabilities()
+ return q.arch.capabilities(q.config)
}
func (q *qemu) HypervisorConfig() HypervisorConfig {
@@ -278,10 +283,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.Logger().Debug("Creating UUID")
q.state.UUID = uuid.Generate().String()
-
+ q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
- q.state.PCIeRootPort = int(q.config.PCIeRootPort)
// The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet.
@@ -727,27 +732,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
}
}
- // Add PCIe Root Port devices to hypervisor
- // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
- // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
- memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
-
- if hypervisorConfig.PCIeRootPort > 0 {
- qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
- }
-
- // The default OVMF MMIO aperture is too small for some PCIe devices
- // with huge BARs so we need to increase it.
- // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
- if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
- pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
- fwCfg := govmmQemu.FwCfg{
- Name: "opt/ovmf/X-PciMmio64Mb",
- Str: pciMmio64Mb,
+ if machine.Type == QemuQ35 || machine.Type == QemuVirt {
+ if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
+ q.Logger().WithError(err).Error("Cannot create PCIe topology")
+ return err
}
- qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
-
q.qemuConfig = qemuConfig
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
@@ -773,6 +763,101 @@ func (q *qemu) checkBpfEnabled() {
}
}
+// If a user passes 8 GPUs with 4 devices in each IOMMU group, we need to
+// hotplug 32 devices. We do not have enough PCIe root bus slots to
+// accomplish this task, and Kata already uses some slots for vfio-xxxx-pci
+// devices.
+// Max PCI slots per root bus is 32.
+// Max PCIe root ports is 16.
+// Max PCIe switch ports is 16.
+// There is only 64kB of IO memory and each root/switch port consumes 4kB,
+// hence only 16 ports are possible.
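+// For example, 4 GPUs with 4 devices per IOMMU group already need 16
+// hot-pluggable ports, the maximum number of root ports enforced below.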
+func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
+
+ // If no-port is set, just return; there is no need to add a PCIe Root Port or PCIe Switches
+ if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
+ return nil
+ }
+
+ // Add PCIe Root Port or PCIe Switches to the hypervisor
+ // The pcie.0 bus does not support hot-plug, but PCIe devices can be hot-plugged
+ // into a PCIe Root Port or PCIe Switch.
+ // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
+
+ // Deduce the right values for mem-reserve and pref-64-reserve memory regions
+ memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
+
+ // The default OVMF MMIO aperture is too small for some PCIe devices
+ // with huge BARs so we need to increase it.
+ // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
+ if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
+ pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
+ fwCfg := govmmQemu.FwCfg{
+ Name: "opt/ovmf/X-PciMmio64Mb",
+ Str: pciMmio64Mb,
+ }
+ qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
+ }
+
+ // Get the number of hot(cold)-pluggable ports needed from the provided
+ // VFIO devices and VhostUserBlockDevices
+ var numOfPluggablePorts uint32 = 0
+ for _, dev := range hypervisorConfig.VFIODevices {
+ var err error
+ dev.HostPath, err = config.GetHostPath(dev, false, "")
+ if err != nil {
+ return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
+ }
+ devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
+ if err != nil {
+ return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
+ }
+ for _, vfioDevice := range devicesPerIOMMUGroup {
+ if drivers.IsPCIeDevice(vfioDevice.BDF) {
+ numOfPluggablePorts = numOfPluggablePorts + 1
+ }
+ }
+ }
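+ // Only PCIe devices consume a root or switch port; conventional PCI
+ // devices are attached to a PCI bridge instead.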
+ vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
+ vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
+
+ numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
+
+ // If the number of PCIe root ports exceeds 16, bail out: otherwise we
+ // may use up all slots or IO memory on the root bus, and the
+ // vfio-XXX-pci devices that are crucial for Kata could no longer be
+ // added. Max slots on the root bus is 32; max slots on the complete
+ // PCI(e) topology is 256 in QEMU.
+ if vfioOnRootPort {
+ // On Arm the vhost-user-block device is a PCIe device; we need
+ // to account for it in the number of pluggable ports.
+ if machineType == QemuVirt {
+ numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
+ }
+ if numOfPluggablePorts > maxPCIeRootPort {
+ return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
+ }
+ qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
+ return nil
+ }
+ if vfioOnSwitchPort {
+ // On Arm the vhost-user-block device is a PCIe device; we need
+ // to account for it in the number of pluggable ports.
+ if machineType == QemuVirt {
+ numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
+ if numOfPluggableRootPorts > maxPCIeRootPort {
+ return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
+ }
+ qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
+ }
+ if numOfPluggablePorts > maxPCIeSwitchPort {
+ return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
+ }
+ qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
+ return nil
+ }
+ return nil
+}
+
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
}
@@ -1612,6 +1697,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
}
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
if err != nil {
return err
@@ -1629,18 +1715,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
switch machineType {
case QemuVirt:
- if q.state.PCIeRootPort <= 0 {
- return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
- }
-
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
//Since the dev is the first and only one on this bus(root port), it should be 0.
addr := "00"
- bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs))
- drivers.AllPCIeDevs[devID] = true
+ bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
+ config.PCIeDevices[config.RootPort][devID] = true
- bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId)
+ bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
if err != nil {
return err
@@ -1656,7 +1738,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
return err
}
- if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil {
+ if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
return err
}
@@ -1770,41 +1852,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
- // XXX: For now we assume there's exactly one bridge, since
- // that's always how we configure qemu from Kata for now. It
- // would be good to generalize this to different PCI
- // topologies
+
+ var slots []types.PciSlot
+
devSlot, err := q.qomGetSlot(qemuID)
if err != nil {
return types.PciPath{}, err
}
+ slots = append(slots, devSlot)
- busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus")
+ // This only works for Q35 and Virt
+ r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
+
+ var parentPath = qemuID
+ // We do not want to use an infinite loop here: a PCIe topology deeper
+ // than 5 levels is already inadvisable, so just to have enough buffer
+ // we limit ourselves to 10 and leave the loop early if we hit the
+ // root bus.
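+ // Each iteration prepends the slot of the parent bridge, so a device
+ // behind a switch yields root port/upstream/downstream/device slots.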
+ for i := 1; i <= maxPCIeTopoDepth; i++ {
+ parentBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
+ if err != nil {
+ return types.PciPath{}, err
+ }
+
+ busQOM, ok := parentBusQOM.(string)
+ if !ok {
+ return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
+ }
+
+ // If we hit /machine/.../pcie.0 we are done: this is the root bus
+ // and we have climbed the complete hierarchy
+ if r.Match([]byte(busQOM)) {
+ break
+ }
+
+ // busQOM is the QOM path of the bus object, but we need the PCI
+ // device which manages that bus. There doesn't seem to be a way
+ // to get that other than to simply drop the last path component.
+ idx := strings.LastIndex(busQOM, "/")
+ if idx == -1 {
+ return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
+ }
+ parentBus := busQOM[:idx]
+
+ parentSlot, err := q.qomGetSlot(parentBus)
+ if err != nil {
+ return types.PciPath{}, err
+ }
+
+ // Prepend the slots, since we're climbing the hierarchy
+ slots = append([]types.PciSlot{parentSlot}, slots...)
+ parentPath = parentBus
+ }
+ return types.PciPathFromSlots(slots...)
+}
+
+func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
+ return q.executeVFIODeviceAdd(device)
+}
+
+func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
+ return q.executeVFIODeviceAdd(device)
+}
+
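+// Root-port and switch-port hotplug use the bus already set on the device,
+// while bridge-port hotplug first reserves an address on one of the PCI
+// bridges.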
+func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
+ addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
if err != nil {
- return types.PciPath{}, err
+ return err
}
- bus, ok := busq.(string)
- if !ok {
- return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq)
- }
+ defer func() {
+ if err != nil {
+ q.arch.removeDeviceFromBridge(device.ID)
+ }
+ }()
+ return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
+}
- // `bus` is the QOM path of the QOM bus object, but we need
- // the PCI bridge which manages that bus. There doesn't seem
- // to be a way to get that other than to simply drop the last
- // path component.
- idx := strings.LastIndex(bus, "/")
- if idx == -1 {
- return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus)
+func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
+ switch device.Type {
+ case config.VFIOPCIDeviceNormalType:
+ return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile)
+ case config.VFIOPCIDeviceMediatedType:
+ return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
+ case config.VFIOAPDeviceMediatedType:
+ return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
+ default:
+ return fmt.Errorf("Incorrect VFIO device type found")
}
- bridge := bus[:idx]
+}
- bridgeSlot, err := q.qomGetSlot(bridge)
- if err != nil {
- return types.PciPath{}, err
+func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
+ switch device.Type {
+ case config.VFIOPCIDeviceNormalType:
+ return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile)
+ case config.VFIOPCIDeviceMediatedType:
+ return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
+ case config.VFIOAPDeviceMediatedType:
+ return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
+ default:
+ return fmt.Errorf("Incorrect VFIO device type found")
}
-
- return types.PciPathFromSlots(bridgeSlot, devSlot)
}
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
@@ -1812,109 +1961,53 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
return err
}
- devID := *(*device).GetID()
- machineType := q.HypervisorConfig().HypervisorMachineType
-
if op == AddDevice {
-
buf, _ := json.Marshal(device)
q.Logger().WithFields(logrus.Fields{
- "machine-type": machineType,
- "hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
- "pcie-root-port": q.state.PCIeRootPort,
- "device-info": string(buf),
+ "machine-type": q.HypervisorConfig().HypervisorMachineType,
+ "hot-plug-vfio": q.state.HotPlugVFIO,
+ "device-info": string(buf),
}).Info("Start hot-plug VFIO device")
-
- // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
- // for pc machine type instead of bridge. This is useful for devices that require
- // a large PCI BAR which is a currently a limitation with PCI bridges.
- if q.state.HotplugVFIOOnRootBus {
- switch (*device).GetType() {
- case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
- // In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
- pciDevice, ok := (*device).(config.VFIOPCIDev)
- if !ok {
- return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
- }
- switch machineType {
- case QemuQ35:
- if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
- q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
- pciDevice.Bus = ""
- }
- default:
- pciDevice.Bus = ""
- }
- *device = pciDevice
-
- if pciDevice.Type == config.VFIOPCIDeviceNormalType {
- err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
- } else {
- err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
- }
- case config.VFIOAPDeviceMediatedType:
- err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
- }
+ // A PCIe device can only be hotplugged on a PCIe Root Port or a PCIe
+ // Switch Port on Q35 and Virt; for any other machine type clear the bus.
+ if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
+ device.Bus = ""
} else {
- addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
+ var err error
+ // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
+ // for pc machine type instead of bridge. This is useful for devices that require
+ // a large PCI BAR, which is currently a limitation with PCI bridges.
+ if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
+ err = q.hotplugVFIODeviceRootPort(ctx, device)
+ } else if q.state.HotPlugVFIO == config.SwitchPort {
+ err = q.hotplugVFIODeviceSwitchPort(ctx, device)
+ } else {
+ err = q.hotplugVFIODeviceBridgePort(ctx, device)
+ }
if err != nil {
return err
}
-
- defer func() {
- if err != nil {
- q.arch.removeDeviceFromBridge(devID)
- }
- }()
-
- switch (*device).GetType() {
- case config.VFIOPCIDeviceNormalType:
- pciDevice, ok := (*device).(config.VFIOPCIDev)
- if !ok {
- return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
- }
- err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
- case config.VFIOPCIDeviceMediatedType:
- err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
- case config.VFIOAPDeviceMediatedType:
- err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
- default:
- return fmt.Errorf("Incorrect VFIO device type found")
- }
- }
- if err != nil {
- return err
- }
-
- switch (*device).GetType() {
- case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
- pciDevice, ok := (*device).(config.VFIOPCIDev)
- if !ok {
- return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
- }
- // XXX: Depending on whether we're doing root port or
- // bridge hotplug, and how the bridge is set up in
- // other parts of the code, we may or may not already
- // have information about the slot number of the
- // bridge and or the device. For simplicity, just
- // query both of them back from qemu
- guestPciPath, err := q.qomGetPciPath(devID)
- pciDevice.GuestPciPath = guestPciPath
- *device = pciDevice
- return err
}
+ // XXX: Depending on whether we're doing root port or
+ // bridge hotplug, and how the bridge is set up in
+ // other parts of the code, we may or may not already
+ // have information about the slot number of the
+ // bridge and or the device. For simplicity, just
+ // query both of them back from qemu
+ device.GuestPciPath, err = q.qomGetPciPath(device.ID)
return err
- } else {
- q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
-
- if !q.state.HotplugVFIOOnRootBus {
- if err := q.arch.removeDeviceFromBridge(devID); err != nil {
- return err
- }
- }
-
- return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID)
}
+
+ q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
+
+ if !q.state.HotplugVFIOOnRootBus {
+ if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
+ return err
+ }
+ }
+
+ return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
+
}
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
@@ -2612,7 +2705,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
for i := uint32(0); i < number; i++ {
devices = append(devices,
govmmQemu.PCIeRootPortDevice{
- ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
+ ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
Bus: bus,
Chassis: chassis,
Slot: strconv.FormatUint(uint64(i), 10),
@@ -2626,6 +2719,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
return devices
}
+// golangci-lint enforces multi-line comments to be a single block comment
+// rather than multiple single-line comments ...
+/* pcie.0 bus
+// -------------------------------------------------
+// |
+// -------------
+// | Root Port |
+// -------------
+// -------------------------|------------------------
+// | ----------------- |
+// | PCI Express | Upstream Port | |
+// | Switch ----------------- |
+// | | | |
+// | ------------------- ------------------- |
+// | | Downstream Port | | Downstream Port | |
+// | ------------------- ------------------- |
+// -------------|-----------------------|------------
+// ------------- --------------
+// | GPU/ACCEL | | IB/ETH NIC |
+// ------------- --------------
+*/
+// genericAppendPCIeSwitchPort adds a PCIe Switch with the requested number of ports
+func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
+
+ // Only Q35 and Virt have the correct PCIe support;
+ // ignore all other machine types
+ if machineType != QemuQ35 && machineType != QemuVirt {
+ return devices
+ }
+
+ // Use an ID of its own for this root port so we do not clash with
+ // already existing root ports: the switch port prefix is prepended to
+ // the usual root port prefix
+ pcieRootPort := govmmQemu.PCIeRootPortDevice{
+ ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
+ Bus: defaultBridgeBus,
+ Chassis: "1",
+ Slot: strconv.FormatUint(uint64(0), 10),
+ Multifunction: false,
+ Addr: "0",
+ MemReserve: fmt.Sprintf("%dB", memSize32bit),
+ Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
+ }
+
+ devices = append(devices, pcieRootPort)
+
+ pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
+ ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
+ Bus: pcieRootPort.ID,
+ }
+ devices = append(devices, pcieSwitchUpstreamPort)
+
+ currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
+ if err != nil {
+ return devices
+ }
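+ // QEMU requires a unique chassis/slot pair for every hot-pluggable
+ // port, so the downstream ports get the chassis number after the
+ // root port's.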
+ nextChassis := currentChassis + 1
+
+ for i := uint32(0); i < number; i++ {
+
+ pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
+ ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
+ Bus: pcieSwitchUpstreamPort.ID,
+ Chassis: fmt.Sprintf("%d", nextChassis),
+ Slot: strconv.FormatUint(uint64(i), 10),
+ // TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
+ // TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
+ }
+ devices = append(devices, pcieSwitchDownstreamPort)
+ }
+
+ return devices
+}
+
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
@@ -2801,7 +2967,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
- s.PCIeRootPort = q.state.PCIeRootPort
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, hv.Bridge{
@@ -2825,7 +2990,6 @@ func (q *qemu) Load(s hv.HypervisorState) {
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
- q.state.PCIeRootPort = s.PCIeRootPort
for _, bridge := range s.Bridges {
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))
diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go
index d24953e61..4a6449273 100644
--- a/src/runtime/virtcontainers/qemu_amd64.go
+++ b/src/runtime/virtcontainers/qemu_amd64.go
@@ -26,6 +26,7 @@ import (
"google.golang.org/grpc/credentials/insecure"
"github.com/intel-go/cpuid"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)
@@ -182,7 +183,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
return q, nil
}
-func (q *qemuAmd64) capabilities() types.Capabilities {
+func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
if q.qemuMachine.Type == QemuQ35 ||
@@ -191,7 +192,9 @@ func (q *qemuAmd64) capabilities() types.Capabilities {
}
caps.SetMultiQueueSupport()
- caps.SetFsSharingSupport()
+ if hConfig.SharedFS != config.NoSharedFS {
+ caps.SetFsSharingSupport()
+ }
return caps
}
@@ -323,6 +326,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
ReducedPhysBits: 1,
}), "", nil
case noneProtection:
return devices, firmware, nil
default:
diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go
index 850118f69..17a537956 100644
--- a/src/runtime/virtcontainers/qemu_amd64_test.go
+++ b/src/runtime/virtcontainers/qemu_amd64_test.go
@@ -42,13 +42,14 @@ func TestQemuAmd64BadMachineType(t *testing.T) {
func TestQemuAmd64Capabilities(t *testing.T) {
assert := assert.New(t)
+ config := HypervisorConfig{}
amd64 := newTestQemu(assert, QemuQ35)
- caps := amd64.capabilities()
+ caps := amd64.capabilities(config)
assert.True(caps.IsBlockDeviceHotplugSupported())
amd64 = newTestQemu(assert, QemuMicrovm)
- caps = amd64.capabilities()
+ caps = amd64.capabilities(config)
assert.False(caps.IsBlockDeviceHotplugSupported())
}
diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go
index 3de1aabc9..ead1a4a65 100644
--- a/src/runtime/virtcontainers/qemu_arch_base.go
+++ b/src/runtime/virtcontainers/qemu_arch_base.go
@@ -61,7 +61,7 @@ type qemuArch interface {
kernelParameters(debug bool) []Param
//capabilities returns the capabilities supported by QEMU
- capabilities() types.Capabilities
+ capabilities(config HypervisorConfig) types.Capabilities
// bridges sets the number bridges for the machine type
bridges(number uint32)
@@ -150,6 +150,9 @@ type qemuArch interface {
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
+ // appendPCIeSwitchPortDevice appends a PCIe switch with ports to a pcie-root-port
+ appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
+
// append vIOMMU device
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
@@ -204,7 +207,8 @@ const (
defaultBridgeBus = "pcie.0"
defaultPCBridgeBus = "pci.0"
maxDevIDSize = 31
- pcieRootPortPrefix = "rp"
+ maxPCIeRootPort = 16 // Limitation from QEMU
+ maxPCIeSwitchPort = 16 // Limitation from QEMU
)
// This is the PCI start address assigned to the first bridge that
@@ -313,11 +317,13 @@ func (q *qemuArchBase) kernelParameters(debug bool) []Param {
return params
}
-func (q *qemuArchBase) capabilities() types.Capabilities {
+func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
caps.SetBlockDeviceHotplugSupport()
caps.SetMultiQueueSupport()
- caps.SetFsSharingSupport()
+ if hConfig.SharedFS != config.NoSharedFS {
+ caps.SetFsSharingSupport()
+ }
return caps
}
@@ -708,17 +714,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
}
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
- pciDevice := vfioDev.(config.VFIOPCIDev)
- if pciDevice.BDF == "" {
+
+ if vfioDev.BDF == "" {
return devices
}
devices = append(devices,
govmmQemu.VFIODevice{
- BDF: pciDevice.BDF,
- VendorID: pciDevice.VendorID,
- DeviceID: pciDevice.DeviceID,
- Bus: pciDevice.Bus,
+ BDF: vfioDev.BDF,
+ VendorID: vfioDev.VendorID,
+ DeviceID: vfioDev.DeviceID,
+ Bus: vfioDev.Bus,
},
)
@@ -834,6 +840,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
+// appendPCIeSwitchPortDevice appends a PCIe Switch with ports
+func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
+ return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
+}
+
+// getBARsMaxAddressableMemory returns the BAR sizes we need to know to
+// configure the PCIe Root Port or PCIe Downstream Port when attaching a
+// device with huge BARs.
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
pci := nvpci.New()
diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go
index 51c11bd91..75d7de029 100644
--- a/src/runtime/virtcontainers/qemu_arch_base_test.go
+++ b/src/runtime/virtcontainers/qemu_arch_base_test.go
@@ -117,9 +117,16 @@ func TestQemuArchBaseKernelParameters(t *testing.T) {
func TestQemuArchBaseCapabilities(t *testing.T) {
assert := assert.New(t)
qemuArchBase := newQemuArchBase()
+ hConfig := HypervisorConfig{}
+ hConfig.SharedFS = config.VirtioFS
- c := qemuArchBase.capabilities()
+ c := qemuArchBase.capabilities(hConfig)
assert.True(c.IsBlockDeviceHotplugSupported())
+ assert.True(c.IsFsSharingSupported())
+
+ hConfig.SharedFS = config.NoSharedFS
+ c = qemuArchBase.capabilities(hConfig)
+ assert.False(c.IsFsSharingSupported())
}
func TestQemuArchBaseBridges(t *testing.T) {
@@ -463,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
},
}
- vfDevice := config.VFIOPCIDev{
+ vfDevice := config.VFIODev{
BDF: bdf,
}
@@ -483,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
},
}
- vfDevice := config.VFIOPCIDev{
+ vfDevice := config.VFIODev{
BDF: bdf,
VendorID: vendorID,
DeviceID: deviceID,
diff --git a/src/runtime/virtcontainers/qemu_ppc64le.go b/src/runtime/virtcontainers/qemu_ppc64le.go
index 2d4010fbc..7d71d72ba 100644
--- a/src/runtime/virtcontainers/qemu_ppc64le.go
+++ b/src/runtime/virtcontainers/qemu_ppc64le.go
@@ -11,6 +11,7 @@ import (
"fmt"
"time"
+ "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/sirupsen/logrus"
@@ -97,7 +98,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
return q, nil
}
-func (q *qemuPPC64le) capabilities() types.Capabilities {
+func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
// pseries machine type supports hotplugging drives
@@ -106,7 +107,9 @@ func (q *qemuPPC64le) capabilities() types.Capabilities {
}
caps.SetMultiQueueSupport()
- caps.SetFsSharingSupport()
+ if hConfig.SharedFS != config.NoSharedFS {
+ caps.SetFsSharingSupport()
+ }
return caps
}
diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go
index bfa348145..418075e26 100644
--- a/src/runtime/virtcontainers/qemu_test.go
+++ b/src/runtime/virtcontainers/qemu_test.go
@@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
config6 := newQemuConfig()
config6.DisableGuestSeLinux = false
- config7 := newQemuConfig()
- config7.PCIeRootPort = 1
-
config8 := newQemuConfig()
config8.EnableVhostUserStore = true
config8.HugePages = true
@@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
{config3, false, true},
{config5, false, true},
{config6, false, false},
- {config7, false, true},
{config8, false, true},
{config9, true, false},
{config10, false, true},
diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
index 530713475..a0c078012 100644
--- a/src/runtime/virtcontainers/sandbox.go
+++ b/src/runtime/virtcontainers/sandbox.go
@@ -36,7 +36,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
- hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
@@ -106,15 +105,10 @@ type HypervisorPidKey struct{}
// SandboxStatus describes a sandbox status.
type SandboxStatus struct {
- ContainersStatus []ContainerStatus
-
- // Annotations allow clients to store arbitrary values,
- // for example to add additional status values required
- // to support particular specifications.
- Annotations map[string]string
-
+ // Annotations allow clients to store arbitrary values,
+ // for example to add additional status values required
+ // to support particular specifications.
+ Annotations map[string]string
ID string
Hypervisor HypervisorType
+ ContainersStatus []ContainerStatus
State types.SandboxState
HypervisorConfig HypervisorConfig
}
@@ -530,6 +524,7 @@ func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Fac
return s, nil
}
+//nolint:gocyclo
func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) {
span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
defer span.End()
@@ -630,22 +625,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
- coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
- var devs []config.DeviceInfo
+ coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
+ // Aggregate all the container devices for hot-plug and use them to deduce
+ // the correct number of ports to reserve for the hypervisor.
+ hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
+
+ var vfioDevices []config.DeviceInfo
+ // The vhost-user-block device is a PCIe device in Virt; keep track of
+ // it so the correct number of PCIe root ports is reserved.
+ var vhostUserBlkDevices []config.DeviceInfo
+
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
- if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
+
+ if deviceManager.IsVhostUserBlk(device) {
+ vhostUserBlkDevices = append(vhostUserBlkDevices, device)
+ continue
+ }
+ isVFIO := deviceManager.IsVFIO(device.ContainerPath)
+ if hotPlugVFIO && isVFIO {
+ vfioDevices = append(vfioDevices, device)
+ sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
+ }
+ if coldPlugVFIO && isVFIO {
device.ColdPlug = true
- devs = append(devs, device)
+ device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
+ vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
- infos := sandboxConfig.Containers[cnt].DeviceInfos
- infos = append(infos[:dev], infos[dev+1:]...)
- sandboxConfig.Containers[cnt].DeviceInfos = infos
+ sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
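+		// Filter in a second pass: deleting entries while ranging over the
+		// slice would skip the element following each removal.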
+ var filteredDevices []config.DeviceInfo
+ for _, device := range containers.DeviceInfos {
+ if device.ID != "remove-we-are-cold-plugging" {
+ filteredDevices = append(filteredDevices, device)
+ }
+ }
+ sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
+
}
+ sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
+ sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
@@ -660,7 +682,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
- for _, dev := range devs {
+ for _, dev := range vfioDevices {
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
@@ -1723,7 +1745,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
defer span.End()
for i := range s.config.Containers {
-
c, err := newContainer(ctx, s, &s.config.Containers[i])
if err != nil {
return err
@@ -1742,7 +1763,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.updateResources(ctx); err != nil {
return err
}
-
if err := s.resourceControllerUpdate(ctx); err != nil {
return err
}
@@ -1754,7 +1774,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.storeSandbox(ctx); err != nil {
return err
}
-
return nil
}
@@ -1918,15 +1937,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
// adding a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
- bdf := ""
- if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
- bdf = pciDevice.BDF
- }
s.Logger().
WithFields(logrus.Fields{
"sandbox": s.id,
- "vfio-device-ID": (*dev).GetID(),
- "vfio-device-BDF": bdf,
+ "vfio-device-ID": dev.ID,
+ "vfio-device-BDF": dev.BDF,
}).WithError(err).Error("failed to hotplug VFIO device")
return err
}
@@ -1941,6 +1956,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
return err
case config.VhostUserBlk:
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
if !ok {
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
}
@@ -1975,15 +1991,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
// remove a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
- bdf := ""
- if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
- bdf = pciDevice.BDF
- }
s.Logger().WithError(err).
WithFields(logrus.Fields{
"sandbox": s.id,
- "vfio-device-ID": (*dev).GetID(),
- "vfio-device-BDF": bdf,
+ "vfio-device-ID": dev.ID,
+ "vfio-device-BDF": dev.BDF,
}).Error("failed to hot unplug VFIO device")
return err
}
diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go
index de3b1885c..90a2af7ee 100644
--- a/src/runtime/virtcontainers/sandbox_test.go
+++ b/src/runtime/virtcontainers/sandbox_test.go
@@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
_, err = os.Create(deviceFile)
assert.Nil(t, err)
- savedIOMMUPath := config.SysIOMMUPath
- config.SysIOMMUPath = tmpDir
+ savedIOMMUPath := config.SysIOMMUGroupPath
+ config.SysIOMMUGroupPath = tmpDir
defer func() {
- config.SysIOMMUPath = savedIOMMUPath
+ config.SysIOMMUGroupPath = savedIOMMUPath
}()
dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil)
diff --git a/tests/common.bash b/tests/common.bash
index d4aa44684..090bd7fd9 100644
--- a/tests/common.bash
+++ b/tests/common.bash
@@ -240,3 +240,17 @@ restart_containerd_service() {
clean_env_ctr
return 0
}
+
+# @path_results: path to the input metric-results folder
+# @tarball_fname: path and filename to the output tarball
+function compress_metrics_results_dir()
+{
+ local path_results="${1:-results}"
+ local tarball_fname="${2:-}"
+
+ [ -z "${tarball_fname}" ] && die "Missing the output tarball filename."
+ [ ! -d "${path_results}" ] && die "Results folder '${path_results}' does not exist."
+
+ cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd -
+ info "tarball generated: ${tarball_fname}"
+}
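+
+# Example usage (illustrative values, mirroring how the metrics CI calls it):
+#   compress_metrics_results_dir "tests/metrics/results" "${GITHUB_WORKSPACE}/results-qemu.tar.gz"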
diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh
index 103ce2cda..c5cd573d6 100755
--- a/tests/integration/gha-run.sh
+++ b/tests/integration/gha-run.sh
@@ -31,13 +31,16 @@ function login_azure() {
}
function create_cluster() {
+ # First, delete any cluster left behind by a previous run that failed to clean up.
+ delete_cluster || true
+
az aks create \
-g "kataCI" \
-n "$(_print_cluster_name)" \
-s "Standard_D4s_v5" \
--node-count 1 \
--generate-ssh-keys \
- $([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku mariner --workload-runtime KataMshvVmIsolation")
+ $([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku AzureLinux --workload-runtime KataMshvVmIsolation")
}
function install_bats() {
@@ -55,10 +58,28 @@ function get_cluster_credentials() {
-n "$(_print_cluster_name)"
}
+function ensure_yq() {
+ : "${GOPATH:=${GITHUB_WORKSPACE}}"
+ export GOPATH
+ export PATH="${GOPATH}/bin:${PATH}"
+ INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh"
+}
+
function run_tests() {
platform="${1}"
+ ensure_yq
+
+ # Ensure we're in the default namespace
+ kubectl config set-context --current --namespace=default
+
+ # Delete any spurious tests namespace that was left behind
+ kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
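+
+ # For cbl-mariner hosts, expose the host OS to the kata-deploy daemonset by
+ # appending a HOST_OS environment variable to its container spec below.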
+ if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then
+ yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS"
+ yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}"
+ fi
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image"
@@ -80,6 +101,10 @@ function run_tests() {
sleep 60s
fi
+ # Create a new namespace for the tests and switch to it
+ kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml
+ kubectl config set-context --current --namespace=kata-containers-k8s-tests
+
pushd "${integration_dir}/kubernetes"
bash setup.sh
bash run_kubernetes_tests.sh
@@ -89,6 +114,10 @@ function run_tests() {
function cleanup() {
platform="${1}"
+ # Switch back to the default namespace and delete the tests one
+ kubectl config set-context --current --namespace=default
+ kubectl delete namespace kata-containers-k8s-tests
+
if [ "${platform}" = "tdx" ]; then
deploy_spec="-k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s""
cleanup_spec="-k "${tools_dir}/packaging/kata-deploy/kata-cleanup/overlays/k3s""
@@ -115,11 +144,12 @@ function delete_cluster() {
az aks delete \
-g "kataCI" \
-n "$(_print_cluster_name)" \
- --yes \
- --no-wait
+ --yes
}
function main() {
+ export KATA_HOST_OS="${KATA_HOST_OS:-}"
+
action="${1:-}"
case "${action}" in
diff --git a/tests/integration/kubernetes/k8s-pod-quota.bats b/tests/integration/kubernetes/k8s-pod-quota.bats
index addc37bb3..d9a527725 100644
--- a/tests/integration/kubernetes/k8s-pod-quota.bats
+++ b/tests/integration/kubernetes/k8s-pod-quota.bats
@@ -14,13 +14,12 @@ setup() {
@test "Pod quota" {
resource_name="pod-quota"
deployment_name="deploymenttest"
- namespace="test-quota-ns"
# Create the resourcequota
kubectl create -f "${pod_config_dir}/resource-quota.yaml"
# View information about resourcequota
- kubectl get -n "$namespace" resourcequota "$resource_name" \
+ kubectl get resourcequota "$resource_name" \
--output=yaml | grep 'pods: "2"'
# Create deployment
@@ -28,10 +27,9 @@ setup() {
# View deployment
kubectl wait --for=condition=Available --timeout=$timeout \
- -n "$namespace" deployment/${deployment_name}
+ deployment/${deployment_name}
}
teardown() {
- kubectl delete -n "$namespace" deployment "$deployment_name"
kubectl delete -f "${pod_config_dir}/resource-quota.yaml"
}
diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh
index 0975ec0d5..db1e16633 100644
--- a/tests/integration/kubernetes/run_kubernetes_tests.sh
+++ b/tests/integration/kubernetes/run_kubernetes_tests.sh
@@ -54,10 +54,6 @@ else
)
fi
-if [ ${KATA_HOST_OS} == "cbl-mariner" ]; then
- exit 0
-fi
-
# we may need to skip a few test cases when running on non-x86_64 arch
arch_config_file="${kubernetes_dir}/filter_out_per_arch/${TARGET_ARCH}.yaml"
if [ -f "${arch_config_file}" ]; then
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml
index 680577a5f..6341b0b1f 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml
+++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml
@@ -6,7 +6,6 @@
apiVersion: v1
kind: Pod
metadata:
- namespace: default
name: custom-dns-test
spec:
terminationGracePeriodSeconds: 0
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml
index 672c54e68..90fc28667 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml
+++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml
@@ -8,7 +8,6 @@ apiVersion: v1
kind: Pod
metadata:
name: pod-oom
- namespace: default
spec:
runtimeClassName: kata
restartPolicy: Never
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml
index ecdaf5e64..9383349d7 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml
+++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml
@@ -7,7 +7,6 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: deploymenttest
- namespace: test-quota-ns
spec:
selector:
matchLabels:
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml b/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml
index a8d84d9ad..8ae0a1998 100644
--- a/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml
+++ b/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml
@@ -14,7 +14,6 @@ items:
kind: ResourceQuota
metadata:
name: pod-quota
- namespace: test-quota-ns
spec:
hard:
pods: "2"
diff --git a/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml b/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml
new file mode 100644
index 000000000..916003d13
--- /dev/null
+++ b/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: kata-containers-k8s-tests
diff --git a/tests/integration/kubernetes/setup.sh b/tests/integration/kubernetes/setup.sh
index 0c3baf2dc..614f38827 100755
--- a/tests/integration/kubernetes/setup.sh
+++ b/tests/integration/kubernetes/setup.sh
@@ -13,8 +13,24 @@ set_runtime_class() {
sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml
}
+set_kernel_path() {
+ if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
+ mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin"
+ find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \;
+ fi
+}
+
+set_initrd_path() {
+ if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
+ initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img"
+ find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \;
+ fi
+}
+
main() {
set_runtime_class
+ set_kernel_path
+ set_initrd_path
}
main "$@"
diff --git a/tests/metrics/README.md b/tests/metrics/README.md
index 1dd996046..d017ed3fc 100644
--- a/tests/metrics/README.md
+++ b/tests/metrics/README.md
@@ -55,6 +55,8 @@ For further details see the [time tests documentation](time).
Tests that measure the size and overheads of the runtime. Generally this is looking at
memory footprint sizes, but could also cover disk space or even CPU consumption.
+For further details see the [density tests documentation](density).
+
### Networking
Tests relating to networking. General items could include:
diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
new file mode 100644
index 000000000..562b2c83b
--- /dev/null
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This file contains baseline expectations
+# for checked results by checkmetrics tool.
+#
+# values set specifically for packet.com c1.small worker.
+
+[[metric]]
+name = "boot-times"
+type = "json"
+description = "measure container lifecycle timings"
+# Min and Max values to set a 'range' that
+# the mean of the JSON Results data must fall
+# within (inclusive)
+checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
+checktype = "mean"
+midval = 0.42
+minpercent = 20.0
+maxpercent = 20.0
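+# With these values the check passes when the computed mean lies within
+# roughly [0.336, 0.504], assuming checkmetrics applies the percentages
+# as a band of midval -/+ 20% around midval.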
+
+[[metric]]
+name = "memory-footprint"
+type = "json"
+description = "measure memory usage"
+# Min and Max values to set a 'range' that
+# the mean of the JSON Results data must fall
+# within (inclusive)
+checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
+checktype = "mean"
+midval = 2518364.00
+minpercent = 20.0
+maxpercent = 20.0
diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
new file mode 100644
index 000000000..c6bc85147
--- /dev/null
+++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml
@@ -0,0 +1,34 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# This file contains baseline expectations
+# for checked results by checkmetrics tool.
+#
+# values set specifically for Equinix m3.small.x86.
+
+[[metric]]
+name = "boot-times"
+type = "json"
+description = "measure container lifecycle timings"
+# Min and Max values to set a 'range' that
+# the mean of the JSON Results data must fall
+# within (inclusive)
+checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
+checktype = "mean"
+midval = 0.61
+minpercent = 20.0
+maxpercent = 20.0
+
+[[metric]]
+name = "memory-footprint"
+type = "json"
+description = "measure memory usage"
+# Min and Max values to set a 'range' that
+# the mean of the JSON Results data must fall
+# within (inclusive)
+checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
+checktype = "mean"
+midval = 2435844.00
+minpercent = 20.0
+maxpercent = 20.0
diff --git a/tests/metrics/density/README.md b/tests/metrics/density/README.md
new file mode 100644
index 000000000..e07ee18b3
--- /dev/null
+++ b/tests/metrics/density/README.md
@@ -0,0 +1,53 @@
+# Kata Containers density metrics tests
+
+This directory contains a number of tests to help measure container
+memory footprint. Some measures are based around the
+[PSS](https://en.wikipedia.org/wiki/Proportional_set_size) of the runtime
+components, and others look at the system level (`free` and `/proc/meminfo`
+for instance) impact.
+
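+As a rough sketch of the kind of measurement involved (this mirrors the
+`smem` helper the scripts use internally; the `^qemu` pattern is purely
+illustrative):
+
+```
+# PSS total, in KB, of all processes whose command matches '^qemu'
+sudo smem -t -P "^qemu" | tail -1 | awk '{print $5}'
+```
+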
+## `memory_usage`
+
+This test measures the PSS footprint of the runtime components whilst
+launching a number of small ([BusyBox](https://hub.docker.com/_/busybox/)) containers
+using ctr.
+
+## `fast_footprint`
+
+This test takes system level resource measurements after launching a number of
+containers in parallel and optionally waiting for KSM to settle its memory
+compaction cycles.
+
+The script is quite configurable via environment variables, including:
+
+* Which container workload to run.
+* How many containers to launch.
+* How many containers are launched in parallel.
+* How long to wait until taking the measures.
+
+See the script itself for more details.
+
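+A minimal invocation sketch (the values are illustrative; the variables are
+the ones listed above):
+
+```
+NUM_CONTAINERS=50 PARALLELISM=5 sudo -E bash ./density/fast_footprint.sh
+```
+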
+This test shares many config options with the `footprint_data` test. Thus, referring
+to the [footprint test documentation](footprint_data.md) may be useful.
+
+> *Note:* If this test finds KSM is enabled on the host, it will wait for KSM
+> to "settle" before taking the final measurement. If your KSM is not configured
+> to process all the allocated VM memory fast enough, the test will hit a timeout
+> and proceed to take the final measurement anyway.
+
+## `footprint_data`
+
+Similar to the `fast_footprint` test, but this test launches the containers
+sequentially and takes a system level measurement between each launch. Thus,
+this test provides finer grained information on system scaling, but takes
+significantly longer to run than the `fast_footprint` test. If you are only
+interested in the final figure or the average impact, you may be better running
+the `fast_footprint` test.
+
+For more details see the [footprint test documentation](footprint_data.md).
+
+## `memory_usage_inside_container`
+
+Measures the memory statistics *inside* the container. This allows evaluation of
+the overhead the VM kernel and rootfs are having on the memory that was requested
+by the container co-ordination system, and thus supplied to the VM.
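+
+For instance (a sketch; an iteration count of 5 matches what the metrics CI
+currently passes in):
+
+```
+sudo -E bash ./density/memory_usage_inside_container.sh 5
+```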
diff --git a/tests/metrics/density/fast_footprint.sh b/tests/metrics/density/fast_footprint.sh
new file mode 100755
index 000000000..9c84f57fc
--- /dev/null
+++ b/tests/metrics/density/fast_footprint.sh
@@ -0,0 +1,433 @@
+#!/bin/bash
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# A script to gather memory 'footprint' information as we launch more
+# and more containers
+#
+# The script gathers information about both user and kernel space consumption
+# Output is into a .json file, named using some of the config component names
+# (such as footprint-busybox.json)
+
+# Pull in some common, useful, items
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+# Note that all vars that can be set from outside the script (that is,
+# passed in the ENV), use the ':-' setting to allow being over-ridden
+
+# Default sleep, in seconds, to let containers come up and finish their
+# initialisation before we take the measures. Some of the larger
+# containers can take a number of seconds to get running.
+PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
+
+# How long, in seconds, do we wait for KSM to 'settle down', before we
+# timeout and just continue anyway.
+KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}"
+
+# How long, in seconds, do we poll for ctr to complete launching all the
+# containers?
+CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}"
+
+# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP
+# nap
+PARALLELISM="${PARALLELISM:-10}"
+
+### The default config - run a small busybox image
+# Define what we will be running (app under test)
+# Default is we run busybox, as a 'small' workload
+PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
+PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
+
+###
+# which RUNTIME we use is picked up from the env in
+# common.bash. You can over-ride by setting RUNTIME in your env
+
+###
+# Define the cutoff checks for when we stop running the test
+ # Run up to this many containers
+NUM_CONTAINERS="${NUM_CONTAINERS:-100}"
+ # Run until we have consumed this much memory (from MemFree)
+MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}"
+ # Run until we have this much MemFree left
+MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
+
+# Tools we need to have installed in order to operate
+REQUIRED_COMMANDS="smem awk"
+
+# If we 'dump' the system caches before we measure then we get less
+# noise in the results - they then show more clearly what our un-reclaimable footprint is
+DUMP_CACHES="${DUMP_CACHES:-1}"
+
+# Affects the name of the file to store the results in
+TEST_NAME="${TEST_NAME:-fast-footprint-busybox}"
+
+############# end of configurable items ###################
+
+# vars to remember where we started so we can calc diffs
+base_mem_avail=0
+base_mem_free=0
+
+# dump the kernel caches, so we get a more precise (or just different)
+# view of what our footprint really is.
+function dump_caches() {
+ sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
+}
+
+function init() {
+ restart_containerd_service
+
+ check_cmds $REQUIRED_COMMANDS
+ sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
+
+ # Modify the test name if running with KSM enabled
+ check_for_ksm
+
+ # Use the common init func to get to a known state
+ init_env
+
+ # Prepare to start storing results
+ metrics_json_init
+
+ # Store up baseline measures
+ base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+ base_mem_free=$(get_memfree)
+
+ # Store our configuration for this run
+ save_config
+}
+
+save_config(){
+ metrics_json_start_array
+
+ local json="$(cat << EOF
+ {
+ "testname": "${TEST_NAME}",
+ "payload": "${PAYLOAD}",
+ "payload_args": "${PAYLOAD_ARGS}",
+ "payload_sleep": ${PAYLOAD_SLEEP},
+ "ksm_settle_time": ${KSM_WAIT_TIME},
+ "num_containers": ${NUM_CONTAINERS},
+ "parallelism": ${PARALLELISM},
+ "max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
+ "min_memory_free": "${MIN_MEMORY_FREE}",
+ "dump_caches": "${DUMP_CACHES}"
+ }
+EOF
+)"
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Config"
+}
+
+function cleanup() {
+ # Finish storing the results
+ metrics_json_save
+
+ clean_env_ctr
+}
+
+# helper function to get USS of process in arg1
+function get_proc_uss() {
+ item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
+ ((item*=1024))
+ echo $item
+}
+
+# helper function to get PSS of process in arg1
+function get_proc_pss() {
+ item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
+ ((item*=1024))
+ echo $item
+}
+
+# Get the PSS for the whole of userspace (all processes)
+# This allows us to see if we had any impact on the rest of the system, for instance
+# containerd grows as we launch containers, so we should account for that in our total
+# memory breakdown
+function grab_all_pss() {
+ item=$(sudo smem -t | tail -1 | awk '{print $5}')
+ ((item*=1024))
+
+ local json="$(cat << EOF
+ "all_pss": {
+ "pss": $item,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function grab_user_smem() {
+ # userspace
+ item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
+ ((item*=1024))
+
+ local json="$(cat << EOF
+ "user_smem": {
+ "userspace": $item,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function grab_slab() {
+ # Grabbing slab total from meminfo is easier than doing the math
+ # on slabinfo
+ item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
+ ((item*=1024))
+
+ local json="$(cat << EOF
+ "slab": {
+ "slab": $item,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function get_memfree() {
+ mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
+ ((mem_free*=1024))
+ echo $mem_free
+}
+
+function grab_system() {
+
+ # avail memory, from 'free'
+ local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+ local avail_decr=$((base_mem_avail-avail))
+
+ # cached memory, from 'free'
+ local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
+
+ # free memory from smem
+ local smem_free=$(get_memfree)
+ local free_decr=$((base_mem_free-smem_free))
+
+ # Anon pages
+ local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
+ ((anon*=1024))
+
+ # Mapped pages
+ local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
+ ((mapped*=1024))
+
+ # Cached
+ local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
+ ((meminfo_cached*=1024))
+
+ local json="$(cat << EOF
+ "system": {
+ "avail": $avail,
+ "avail_decr": $avail_decr,
+ "cached": $cached,
+ "smem_free": $smem_free,
+ "free_decr": $free_decr,
+ "anon": $anon,
+ "mapped": $mapped,
+ "meminfo_cached": $meminfo_cached,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function grab_stats() {
+ # If configured, dump the caches so we get a more stable
+ # view of what our static footprint really is
+ if [[ "$DUMP_CACHES" ]] ; then
+ dump_caches
+ fi
+
+ # user space data
+ # PSS taken all userspace
+ grab_all_pss
+ # user as reported by smem
+ grab_user_smem
+
+ # System overview data
+ # System free and cached
+ grab_system
+
+ # kernel data
+ # The 'total kernel space taken' we can work out as:
+ # ktotal = ((free-avail)-user)
+ # So, we don't grab that number from smem, as that is what it does
+ # internally anyhow.
+ # Still try to grab any finer kernel details that we can though
+
+ # totals from slabinfo
+ grab_slab
+
+ metrics_json_close_array_element
+}
+
+function check_limits() {
+ mem_free=$(get_memfree)
+ if ((mem_free <= MIN_MEMORY_FREE)); then
+ echo 1
+ return
+ fi
+
+ mem_consumed=$((base_mem_avail-mem_free))
+ if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
+ echo 1
+ return
+ fi
+
+ echo 0
+}
+
+launch_containers() {
+ local parloops leftovers
+
+ (( parloops=${NUM_CONTAINERS}/${PARALLELISM} ))
+ (( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) ))
+
+ echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} etras"
+
+ containers=()
+
+ local iter n
+ for iter in $(seq 1 $parloops); do
+ echo "Launch iteration ${iter}"
+ for n in $(seq 1 $PARALLELISM); do
+ containers+=($(random_name))
+ sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
+ done
+
+ if [[ $PAYLOAD_SLEEP ]]; then
+ sleep $PAYLOAD_SLEEP
+ fi
+
+ # check if we have hit one of our limits and need to wrap up the tests
+ if (($(check_limits))); then
+ echo "Ran out of resources, check_limits failed"
+ return
+ fi
+ done
+
+ for n in $(seq 1 $leftovers); do
+ containers+=($(random_name))
+ sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
+ done
+}
+
+wait_containers() {
+ local t numcontainers
+ # nap 3s between checks
+ local step=3
+
+ for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do
+
+ numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l)
+
+ if (( numcontainers >= ${NUM_CONTAINERS} )); then
+ echo "All containers now launched (${t}s)"
+ return
+ else
+ echo "Waiting for containers to launch (${numcontainers} at ${t}s)"
+ fi
+ sleep ${step}
+ done
+
+ echo "Timed out waiting for containers to launch (${t}s)"
+ cleanup
+ die "Timed out waiting for containers to launch (${t}s)"
+}
+
+function go() {
+ # Init the json cycle for this save
+ metrics_json_start_array
+
+ # Grab the first set of stats before we run any containers.
+ grab_stats
+
+ launch_containers
+ wait_containers
+
+ if [ $ksm_on == "1" ]; then
+ echo "Wating for KSM to settle..."
+ wait_ksm_settle ${KSM_WAIT_TIME}
+ fi
+
+ grab_stats
+
+ # Wrap up the results array
+ metrics_json_end_array "Results"
+}
+
+function show_vars()
+{
+ echo -e "\nEvironment variables:"
+ echo -e "\tName (default)"
+ echo -e "\t\tDescription"
+ echo -e "\tPAYLOAD (${PAYLOAD})"
+ echo -e "\t\tThe ctr image to run"
+ echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
+ echo -e "\t\tAny extra arguments passed into the docker 'run' command"
+ echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
+ echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
+ echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})"
+ echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure"
+ echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})"
+ echo -e "\t\tSeconds to poll for ctr to finish launching containers"
+ echo -e "\tPARALLELISM (${PARALLELISM})"
+ echo -e "\t\tNumber of containers we launch in parallel"
+ echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})"
+ echo -e "\t\tThe total number of containers to run"
+ echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
+ echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
+ echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
+ echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
+ echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
+ echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
+ echo -e "\tTEST_NAME (${TEST_NAME})"
+ echo -e "\t\tCan be set to over-ride the default JSON results filename"
+
+}
+
+function help()
+{
+ usage=$(cat << EOF
+Usage: $0 [-h] [options]
+ Description:
+ Launch a series of workloads and take memory metric measurements after
+ each launch.
+ Options:
+ -h, Help page.
+EOF
+)
+ echo "$usage"
+ show_vars
+}
+
+function main() {
+
+ local OPTIND
+ while getopts "h" opt;do
+ case ${opt} in
+ h)
+ help
+ exit 0;
+ ;;
+ esac
+ done
+ shift $((OPTIND-1))
+
+ init
+ go
+ cleanup
+}
+
+main "$@"
diff --git a/tests/metrics/density/footprint_data.md b/tests/metrics/density/footprint_data.md
new file mode 100644
index 000000000..b9ba27fe0
--- /dev/null
+++ b/tests/metrics/density/footprint_data.md
@@ -0,0 +1,87 @@
+# Footprint data script details
+
+The `footprint_data.sh` script runs a number of identical containers sequentially
+via ctr and takes a number of memory related measurements after each
+launch. The script is generally not used in a CI type environment, but is intended
+to be run and analyzed manually.
+
+You can configure the script by setting a number of environment variables.
+
+The following sections list details of the configurable variables, along with a
+small example invocation script.
+
+## Variables
+Environment variables can take effect in two ways.
+
+Some variables affect how the payload is executed. The `CONTAINERD_RUNTIME` and
+`PAYLOAD` variables directly affect the payload execution with the following line in
+the script:
+
+`$ ctr run --memory-limit $PAYLOAD_RUNTIME_ARGS --rm --runtime=$CONTAINERD_RUNTIME $PAYLOAD $NAME sh -c $PAYLOAD_ARGS`
+
+Other settings affect how memory footprint is measured and the test termination
+conditions.
+
+| Variable | Function
+| -------- | --------
+| `PAYLOAD` | The ctr image to run
+| `PAYLOAD_ARGS` | Any arguments passed into the ctr image
+| `PAYLOAD_RUNTIME_ARGS` | Any extra arguments passed into the ctr `run` command
+| `PAYLOAD_SLEEP` | Seconds to sleep between launch and measurement, to allow settling
+| `MAX_NUM_CONTAINERS` | The maximum number of containers to run before terminating
+| `MAX_MEMORY_CONSUMED` | The maximum amount of memory to be consumed before terminating
+| `MIN_MEMORY_FREE` | The minimum amount of memory allowed to be free before terminating
+| `DUMP_CACHES` | A flag to note if the system caches should be dumped before capturing stats
+| `DATAFILE` | Can be set to over-ride the default JSON results filename
+
+## Output files
+The names of the JSON files generated by the test are dictated by some of the parameters
+the test is utilising. The default filename is generated in the form of:
+`footprint-${PAYLOAD}[-ksm].json`
+
+## Measurements
+The test measures, calculates, and stores a number of data items:
+
+| Item | Description
+| ---- | -----------
+| `uss` | USS for all the VM runtime components
+| `pss` | PSS for all the VM runtime components
+| `all_pss` | PSS of all of userspace - to monitor if we had other impact on the system
+| `user_smem` | `smem` "userspace" consumption value
+| `avail` | "available" memory from `free`
+| `avail_decr` | "available" memory decrease since start of test
+| `cached` | "Cached" memory from `/proc/meminfo`
+| `smem_free` | Free memory as reported by `smem`
+| `free_decr` | Decrease in Free memory reported by `smem` since start of test
+| `anon` | `AnonPages` as reported from `/proc/meminfo`
+| `mapped` | Mapped pages as reported from `/proc/meminfo`
+| `meminfo_cached` | Cached pages as reported from `/proc/meminfo`
+| `slab` | Slab as reported from `/proc/meminfo`
+
+## Example script
+The following script is an example of how to configure the environment variables and
+invoke the test script to run a number of different container tests.
+
+```
+#!/bin/bash
+
+set -e
+set -x
+
+export MAX_NUM_CONTAINERS=10
+export MAX_MEMORY_CONSUMED=6*1024*1024*1024
+
+function run() {
+ ###
+ # Define what we will be running (app under test)
+ # Default is we run busybox, as a 'small' workload
+ export PAYLOAD="quay.io/prometheus/busybox:latest"
+ export PAYLOAD_ARGS="tail -f /dev/null"
+ export PAYLOAD_SLEEP=10
+ export PAYLOAD_RUNTIME_ARGS="5120"
+ sudo -E bash $(pwd)/density/footprint_data.sh
+}
+
+export CONTAINERD_RUNTIME=io.containerd.kata.v2
+run
+```
diff --git a/tests/metrics/density/footprint_data.sh b/tests/metrics/density/footprint_data.sh
new file mode 100755
index 000000000..f5fc11341
--- /dev/null
+++ b/tests/metrics/density/footprint_data.sh
@@ -0,0 +1,360 @@
+#!/bin/bash
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# A script to gather memory 'footprint' information as we launch more
+# and more containers
+#
+# The script gathers information about both user and kernel space consumption
+# Output is into a .json file, named using some of the config component names
+# (such as footprint-busybox.json)
+
+# Pull in some common, useful, items
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
+
+# Note that all vars that can be set from outside the script (that is,
+# passed in the ENV), use the ':-' setting to allow being over-ridden
+
+# Default sleep for 10s to let containers come up and finish their
+# initialisation before we take the measures. Some of the larger
+# containers can take a number of seconds to get running.
+PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
+
+### The default config - run a small busybox image
+# Define what we will be running (app under test)
+# Default is we run busybox, as a 'small' workload
+PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
+PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
+
+###
+# Define the cutoff checks for when we stop running the test
+ # Run up to this many containers
+MAX_NUM_CONTAINERS="${MAX_NUM_CONTAINERS:-10}"
+ # Run until we have consumed this much memory (from MemFree)
+MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-6*1024*1024*1024}"
+ # Run until we have this much MemFree left
+MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
+
+# Tools we need to have installed in order to operate
+REQUIRED_COMMANDS="smem awk"
+
+# If we 'dump' the system caches before we measure then we get less
+# noise in the results - they then show more clearly what our un-reclaimable footprint is
+DUMP_CACHES="${DUMP_CACHES:-1}"
+
+# Affects the name of the file to store the results in
+TEST_NAME="${TEST_NAME:-footprint-busybox}"
+
+############# end of configurable items ###################
+
+# vars to remember where we started so we can calc diffs
+base_mem_avail=0
+base_mem_free=0
+
+# dump the kernel caches, so we get a more precise (or just different)
+# view of what our footprint really is.
+function dump_caches() {
+ sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
+}
+
+function init() {
+ restart_containerd_service
+
+ check_cmds $REQUIRED_COMMANDS
+ sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
+
+ # Modify the test name if running with KSM enabled
+ check_for_ksm
+
+ # Use the common init func to get to a known state
+ init_env
+
+ # Prepare to start storing results
+ metrics_json_init
+
+ # Store up baseline measures
+ base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+ base_mem_free=$(get_memfree)
+
+ # Store our configuration for this run
+ save_config
+}
+
+save_config(){
+ metrics_json_start_array
+
+ local json="$(cat << EOF
+ {
+ "testname": "${TEST_NAME}",
+ "payload": "${PAYLOAD}",
+ "payload_args": "${PAYLOAD_ARGS}",
+ "payload_sleep": ${PAYLOAD_SLEEP},
+ "max_containers": ${MAX_NUM_CONTAINERS},
+ "max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
+ "min_memory_free": "${MIN_MEMORY_FREE}",
+ "dump_caches": "${DUMP_CACHES}"
+ }
+EOF
+)"
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Config"
+}
+
+function cleanup() {
+ # Finish storing the results
+ metrics_json_save
+
+ clean_env_ctr
+}
+
+# helper function to get USS of process in arg1
+function get_proc_uss() {
+ item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
+ ((item*=1024))
+ echo $item
+}
+
+# helper function to get PSS of process in arg1
+function get_proc_pss() {
+ item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
+ ((item*=1024))
+ echo $item
+}
+
+# Get the PSS for the whole of userspace (all processes)
+# This allows us to see if we had any impact on the rest of the system, for instance
+# containerd grows as we launch containers, so we should account for that in our total
+# memory breakdown
+function grab_all_pss() {
+ item=$(sudo smem -t | tail -1 | awk '{print $5}')
+ ((item*=1024))
+
+ local json="$(cat << EOF
+ "all_pss": {
+ "pss": $item,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function grab_user_smem() {
+ # userspace
+ item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
+ ((item*=1024))
+
+ local json="$(cat << EOF
+ "user_smem": {
+ "userspace": $item,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function grab_slab() {
+ # Grabbing slab total from meminfo is easier than doing the math
+ # on slabinfo
+ item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
+ ((item*=1024))
+
+ local json="$(cat << EOF
+ "slab": {
+ "slab": $item,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function get_memfree() {
+ mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
+ ((mem_free*=1024))
+ echo $mem_free
+}
+
+function grab_system() {
+ # avail memory, from 'free'
+ local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+ local avail_decr=$((base_mem_avail-avail))
+
+ # cached memory, from 'free'
+ local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
+
+ # free memory from smem
+ local smem_free=$(get_memfree)
+ local free_decr=$((base_mem_free-smem_free))
+
+ # Anon pages
+ local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
+ ((anon*=1024))
+
+ # Mapped pages
+ local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
+ ((mapped*=1024))
+
+ # Cached
+ local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
+ ((meminfo_cached*=1024))
+
+ local json="$(cat << EOF
+ "system": {
+ "avail": $avail,
+ "avail_decr": $avail_decr,
+ "cached": $cached,
+ "smem_free": $smem_free,
+ "free_decr": $free_decr,
+ "anon": $anon,
+ "mapped": $mapped,
+ "meminfo_cached": $meminfo_cached,
+ "Units": "KB"
+ }
+EOF
+)"
+
+ metrics_json_add_array_fragment "$json"
+}
+
+function grab_stats() {
+ # If configured, dump the caches so we get a more stable
+ # view of what our static footprint really is
+ if [[ "$DUMP_CACHES" ]] ; then
+ dump_caches
+ fi
+
+ # user space data
+ # PSS taken all userspace
+ grab_all_pss
+ # user as reported by smem
+ grab_user_smem
+
+ # System overview data
+ # System free and cached
+ grab_system
+
+ # kernel data
+ # The 'total kernel space taken' we can work out as:
+ # ktotal = ((free-avail)-user)
+ # So, we don't grab that number from smem, as that is what it does
+ # internally anyhow.
+ # Still try to grab any finer kernel details that we can though
+
+ # totals from slabinfo
+ grab_slab
+
+ metrics_json_close_array_element
+}
+
+function check_limits() {
+ mem_free=$(get_memfree)
+ if ((mem_free <= MIN_MEMORY_FREE)); then
+ echo 1
+ return
+ fi
+
+ mem_consumed=$((base_mem_avail-mem_free))
+ if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
+ echo 1
+ return
+ fi
+
+ echo 0
+}
+
+function go() {
+ # Init the json cycle for this save
+ metrics_json_start_array
+
+ containers=()
+
+ for i in $(seq 1 $MAX_NUM_CONTAINERS); do
+ containers+=($(random_name))
+ sudo -E "${CTR_EXE}" run --rm --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS
+
+ if [[ $PAYLOAD_SLEEP ]]; then
+ sleep $PAYLOAD_SLEEP
+ fi
+
+ grab_stats
+
+ # check if we have hit one of our limits and need to wrap up the tests
+ if (($(check_limits))); then
+ # Wrap up the results array
+ metrics_json_end_array "Results"
+ return
+ fi
+ done
+
+ # Wrap up the results array
+ metrics_json_end_array "Results"
+}
+
+
+function show_vars()
+{
+ echo -e "\nEvironment variables:"
+ echo -e "\tName (default)"
+ echo -e "\t\tDescription"
+ echo -e "\tPAYLOAD (${PAYLOAD})"
+ echo -e "\t\tThe ctr image to run"
+ echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
+ echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
+ echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
+ echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
+ echo -e "\tMAX_NUM_CONTAINERS (${MAX_NUM_CONTAINERS})"
+ echo -e "\t\tThe maximum number of containers to run before terminating"
+ echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
+ echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
+ echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
+ echo -e "\t\tThe path to the ctr binary (for 'smem' measurements)"
+ echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
+ echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
+ echo -e "\tTEST_NAME (${TEST_NAME})"
+ echo -e "\t\tCan be set to over-ride the default JSON results filename"
+
+}
+
+function help()
+{
+ usage=$(cat << EOF
+Usage: $0 [-h] [options]
+ Description:
+ Launch a series of workloads and take memory metric measurements after
+ each launch.
+ Options:
+ -h, Help page.
+EOF
+)
+ echo "$usage"
+ show_vars
+}
+
+function main() {
+
+ local OPTIND
+ while getopts "h" opt;do
+ case ${opt} in
+ h)
+ help
+ exit 0;
+ ;;
+ esac
+ done
+ shift $((OPTIND-1))
+
+ init
+ go
+ cleanup
+}
+
+main "$@"
diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh
new file mode 100755
index 000000000..57d2ce3cd
--- /dev/null
+++ b/tests/metrics/density/memory_usage.sh
@@ -0,0 +1,383 @@
+#!/bin/bash
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Description of the test:
+# This test launches a number of containers in idle mode,
+# then sleeps for a configurable period of time to allow
+# any memory optimisations to 'settle', and then checks the
+# amount of memory used by all the containers to come up with
+# an average (using the PSS measurements).
+# This test uses the smem tool to get the memory used.
+
+set -e
+
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+# Busybox image: Choose a small workload image, this is
+# in order to measure the runtime footprint, not the workload
+# footprint.
+IMAGE='quay.io/prometheus/busybox:latest'
+
+CMD='tail -f /dev/null'
+NUM_CONTAINERS="$1"
+WAIT_TIME="$2"
+AUTO_MODE="$3"
+TEST_NAME="memory footprint"
+SMEM_BIN="smem"
+KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
+MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX)
+PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX)
+
+function remove_tmp_file() {
+ rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}"
+}
+
+trap remove_tmp_file EXIT
+
+# Show help about this script
+function help(){
+cat << EOF
+Usage: $0 <num_containers> <wait_time> [auto]
+ Description:
+ <num_containers> : Number of containers to run.
+ <wait_time> : Time in seconds to wait before taking
+ metrics.
+ [auto] : Optional 'auto KSM settle' mode
+ waits for ksm pages_shared to settle down
+EOF
+}
+
+
+function get_runc_pss_memory(){
+ ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2"
+ get_pss_memory "${ctr_runc_shim_path}"
+}
+
+function get_runc_individual_memory() {
+ runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g')
+
+ # Verify runc process result
+ if [ -z "${runc_process_result}" ];then
+ die "Runc process not found"
+ fi
+
+ read -r -a runc_values <<< "${runc_process_result}"
+
+ metrics_json_start_array
+
+ local json="$(cat << EOF
+ {
+ "runc individual results": [
+ $(for ((i=0;i<"${NUM_CONTAINERS}";++i)); do
+ printf '%s\n\t\t\t' "${runc_values[i]}"
+ done)
+ ]
+ }
+EOF
+)"
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Raw results"
+}
+
+# This function measures the PSS average
+# memory of a process.
+function get_pss_memory(){
+ ps="$1"
+ mem_amount=0
+ count=0
+ avg=0
+
+ if [ -z "${ps}" ]; then
+ die "No argument to get_pss_memory()"
+ fi
+
+ # Save all the process names
+ # This will help us to retrieve the raw information
+ echo "${ps}" >> "${PS_TMP_FILE}"
+
+ data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//')
+
+ # Save all the smem results
+ # This will help us to retrieve raw information
+ echo "${data}" >> "${MEM_TMP_FILE}"
+
+ gral_data=$(echo "${data// /+}" | bc)
+ for i in "${gral_data}"; do
+ if (( $i > 0 ));then
+ mem_amount=$(( i + mem_amount ))
+ (( count++ ))
+ fi
+ done
+
+ if (( "${count}" > 0 ));then
+ avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
+ fi
+
+ echo "${avg}"
+}
+
+function ppid() {
+ local pid
+ pid=$(ps -p "${1:-nopid}" -o ppid=)
+ echo "${pid//[[:blank:]]/}"
+}
+
+# This function measures the PSS average
+# memory of virtiofsd.
+# It is a special case of get_pss_memory,
+# virtiofsd forks itself, so smem sees the process
+# twice; this function sums both pss values:
+# pss_virtiofsd=pss_fork + pss_parent
+function get_pss_memory_virtiofsd() {
+ mem_amount=0
+ count=0
+ avg=0
+
+ virtiofsd_path=${1:-}
+ if [ -z "${virtiofsd_path}" ]; then
+ die "virtiofsd_path not provided"
+ fi
+
+ echo "${virtiofsd_path}" >> "${PS_TMP_FILE}"
+
+ virtiofsd_pids=$(ps aux | grep "[v]irtiofsd" | awk '{print $2}' | head -1)
+ data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss")
+
+ for p in "${virtiofsd_pids}"; do
+ parent_pid=$(ppid "${p}")
+ cmd="$(cat /proc/${p}/cmdline | tr -d '\0')"
+ cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')"
+ if [ "${cmd}" != "${cmd_parent}" ]; then
+ pss_parent=$(printf "%s" "${data}" | grep "^\s*${p}" | awk '{print $2}')
+
+ fork=$(pgrep -P "${p}")
+
+ pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}')
+ pss_process=$((pss_fork + pss_parent))
+
+ # Save all the smem results
+ # This will help us to retrieve raw information
+ echo "${pss_process}" >>"${MEM_TMP_FILE}"
+
+ if ((pss_process > 0)); then
+ mem_amount=$((pss_process + mem_amount))
+ ((count++))
+ fi
+ fi
+ done
+
+ if (( "${count}" > 0 ));then
+ avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
+ fi
+ echo "${avg}"
+}
+
+function get_individual_memory(){
+ # Getting all the individual container information
+ first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
+ first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g')
+
+ second_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
+ second_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==2' | sed 's/ /, /g')
+
+ third_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
+ third_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==3' | sed 's/ /, /g')
+
+ read -r -a first_values <<< "${first_process_result}"
+ read -r -a second_values <<< "${second_process_result}"
+ read -r -a third_values <<< "${third_process_result}"
+
+ metrics_json_start_array
+
+ local json="$(cat << EOF
+ {
+ "${first_process_name} memory": [
+ $(for ((i=0;i<"${NUM_CONTAINERS}";++i)); do
+ [ -n "${first_values[i]}" ] &&
+ printf '%s\n\t\t\t' "${first_values[i]}"
+ done)
+ ],
+ "${second_process_name} memory": [
+ $(for ((i=0;i<"${NUM_CONTAINERS}";++i)); do
+ [ -n "${second_values[i]}" ] &&
+ printf '%s\n\t\t\t' "${second_values[i]}"
+ done)
+ ],
+ "${third_process_name} memory": [
+ $(for ((i=0;i<"${NUM_CONTAINERS}";++i)); do
+ [ -n "${third_values[i]}" ] &&
+ printf '%s\n\t\t\t' "${third_values[i]}"
+ done)
+ ]
+ }
+EOF
+)"
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Raw results"
+}
+
+# Try to work out the 'average memory footprint' of a container.
+function get_memory_usage(){
+ hypervisor_mem=0
+ virtiofsd_mem=0
+ shim_mem=0
+ memory_usage=0
+
+ containers=()
+
+ info "Creating ${NUM_CONTAINERS} containers"
+ for ((i=1; i<="${NUM_CONTAINERS}"; i++)); do
+ containers+=($(random_name))
+ sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" sh -c "${CMD}"
+ done
+
+ if [ "${AUTO_MODE}" == "auto" ]; then
+ if (( ksm_on != 1 )); then
+ die "KSM not enabled, cannot use auto mode"
+ fi
+
+ echo "Entering KSM settle auto detect mode..."
+ wait_ksm_settle "${WAIT_TIME}"
+ else
+ # If KSM is enabled, then you normally want to sleep long enough to
+ # let it do its work and for the numbers to 'settle'.
+ echo "napping ${WAIT_TIME} s"
+ sleep "${WAIT_TIME}"
+ fi
+
+ metrics_json_start_array
+ # Check the runtime in order to determine which process will
+ # be measured for PSS
+ if [ "${RUNTIME}" == "runc" ]; then
+ runc_workload_mem="$(get_runc_pss_memory)"
+ memory_usage="${runc_workload_mem}"
+
+ local json="$(cat << EOF
+ {
+ "average": {
+ "Result": ${memory_usage},
+ "Units" : "KB"
+ },
+ "runc": {
+ "Result": ${runc_workload_mem},
+ "Units" : "KB"
+ }
+ }
+EOF
+)"
+
+ else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ]
+ # Get PSS memory of VM runtime components.
+ # And check that the smem search has found the process - we get a "0"
+ # back if that procedure fails (such as if a process has changed its name
+ # or is not running when expected to be so)
+ # As an added bonus - this script must be run as root.
+ # Now if you do not have enough rights
+ # the smem failure to read the stats will also be trapped.
+
+ hypervisor_mem="$(get_pss_memory ${HYPERVISOR_PATH})"
+ if [ "${hypervisor_mem}" == "0" ]; then
+ die "Failed to find PSS for ${HYPERVISOR_PATH}"
+ fi
+
+ virtiofsd_mem="$(get_pss_memory_virtiofsd ${VIRTIOFSD_PATH})"
+ if [ "${virtiofsd_mem}" == "0" ]; then
+ echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}"
+ fi
+ shim_mem="$(get_pss_memory ${SHIM_PATH})"
+ if [ "${shim_mem}" == "0" ]; then
+ die "Failed to find PSS for ${SHIM_PATH}"
+ fi
+
+ mem_usage="$(bc -l <<< "scale=2; ${hypervisor_mem} +${virtiofsd_mem} + ${shim_mem}")"
+ memory_usage="${mem_usage}"
+
+ local json="$(cat << EOF
+ {
+ "average": {
+ "Result": ${mem_usage},
+ "Units" : "KB"
+ },
+ "qemus": {
+ "Result": ${hypervisor_mem},
+ "Units" : "KB"
+ },
+ "virtiofsds": {
+ "Result": ${virtiofsd_mem},
+ "Units" : "KB"
+ },
+ "shims": {
+ "Result": ${shim_mem},
+ "Units" : "KB"
+ }
+ }
+EOF
+)"
+ fi
+
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Results"
+
+ clean_env_ctr
+}
+
+function save_config(){
+ metrics_json_start_array
+
+ local json="$(cat << EOF
+ {
+ "containers": "${NUM_CONTAINERS}",
+ "ksm": "${ksm_on}",
+ "auto": "${AUTO_MODE}",
+ "waittime": "${WAIT_TIME}",
+ "image": "${IMAGE}",
+ "command": "${CMD}"
+ }
+EOF
+
+)"
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Config"
+}
+
+function main(){
+ # Verify the number of arguments
+ if [ $# != 2 ] && [ $# != 3 ];then
+ echo >&2 "error: Wrong number of arguments [$@]"
+ help
+ exit 1
+ fi
+
+ #Check for KSM before reporting test name, as it can modify it
+ check_for_ksm
+
+ init_env
+
+ check_cmds "${SMEM_BIN}" bc
+ check_images "${IMAGE}"
+
+ if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then
+ export RUNTIME="kata-runtime"
+ elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then
+ export RUNTIME="runc"
+ else
+ die "Unknown runtime ${CTR_RUNTIME}"
+ fi
+
+ metrics_json_init
+ save_config
+ get_memory_usage
+
+ if [ "$RUNTIME" == "runc" ]; then
+ get_runc_individual_memory
+ elif [ "$RUNTIME" == "kata-runtime" ]; then
+ get_individual_memory
+ fi
+
+ metrics_json_save
+}
+
+main "$@"
diff --git a/tests/metrics/density/memory_usage_inside_container.sh b/tests/metrics/density/memory_usage_inside_container.sh
new file mode 100755
index 000000000..071ded175
--- /dev/null
+++ b/tests/metrics/density/memory_usage_inside_container.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Description of the test:
+# This test launches a busybox container and measures the
+# free, available, and total memory from inside the container
+# by using /proc/meminfo.
+
+set -e
+
+# General env
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+TEST_NAME="memory footprint inside container"
+VERSIONS_FILE="${SCRIPT_PATH}/../../versions.yaml"
+IMAGE='quay.io/prometheus/busybox:latest'
+CMD="sleep 10; cat /proc/meminfo"
+# We specify the size here in 'k', as that then matches the results we get from /proc/meminfo,
+# which makes later direct comparison easier.
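+# The default of 2048*1024 k corresponds to a 2 GiB memory limit.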
+MEMSIZE=${MEMSIZE:-$((2048*1024))}
+
+# this variable determines how many invalid test results (a zero or
+# negative value) are tolerated before the test aborts
+MAX_FAILED_ATTEMPTS=3
+memtotalAvg=0
+units_memtotal=""
+memfreeAvg=0
+units_memfree=""
+memavailableAvg=0
+units_memavailable=""
+
+# count_iters: is the index of the current iteration
+count_iters=0
+
+# valid_result: if value stored is '1' the result is valid, '0' otherwise
+valid_result=0
+
+parse_results() {
+ local raw_results="${1}"
+
+ # Variables used to sum cumulative values in the case of two or more reps,
+ # and to compute the average results for the 'json' output format.
+ local memtotal_acu="${2:-0}"
+ local memfree_acu="${3:-0}"
+ local memavailable_acu="${4:-0}"
+
+ local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}')
+ units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}')
+
+ local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}')
+ units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}')
+
+ local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}')
+ units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}')
+
+ # check results: if any result is zero or negative, it is considered invalid, and the test will be repeated.
+ if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then
+ MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1))
+ valid_result=0
+ info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
+ return 0
+ fi
+
+ memtotalAvg=$((memtotal+memtotal_acu))
+ memfreeAvg=$((memfree+memfree_acu))
+ memavailableAvg=$((memavailable+memavailable_acu))
+ valid_result=1
+ info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
+}
+
+store_results_json() {
+ metrics_json_start_array
+ memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc)
+ memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc)
+ memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc)
+
+ local json="$(cat << EOF
+ {
+ "memrequest": {
+ "Result" : ${MEMSIZE},
+ "Units" : "Kb"
+ },
+ "memtotal": {
+ "Result" : ${memtotalAvg},
+ "Units" : "${units_memtotal}"
+ },
+ "memfree": {
+ "Result" : ${memfreeAvg},
+ "Units" : "${units_memfree}"
+ },
+ "memavailable": {
+ "Result" : ${memavailableAvg},
+ "Units" : "${units_memavailable}"
+ },
+ "repetitions": {
+ "Result" : ${count_iters}
+ }
+ }
+EOF
+)"
+ metrics_json_add_array_element "$json"
+ metrics_json_end_array "Results"
+ metrics_json_save
+}
+
+function main() {
+ # number of iterations to run the test
+ local num_iterations=${1:-1}
+ info "Iterations: $num_iterations"
+
+ # Check tools/commands dependencies
+ cmds=("awk" "ctr")
+ init_env
+ check_cmds "${cmds[@]}"
+ check_images "${IMAGE}"
+ metrics_json_init
+ while [ $count_iters -lt $num_iterations ]; do
+ local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1)
+ parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}"
+
+ # quit if number of attempts exceeds the allowed value.
+ [ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded."
+ [ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1))
+ done
+ store_results_json
+ clean_env_ctr
+}
+
+# Parameters
+# @1: num_iterations {integer}
+main "$@"
diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh
index cdff6c766..5f8bfbf08 100755
--- a/tests/metrics/gha-run.sh
+++ b/tests/metrics/gha-run.sh
@@ -9,24 +9,28 @@ set -o errexit
set -o nounset
set -o pipefail
-kata_tarball_dir=${2:-kata-artifacts}
+kata_tarball_dir="${2:-kata-artifacts}"
metrics_dir="$(dirname "$(readlink -f "$0")")"
source "${metrics_dir}/../common.bash"
+source "${metrics_dir}/lib/common.bash"
-create_symbolic_links() {
- hypervisor="${1:-qemu}"
+declare -r results_dir="${metrics_dir}/results"
+declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics"
+declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker"
+
+function create_symbolic_links() {
local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml"
- local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml"
+ local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml"
- if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then
- die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name."
+ if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then
+ die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name."
fi
sudo ln -sf "${source_configuration_file}" "${link_configuration_file}"
}
# Configures containerd
-overwrite_containerd_config() {
+function overwrite_containerd_config() {
containerd_config="/etc/containerd/config.toml"
sudo rm "${containerd_config}"
sudo tee "${containerd_config}" << EOF
@@ -44,7 +48,7 @@ version = 2
EOF
}
-install_kata() {
+function install_kata() {
local kata_tarball="kata-static.tar.xz"
declare -r katadir="/opt/kata"
declare -r destdir="/"
@@ -53,7 +57,7 @@ install_kata() {
# Removing previous kata installation
sudo rm -rf "${katadir}"
- pushd ${kata_tarball_dir}
+ pushd "${kata_tarball_dir}"
sudo tar -xvf "${kata_tarball}" -C "${destdir}"
popd
@@ -64,17 +68,26 @@ install_kata() {
check_containerd_config_for_kata
restart_containerd_service
+ install_checkmetrics
}
-check_containerd_config_for_kata() {
+function install_checkmetrics() {
+ # Ensure we have the latest checkmetrics
+ pushd "${checkmetrics_dir}"
+ make
+ sudo make install
+ popd
+}
+
+function check_containerd_config_for_kata() {
# check containerd config
declare -r line1="default_runtime_name = \"kata\""
declare -r line2="runtime_type = \"io.containerd.kata.v2\""
declare -r num_lines_containerd=2
declare -r containerd_path="/etc/containerd/config.toml"
- local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path})
+ local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}")
- if [ $count_matches = $num_lines_containerd ]; then
+ if [ "${count_matches}" = "${num_lines_containerd}" ]; then
info "containerd ok"
else
info "overwriting containerd configuration w/ a valid one"
@@ -82,21 +95,62 @@ check_containerd_config_for_kata() {
fi
}
+function check_metrics() {
+ local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml"
+ checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}"
+ cm_result=$?
+ if [ "${cm_result}" != 0 ]; then
+ die "run-metrics-ci: checkmetrics FAILED (${cm_result})"
+ fi
+}
+
+function make_tarball_results() {
+ compress_metrics_results_dir "${metrics_dir}/results" "${GITHUB_WORKSPACE}/results-${KATA_HYPERVISOR}.tar.gz"
+}
+
function run_test_launchtimes() {
- hypervisor="${1}"
+ info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
- info "Running Launch Time test using ${hypervisor} hypervisor"
-
- create_symbolic_links "${hypervisor}"
+ create_symbolic_links
bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
}
+function run_test_memory_usage() {
+ info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor"
+
+ create_symbolic_links
+ bash tests/metrics/density/memory_usage.sh 20 5
+
+ check_metrics
+}
+
+function run_test_memory_usage_inside_container() {
+ info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor"
+
+ # ToDo: remove the exit once the metrics workflow is stable
+ exit 0
+ create_symbolic_links
+ bash tests/metrics/density/memory_usage_inside_container.sh 5
+}
+
+function run_test_blogbench() {
+ info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor"
+
+ # ToDo: remove the exit once the metrics workflow is stable
+ exit 0
+ create_symbolic_links
+ bash tests/metrics/storage/blogbench.sh
+}
+
function main() {
action="${1:-}"
case "${action}" in
install-kata) install_kata ;;
- run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;;
- run-test-launchtimes-clh) run_test_launchtimes "clh" ;;
+ make-tarball-results) make_tarball_results ;;
+ run-test-launchtimes) run_test_launchtimes ;;
+ run-test-memory-usage) run_test_memory_usage ;;
+ run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;;
+ run-test-blogbench) run_test_blogbench ;;
*) >&2 die "Invalid argument" ;;
esac
}
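
For reference, the reworked dispatcher is meant to be driven with one action per workflow step; a local dry run might look like this (exporting KATA_HYPERVISOR and GITHUB_WORKSPACE by hand is an assumption standing in for the workflow environment):

```bash
# Illustrative invocation of gha-run.sh; in CI these variables come from the
# workflow definition rather than being exported manually.
export KATA_HYPERVISOR=qemu
export GITHUB_WORKSPACE=/tmp/workspace

bash tests/metrics/gha-run.sh install-kata
bash tests/metrics/gha-run.sh run-test-launchtimes
bash tests/metrics/gha-run.sh make-tarball-results
```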
diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash
index ad7b2a5c4..0bb31030d 100755
--- a/tests/metrics/lib/common.bash
+++ b/tests/metrics/lib/common.bash
@@ -47,14 +47,14 @@ quay.io/libpod"
#
# cmds=("cmd1" "cmd2")
# check_cmds "${cmds[@]}"
-check_cmds()
+function check_cmds()
{
local cmd req_cmds=( "$@" )
for cmd in "${req_cmds[@]}"; do
if ! command -v "$cmd" > /dev/null 2>&1; then
die "command $cmd not available"
fi
- echo "command: $cmd: yes"
+ info "command: $cmd: yes"
done
}
@@ -68,19 +68,20 @@ check_cmds()
#
# images=("img1" "img2")
# check_images "${images[@]}"
-check_images()
+function check_images()
{
local img req_images=( "$@" )
for img in "${req_images[@]}"; do
- echo "ctr pull'ing: $img"
+ info "ctr pull'ing: $img"
if ! sudo "${CTR_EXE}" image pull "$img"; then
die "Failed to pull image $img"
fi
- echo "ctr pull'd: $img"
+ info "ctr pull'd: $img"
done
}
-generate_build_dockerfile() {
+function generate_build_dockerfile()
+{
local dockerfile="$1"
local image="$2"
local map_key="$3"
@@ -99,14 +100,14 @@ generate_build_dockerfile() {
# This function performs a build on the image names
# passed in, to ensure that we have the latest changes from
# the dockerfiles
-build_dockerfile_image()
+function build_dockerfile_image()
{
local image="$1"
local dockerfile_path="$2"
local dockerfile_dir=${2%/*}
if [ -f "$dockerfile_path" ]; then
- echo "docker building $image"
+ info "docker building $image"
if ! sudo "${DOCKER_EXE}" build --build-arg http_proxy="${http_proxy}" --build-arg https_proxy="${https_proxy}" --label "$image" --tag "${image}" -f "$dockerfile_path" "$dockerfile_dir"; then
die "Failed to docker build image $image"
fi
@@ -119,7 +120,7 @@ build_dockerfile_image()
# This function removes the ctr image, builds a new one using a dockerfile
# and imports the image from docker to ctr
-check_ctr_images()
+function check_ctr_images()
{
local ctr_image="$1"
local dockerfile_path="$2"
@@ -138,7 +139,7 @@ check_ctr_images()
# A one time (per uber test cycle) init that tries to get the
# system to a 'known state' as much as possible
-metrics_onetime_init()
+function metrics_onetime_init()
{
# The onetime init must be called once, and only once
if [ ! -z "$onetime_init_done" ]; then
@@ -155,14 +156,14 @@ metrics_onetime_init()
# Print a banner to the logs noting clearly which test
# we are about to run
-test_banner()
+function test_banner()
{
- echo -e "\n===== starting test [$1] ====="
+ info -e "\n===== starting test [$1] ====="
}
# Initialization/verification environment. This function makes
# minimal steps for metrics/tests execution.
-init_env()
+function init_env()
{
test_banner "${TEST_NAME}"
@@ -183,7 +184,8 @@ init_env()
# This function checks if there are containers or
# shim/proxy/hypervisor processes up, if found, they are
# killed to start test with clean environment.
-kill_processes_before_start() {
+function kill_processes_before_start()
+{
DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q)
[[ -n "${DOCKER_PROCS}" ]] && clean_env
@@ -195,26 +197,29 @@ kill_processes_before_start() {
# Generate a random name - generally used when creating containers, but can
# be used for any other appropriate purpose
-random_name() {
+function random_name()
+{
mktemp -u kata-XXXXXX
}
-show_system_ctr_state() {
- echo "Showing system state:"
- echo " --Check containers--"
+function show_system_ctr_state()
+{
+ info "Showing system state:"
+ info " --Check containers--"
sudo "${CTR_EXE}" c list
- echo " --Check tasks--"
+ info " --Check tasks--"
sudo "${CTR_EXE}" task list
local processes="containerd-shim-kata-v2"
for p in ${processes}; do
- echo " --pgrep ${p}--"
+ info " --pgrep ${p}--"
pgrep -a ${p}
done
}
-common_init(){
+function common_init()
+{
if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then
extract_kata_env
else
@@ -225,17 +230,18 @@ common_init(){
fi
}
-
# Save the current KSM settings so we can restore them later
-save_ksm_settings(){
- echo "saving KSM settings"
+function save_ksm_settings()
+{
+ info "saving KSM settings"
ksm_stored_run=$(cat ${KSM_ENABLE_FILE})
ksm_stored_pages=$(cat ${KSM_PAGES_FILE})
ksm_stored_sleep=$(cat ${KSM_SLEEP_FILE})
}
-set_ksm_aggressive(){
- echo "setting KSM to aggressive mode"
+function set_ksm_aggressive()
+{
+ info "setting KSM to aggressive mode"
# Flip the run off/on to ensure a restart/rescan
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
sudo bash -c "echo ${KSM_AGGRESIVE_PAGES} > ${KSM_PAGES_FILE}"
@@ -245,7 +251,7 @@ set_ksm_aggressive(){
if [ "${KATA_HYPERVISOR}" == "qemu" ]; then
# Disable virtio-fs and save whether it was enabled previously
set_virtio_out=$(sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-9p)
- echo "${set_virtio_out}"
+ info "${set_virtio_out}"
grep -q "already" <<< "${set_virtio_out}" || was_virtio_fs=true;
fi
}
@@ -256,8 +262,9 @@ restore_virtio_fs(){
info "Not restoring virtio-fs since it wasn't enabled previously"
}
-restore_ksm_settings(){
- echo "restoring KSM settings"
+function restore_ksm_settings()
+{
+ info "restoring KSM settings"
# First turn off the run to ensure if we are then re-enabling
# that any changes take effect
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
@@ -267,15 +274,17 @@ restore_ksm_settings(){
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
}
-disable_ksm(){
- echo "disabling KSM"
+function disable_ksm()
+{
+ info "disabling KSM"
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
}
# See if KSM is enabled.
# If so, amend the test name to reflect that
-check_for_ksm(){
+function check_for_ksm()
+{
if [ ! -f ${KSM_ENABLE_FILE} ]; then
return
fi
@@ -294,7 +303,8 @@ check_for_ksm(){
# a full scan has managed to do few new merges)
#
# arg1 - timeout in seconds
-wait_ksm_settle(){
+function wait_ksm_settle()
+{
[[ "$RUNTIME" == "runc" ]] || [[ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]] && return
local t pcnt
local oldscan=-1 newscan
@@ -305,7 +315,7 @@ wait_ksm_settle(){
# Wait some time for KSM to kick in to avoid early dismissal
for ((t=0; t<5; t++)); do
pages=$(cat "${KSM_PAGES_SHARED}")
- [[ "$pages" -ne 0 ]] && echo "Discovered KSM activity" && break
+ [[ "$pages" -ne 0 ]] && info "Discovered KSM activity" && break
sleep 1
done
@@ -315,13 +325,13 @@ wait_ksm_settle(){
newscan=$(cat /sys/kernel/mm/ksm/full_scans)
newpages=$(cat "${KSM_PAGES_SHARED}")
- [[ "$newpages" -eq 0 ]] && echo "No need to wait for KSM to settle" && return
+ [[ "$newpages" -eq 0 ]] && info "No need to wait for KSM to settle" && return
if (( newscan != oldscan )); then
- echo -e "\nnew full_scan ($oldscan to $newscan)"
+ info -e "\nnew full_scan ($oldscan to $newscan)"
# Do we have a previous scan to compare with
- echo "check pages $oldpages to $newpages"
+ info "check pages $oldpages to $newpages"
if (( oldpages != -1 )); then
# avoid divide by zero problems
@@ -330,14 +340,14 @@ wait_ksm_settle(){
# abs()
pcnt=$(( $pcnt * -1 ))
- echo "$oldpages to $newpages is ${pcnt}%"
+ info "$oldpages to $newpages is ${pcnt}%"
if (( $pcnt <= 5 )); then
- echo "KSM stabilised at ${t}s"
+ info "KSM stabilised at ${t}s"
return
fi
else
- echo "$oldpages KSM pages... waiting"
+ info "$oldpages KSM pages... waiting"
fi
fi
oldscan=$newscan
@@ -347,7 +357,7 @@ wait_ksm_settle(){
fi
sleep 1
done
- echo "Timed out after ${1}s waiting for KSM to settle"
+ info "Timed out after ${1}s waiting for KSM to settle"
}
common_init
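
The echo-to-info migration above assumes the logging helpers sourced from the top-level common.bash tag and route messages consistently; a minimal sketch of what such helpers could look like (the real definitions live in tests/common.bash and may differ, which is also why flags like -e should not be passed through to info):

```bash
# Hypothetical minimal logging helpers in the shape this library expects.
info() {
	echo "INFO: $*"
}

warn() {
	echo "WARNING: $*" >&2
}

die() {
	echo "ERROR: $*" >&2
	exit 1
}
```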
diff --git a/tests/metrics/storage/blogbench.sh b/tests/metrics/storage/blogbench.sh
new file mode 100755
index 000000000..19a960103
--- /dev/null
+++ b/tests/metrics/storage/blogbench.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+#
+# Copyright (c) 2018-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Description of the test:
+# This test runs the 'blogbench', and extracts the 'scores' for reads
+# and writes
+# Note - the scores are *not* normalised for the number of iterations run,
+# they are total scores for all iterations (this is the blogbench default output)
+
+set -e
+
+# General env
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+TEST_NAME="blogbench"
+IMAGE="docker.io/library/local-blogbench:latest"
+DOCKERFILE="${SCRIPT_PATH}/blogbench_dockerfile/Dockerfile"
+
+# Number of iterations for blogbench to run. Note: results are not
+# scaled by the iteration count, so more iterations yield larger totals.
+ITERATIONS="${ITERATIONS:-30}"
+
+# Directory to run the test on
+# This is run inside of the container
+TESTDIR="${TESTDIR:-/tmp}"
+CMD="blogbench -i ${ITERATIONS} -d ${TESTDIR}"
+
+function main() {
+ # Check tools/commands dependencies
+ cmds=("awk" "docker")
+
+ init_env
+ check_cmds "${cmds[@]}"
+ check_ctr_images "${IMAGE}" "${DOCKERFILE}"
+ metrics_json_init
+
+	local output=$(sudo -E "${CTR_EXE}" run --rm --runtime="${CTR_RUNTIME}" "${IMAGE}" test ${CMD})
+
+ # Save configuration
+ metrics_json_start_array
+
+ local frequency=$(echo "${output}" | grep "Frequency" | cut -d "=" -f2 | cut -d ' ' -f2)
+ local iterations=$(echo "${output}" | grep -w "iterations" | cut -d ' ' -f3)
+	local spawning_writers=$(echo "${output}" | grep -w "writers" | cut -d ' ' -f2)
+	local spawning_rewriters=$(echo "${output}" | grep -w "rewriters" | cut -d ' ' -f2)
+	local spawning_commenters=$(echo "${output}" | grep -w "commenters" | cut -d ' ' -f2)
+	local spawning_readers=$(echo "${output}" | grep -w "readers" | cut -d ' ' -f2)
+
+ local json="$(cat << EOF
+ {
+ "Frequency" : ${frequency},
+ "Iterations" : ${iterations},
+ "Number of spawing writers" : ${spawing_writers},
+ "Number of spawing rewriters" : ${spawing_rewriters},
+ "Number of spawing commenters" : ${spawing_commenters},
+ "Number of spawing readers" : ${spawing_readers}
+ }
+EOF
+)"
+ metrics_json_add_array_element "${json}"
+ metrics_json_end_array "Config"
+
+ # Save results
+ metrics_json_start_array
+
+ local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}')
+ local reads=$(tail -1 <<< "${output}" | awk '{print $6}')
+
+ # Obtaining other Blogbench results
+ local -r data=$(echo "${output}" | tail -n +12 | head -n -3)
+ local nb_blogs=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $1} ' | tr '\t' ',' | sed '$ s/.$//')
+ local r_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $2} ' | tr '\t' ',' | sed '$ s/.$//')
+ local w_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $3} ' | tr '\t' ',' | sed '$ s/.$//')
+ local r_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $4} ' | tr '\t' ',' | sed '$ s/.$//')
+ local w_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $5} ' | tr '\t' ',' | sed '$ s/.$//')
+ local r_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $6} ' | tr '\t' ',' | sed '$ s/.$//')
+ local w_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $7} ' | tr '\t' ',' | sed '$ s/.$//')
+
+ local json="$(cat << EOF
+ {
+ "write": {
+ "Result" : "${writes}",
+ "Units" : "items"
+ },
+ "read": {
+ "Result" : "${reads}",
+ "Units" : "items"
+ },
+ "Nb blogs": {
+ "Result" : "${nb_blogs}"
+ },
+ "R articles": {
+ "Result" : "${r_articles}"
+ },
+ "W articles": {
+ "Result" : "${w_articles}"
+ },
+ "R pictures": {
+ "Result" : "${r_pictures}"
+ },
+ "W pictures": {
+ "Result" : "${w_pictures}"
+ },
+ "R comments": {
+ "Result" : "${r_comments}"
+ },
+ "W comments": {
+ "Result" : "${w_comments}"
+ }
+ }
+EOF
+)"
+
+ metrics_json_add_array_element "${json}"
+ metrics_json_end_array "Results"
+ metrics_json_save
+ clean_env_ctr
+}
+
+main "$@"
diff --git a/tests/metrics/storage/blogbench_dockerfile/Dockerfile b/tests/metrics/storage/blogbench_dockerfile/Dockerfile
new file mode 100644
index 000000000..593063798
--- /dev/null
+++ b/tests/metrics/storage/blogbench_dockerfile/Dockerfile
@@ -0,0 +1,32 @@
+# Copyright (c) 2018-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Set up an Ubuntu image with 'blogbench' installed
+
+# Usage: FROM [image name]
+# hadolint ignore=DL3007
+FROM docker.io/library/ubuntu:latest
+
+# Version of the Dockerfile
+LABEL DOCKERFILE_VERSION="1.0"
+
+# URL for blogbench test and blogbench version
+ENV BLOGBENCH_URL "https://download.pureftpd.org/pub/blogbench"
+ENV BLOGBENCH_VERSION 1.1
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends build-essential curl && \
+ apt-get remove -y unattended-upgrades && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/ && \
+ curl -OkL "${BLOGBENCH_URL}/blogbench-${BLOGBENCH_VERSION}.tar.gz" && \
+ tar xzf "blogbench-${BLOGBENCH_VERSION}.tar.gz" -C /
+WORKDIR "/blogbench-${BLOGBENCH_VERSION}"
+RUN arch="$(uname -m)" && \
+ export arch && \
+ ./configure --build="${arch}" && \
+ make && \
+ make install-strip
+
+CMD ["/bin/bash"]
diff --git a/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in b/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in
new file mode 100755
index 000000000..8190d8560
--- /dev/null
+++ b/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in
@@ -0,0 +1,28 @@
+# Copyright (c) 2020-2021 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Set up an Ubuntu image with 'web tooling' installed
+
+# Usage: FROM [image name]
+# hadolint ignore=DL3007
+FROM @UBUNTU_REGISTRY@/ubuntu:latest
+
+# Version of the Dockerfile
+LABEL DOCKERFILE_VERSION="1.0"
+
+# URL for web tooling test
+ENV WEB_TOOLING_URL "https://github.com/v8/web-tooling-benchmark"
+ENV NODEJS_VERSION "setup_14.x"
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends build-essential git curl sudo && \
+ apt-get remove -y unattended-upgrades && \
+ curl -OkL https://deb.nodesource.com/${NODEJS_VERSION} && chmod +x ${NODEJS_VERSION} && ./${NODEJS_VERSION} && \
+ apt-get install -y --no-install-recommends nodejs && \
+ apt-get clean && rm -rf /var/lib/apt/lists && \
+ git clone ${WEB_TOOLING_URL} /web-tooling-benchmark
+WORKDIR /web-tooling-benchmark/
+RUN npm install --unsafe-perm
+
+CMD ["/bin/bash"]
diff --git a/tests/metrics/storage/webtooling.sh b/tests/metrics/storage/webtooling.sh
new file mode 100755
index 000000000..f82849618
--- /dev/null
+++ b/tests/metrics/storage/webtooling.sh
@@ -0,0 +1,283 @@
+#!/bin/bash
+#
+# Copyright (c) 2020-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Description of the test:
+# This test runs the 'web tooling benchmark'
+# https://github.com/v8/web-tooling-benchmark
+
+set -o pipefail
+
+# General env
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+NUM_CONTAINERS="${1:-}"
+
+IMAGE="docker.io/library/local-web-tooling:latest"
+DOCKERFILE="${SCRIPT_PATH}/web-tooling-dockerfile/Dockerfile"
+
+# Directory to run the test inside of the container
+TESTDIR="${TESTDIR:-/testdir}"
+file_path="/web-tooling-benchmark"
+file_name="output"
+
+# Directory where the webtooling results are stored
+TMP_DIR=$(mktemp --tmpdir -d webtool.XXXXXXXXXX)
+
+# Options to control the start of the workload using a trigger-file
+dst_dir="/host"
+src_dir=$(mktemp --tmpdir -d webtool.XXXXXXXXXX)
+trigger_file="$RANDOM.txt"
+guest_trigger_file="$dst_dir/$trigger_file"
+host_trigger_file="$src_dir/$trigger_file"
+start_script="webtooling_start.sh"
+
+# CMD points to the script that starts the workload
+CMD="$dst_dir/$start_script"
+MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro"
+PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
+
+# This timeout is related with the amount of time that
+# webtool benchmark needs to run inside the container
+timeout=600
+INITIAL_NUM_PIDS=1
+
+cpu_period="100000"
+cpu_quota="200000"
+
+function remove_tmp_dir() {
+ rm -rf "$TMP_DIR"
+ rm -rf "$src_dir"
+}
+
+trap remove_tmp_dir EXIT
+
+# Show help about this script
+function help(){
+cat << EOF
+Usage: $0 <count>
+	Description:
+		<count>: Number of containers to run.
+EOF
+}
+
+# script used to launch the workload
+function create_start_script() {
+ local script="${src_dir/$start_script}"
+ rm -rf "${script}"
+
+cat << EOF > "${script}"
+#!/bin/bash
+mkdir -p "${TESTDIR}"
+
+until [ -f ${guest_trigger_file} ]; do
+ sleep 1
+done
+pushd "${file_path}"
+node dist/cli.js > "${file_name}"
+EOF
+ chmod +x "${script}"
+}
+
+function verify_task_is_completed_on_all_containers() {
+ local containers=( $(sudo -E "${CTR_EXE}" c list -q) )
+ local sleep_secs=10
+ local max=$(bc <<<"${timeout} / ${sleep_secs}")
+ local wip_list=()
+ local count=1
+ local sum=0
+ local i=""
+
+ while (( ${sum} < ${NUM_CONTAINERS} )); do
+
+ for i in "${containers[@]}"; do
+ # Only check containers that have not completed the workload at this step
+			num_pids=$(sudo -E "${CTR_EXE}" t metrics "${i}" | grep pids.current | xargs | cut -d ' ' -f 2)
+
+ if [ "${num_pids}" -lt "${INITIAL_NUM_PIDS}" ]; then
+ ((sum++))
+ else
+ wip_list+=("${i}")
+ fi
+ done
+
+ # hold the list of containers that are still running the workload
+ containers=(${wip_list[*]})
+ wip_list=()
+
+ info "loop ${count} of ${max}: sleeping for ${sleep_secs} seconds"
+ sleep "${sleep_secs}"
+ ((count++))
+ done
+}
+
+function check_containers_are_up() {
+ info "Verify that the containers are running"
+ local containers_launched=0
+ while (( $containers_launched < ${NUM_CONTAINERS} )); do
+ containers_launched="$(sudo -E ${CTR_EXE} t list | grep -c "RUNNING")"
+ sleep 1
+ done
+}
+
+function save_config() {
+ metrics_json_start_array
+
+ local json="$(cat << EOF
+ {
+ "containers": "${NUM_CONTAINERS}",
+ "image": "${IMAGE}",
+ "units": "runs/s"
+ }
+EOF
+)"
+ metrics_json_add_array_element "${json}"
+ metrics_json_end_array "Config"
+}
+
+function main() {
+ # Verify enough arguments
+ if [ $# != 1 ]; then
+ echo >&2 "error: Not enough arguments [$@]"
+ help
+ exit 1
+ fi
+
+ local i=0
+ local containers=()
+ local cmds=("docker")
+ local not_started_count=$NUM_CONTAINERS
+
+ restart_containerd_service
+ # Check tools/commands dependencies
+ init_env
+ check_cmds "${cmds[@]}"
+ check_ctr_images "$IMAGE" "$DOCKERFILE"
+ metrics_json_init
+ save_config
+ create_start_script
+ rm -rf "${host_trigger_file}"
+
+ info "Creating ${NUM_CONTAINERS} containers"
+
+ for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do
+ containers+=($(random_name))
+ # Web tool benchmark needs 2 cpus to run completely in its cpu utilization
+ sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --cpu-quota "${cpu_quota}" --cpu-period "${cpu_period}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
+ ((not_started_count--))
+ info "${not_started_count} remaining containers"
+ done
+
+ # Check that the requested number of containers are running
+ local timeout_launch="10"
+ check_containers_are_up & pid=$!
+ (sleep "${timeout_launch}" && kill -HUP ${pid}) 2>/dev/null & pid_tout=$!
+
+ if wait $pid 2>/dev/null; then
+ pkill -HUP -P "${pid_tout}"
+ wait "${pid_tout}"
+ else
+ warn "Time out exceeded"
+ return 1
+ fi
+
+ # Get the initial number of pids in a single container before the workload starts
+	INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | xargs | cut -d ' ' -f 2)
+ ((INITIAL_NUM_PIDS++))
+
+ # Launch webtooling benchmark
+ local pids=()
+ local j=0
+ for i in "${containers[@]}"; do
+		sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD}" &
+ pids[${j}]=$!
+ ((j++))
+ done
+
+ # wait for all pids
+ for pid in ${pids[*]}; do
+ wait "${pid}"
+ done
+
+ touch "${host_trigger_file}"
+ info "All containers are running the workload..."
+
+ # Verify that all containers have completed the assigned task
+ verify_task_is_completed_on_all_containers & pid=$!
+ (sleep "$timeout" && kill -HUP $pid) 2>/dev/null & pid_tout=$!
+ if wait ${pid} 2>/dev/null; then
+ pkill -HUP -P "${pid_tout}"
+ wait "${pid_tout}"
+ else
+ warn "Time out exceeded"
+ return 1
+ fi
+
+ RESULTS_CMD="cat ${file_path}/${file_name}"
+ for i in "${containers[@]}"; do
+ sudo -E "${CTR_EXE}" t exec --exec-id "${RANDOM}" "${i}" sh -c "${RESULTS_CMD}" >> "${TMP_DIR}/results"
+ done
+
+	# Save results
+ metrics_json_start_array
+
+ local output=$(cat "${TMP_DIR}/results")
+ local cut_results="cut -d':' -f2 | sed -e 's/^[ \t]*//'| cut -d ' ' -f1 | tr '\n' ',' | sed 's/.$//'"
+
+ local acorn=$(echo "${output}" | grep -w "acorn" | eval "${cut_results}")
+ local babel=$(echo "${output}" | grep -w "babel" | sed '/babel-minify/d' | eval "${cut_results}")
+ local babel_minify=$(echo "${output}" | grep -w "babel-minify" | eval "${cut_results}")
+ local babylon=$(echo "${output}" | grep -w "babylon" | eval "${cut_results}")
+ local buble=$(echo "${output}" | grep -w "buble" | eval "${cut_results}")
+ local chai=$(echo "${output}" | grep -w "chai" | eval "${cut_results}")
+ local coffeescript=$(echo "${output}" | grep -w "coffeescript" | eval "${cut_results}")
+ local espree=$(echo "${output}" | grep -w "espree" | eval "${cut_results}")
+ local esprima=$(echo "${output}" | grep -w "esprima" | eval "${cut_results}")
+ local jshint=$(echo "${output}" | grep -w "jshint" | eval "${cut_results}")
+ local lebab=$(echo "${output}" | grep -w "lebab" | eval "${cut_results}")
+ local postcss=$(echo "${output}" | grep -w "postcss" | eval "${cut_results}")
+ local prepack=$(echo "${output}" | grep -w "prepack" | eval "${cut_results}")
+ local prettier=$(echo "${output}" | grep -w "prettier" | eval "${cut_results}")
+ local source_map=$(echo "${output}" | grep -w "source-map" | eval "${cut_results}")
+ local terser=$(echo "${output}" | grep -w "terser" | eval "${cut_results}")
+ local typescript=$(echo "${output}" | grep -w "typescript" | eval "${cut_results}")
+ local uglify_js=$(echo "${output}" | grep -w "uglify-js" | eval "${cut_results}")
+ local geometric_mean=$(echo "${output}" | grep -w "Geometric" | eval "${cut_results}")
+ local average_tps=$(echo "${geometric_mean}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
+ local tps=$(echo "${average_tps}*${NUM_CONTAINERS}" | bc -l)
+
+ local json="$(cat << EOF
+ {
+ "Acorn" : "${acorn}",
+ "Babel" : "${babel}",
+ "Babel minify" : "${babel_minify}",
+ "Babylon" : "${babylon}",
+ "Buble" : "${buble}",
+ "Chai" : "${chai}",
+ "Coffeescript" : "${coffeescript}",
+ "Espree" : "${espree}",
+ "Esprima" : "${esprima}",
+ "Jshint" : "${jshint}",
+ "Lebab" : "${lebab}",
+ "Postcss" : "${postcss}",
+ "Prepack" : "${prepack}",
+ "Prettier" : "${prettier}",
+ "Source map" : "${source_map}",
+ "Terser" : "${terser}",
+ "Typescript" : "${typescript}",
+ "Uglify js" : "${uglify_js}",
+ "Geometric mean" : "${geometric_mean}",
+ "Average TPS" : "${average_tps}",
+ "TPS" : "${tps}"
+ }
+EOF
+)"
+ metrics_json_add_array_element "${json}"
+ metrics_json_end_array "Results"
+ metrics_json_save
+ clean_env_ctr
+}
+
+main "$@"
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh
index 606ce8435..5ceae83bd 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh
@@ -71,6 +71,7 @@ docker run \
--env TDSHIM_CONTAINER_BUILDER="${TDSHIM_CONTAINER_BUILDER:-}" \
--env VIRTIOFSD_CONTAINER_BUILDER="${VIRTIOFSD_CONTAINER_BUILDER:-}" \
--env MEASURED_ROOTFS="${MEASURED_ROOTFS:-}" \
+ --env USE_CACHE="${USE_CACHE:-}" \
--rm \
-w ${script_dir} \
build-kata-deploy "${kata_deploy_create}" $@
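
With USE_CACHE now forwarded into the build container, a cache-free build can be forced from the host; for example (the target shown is illustrative):

```bash
# Force a from-source build by bypassing the Jenkins tarball cache.
USE_CACHE=no \
	./tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh \
	--build=cloud-hypervisor-glibc
```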
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
index 47dc2dbfe..26d64fa07 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
@@ -48,6 +48,7 @@ readonly cached_artifacts_path="lastSuccessfulBuild/artifact/artifacts"
ARCH=$(uname -m)
MEASURED_ROOTFS=${MEASURED_ROOTFS:-no}
+USE_CACHE="${USE_CACHE:-"yes"}"
workdir="${WORKDIR:-$PWD}"
@@ -87,6 +88,7 @@ options:
--build= :
all
cloud-hypervisor
+ cloud-hypervisor-glibc
firecracker
kernel
kernel-dragonball-experimental
@@ -135,7 +137,11 @@ cleanup_and_fail() {
return 1
}
-install_cached_component() {
+install_cached_tarball_component() {
+ if [ "${USE_CACHE}" != "yes" ]; then
+ return 1
+ fi
+
local component="${1}"
local jenkins_build_url="${2}"
local current_version="${3}"
@@ -214,7 +220,7 @@ install_cached_cc_shim_v2() {
wget "${jenkins_build_url}/root_hash_tdx.txt" -O "shim_v2_root_hash_tdx.txt" || return 1
diff "${root_hash_tdx}" "shim_v2_root_hash_tdx.txt" > /dev/null || return 1
- install_cached_component \
+ install_cached_tarball_component \
"${component}" \
"${jenkins_build_url}" \
"${current_version}" \
@@ -227,7 +233,7 @@ install_cached_cc_shim_v2() {
# Install static CC cloud-hypervisor asset
install_cc_clh() {
- install_cached_component \
+ install_cached_tarball_component \
"cloud-hypervisor" \
"${jenkins_url}/job/kata-containers-2.0-clh-cc-$(uname -m)/${cached_artifacts_path}" \
"$(get_from_kata_deps "assets.hypervisor.cloud_hypervisor.version")" \
@@ -287,7 +293,7 @@ install_cc_image() {
local pause_version="$(get_from_kata_deps "externals.pause.version")"
local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")"
- install_cached_component \
+ install_cached_tarball_component \
"${component}" \
"${jenkins}" \
"${osbuilder_last_commit}-${guest_image_last_commit}-${initramfs_last_commit}-${agent_last_commit}-${libs_last_commit}-${attestation_agent_version}-${gperf_version}-${libseccomp_version}-${pause_version}-${rust_version}-${image_type}-${AA_KBC}" \
@@ -333,7 +339,7 @@ install_cc_kernel() {
local kernel_kata_config_version="$(cat ${repo_root_dir}/tools/packaging/kernel/kata_config_version)"
- install_cached_component \
+ install_cached_tarball_component \
"kernel" \
"${jenkins_url}/job/kata-containers-2.0-kernel-cc-$(uname -m)/${cached_artifacts_path}" \
"${kernel_version}-${kernel_kata_config_version}" \
@@ -355,7 +361,7 @@ install_cc_qemu() {
export qemu_repo="$(yq r $versions_yaml assets.hypervisor.qemu.url)"
export qemu_version="$(yq r $versions_yaml assets.hypervisor.qemu.version)"
- install_cached_component \
+ install_cached_tarball_component \
"QEMU" \
"${jenkins_url}/job/kata-containers-2.0-qemu-cc-$(uname -m)/${cached_artifacts_path}" \
"${qemu_version}-$(calc_qemu_files_sha256sum)" \
@@ -413,7 +419,7 @@ install_cc_shimv2() {
# Install static CC virtiofsd asset
install_cc_virtiofsd() {
local virtiofsd_version="$(get_from_kata_deps "externals.virtiofsd.version")-$(get_from_kata_deps "externals.virtiofsd.toolchain")"
- install_cached_component \
+ install_cached_tarball_component \
"virtiofsd" \
"${jenkins_url}/job/kata-containers-2.0-virtiofsd-cc-$(uname -m)/${cached_artifacts_path}" \
"${virtiofsd_version}" \
@@ -437,7 +443,7 @@ install_cached_kernel_component() {
local kernel_kata_config_version="$(cat ${repo_root_dir}/tools/packaging/kernel/kata_config_version)"
- install_cached_component \
+ install_cached_tarball_component \
"kernel" \
"${jenkins_url}/job/kata-containers-2.0-kernel-${tee}-cc-$(uname -m)/${cached_artifacts_path}" \
"${kernel_version}-${kernel_kata_config_version}" \
@@ -449,7 +455,7 @@ install_cached_kernel_component() {
[ "${tee}" == "tdx" ] && return 0
# SEV specific code path
- install_cached_component \
+ install_cached_tarball_component \
"kernel-modules" \
"${jenkins_url}/job/kata-containers-2.0-kernel-sev-cc-$(uname -m)/${cached_artifacts_path}" \
"${kernel_version}" \
@@ -505,7 +511,7 @@ install_cc_tee_qemu() {
export qemu_version="$(yq r $versions_yaml assets.hypervisor.qemu.${tee}.tag)"
export tee="${tee}"
- install_cached_component \
+ install_cached_tarball_component \
"QEMU ${tee}" \
"${jenkins_url}/job/kata-containers-2.0-qemu-${tee}-cc-$(uname -m)/${cached_artifacts_path}" \
"${qemu_version}-$(calc_qemu_files_sha256sum)" \
@@ -523,7 +529,7 @@ install_cc_tdx_qemu() {
}
install_cc_tdx_td_shim() {
- install_cached_component \
+ install_cached_tarball_component \
"td-shim" \
"${jenkins_url}/job/kata-containers-2.0-td-shim-cc-$(uname -m)/${cached_artifacts_path}" \
"$(get_from_kata_deps "externals.td-shim.version")-$(get_from_kata_deps "externals.td-shim.toolchain")" \
@@ -543,7 +549,7 @@ install_cc_tee_ovmf() {
local component_name="ovmf"
local component_version="$(get_from_kata_deps "externals.ovmf.${tee}.version")"
[ "${tee}" == "tdx" ] && component_name="tdvf"
- install_cached_component \
+ install_cached_tarball_component \
"${component_name}" \
"${jenkins_url}/job/kata-containers-2.0-${component_name}-cc-$(uname -m)/${cached_artifacts_path}" \
"${component_version}" \
@@ -583,7 +589,7 @@ install_image() {
local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")"
local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")"
- install_cached_component \
+ install_cached_tarball_component \
"${component}" \
"${jenkins}" \
"${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-image" \
@@ -616,7 +622,7 @@ install_initrd() {
local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")"
local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")"
- install_cached_component \
+ install_cached_tarball_component \
"${component}" \
"${jenkins}" \
"${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${initrd_type}" \
@@ -728,12 +734,12 @@ install_kernel_nvidia_gpu() {
#Install GPU and SNP enabled kernel asset
install_kernel_nvidia_gpu_snp() {
- local kernel_url="$(get_from_kata_deps assets.kernel.snp.url)"
+ local kernel_url="$(get_from_kata_deps assets.kernel.sev.url)"
install_kernel_helper \
- "assets.kernel.snp.version" \
+ "assets.kernel.sev.version" \
"kernel-nvidia-gpu-snp" \
- "-x snp -g nvidia -u ${kernel_url} -H deb"
+ "-x sev -g nvidia -u ${kernel_url} -H deb"
}
#Install GPU and TDX experimental enabled kernel asset
@@ -854,31 +860,52 @@ install_firecracker() {
sudo install -D --owner root --group root --mode 0744 release-${firecracker_version}-${ARCH}/jailer-${firecracker_version}-${ARCH} "${destdir}/opt/kata/bin/jailer"
}
-# Install static cloud-hypervisor asset
-install_clh() {
- install_cached_component \
- "cloud-hypervisor" \
- "${jenkins_url}/job/kata-containers-main-clh-$(uname -m)/${cached_artifacts_path}" \
+install_clh_helper() {
+ libc="${1}"
+ features="${2}"
+ suffix="${3:-""}"
+
+ install_cached_tarball_component \
+ "cloud-hypervisor${suffix}" \
+ "${jenkins_url}/job/kata-containers-main-clh-$(uname -m)${suffix}/${cached_artifacts_path}" \
"$(get_from_kata_deps "assets.hypervisor.cloud_hypervisor.version")" \
"" \
"${final_tarball_name}" \
"${final_tarball_path}" \
&& return 0
- if [[ "${ARCH}" == "x86_64" ]]; then
- export features="tdx"
- fi
-
info "build static cloud-hypervisor"
- "${clh_builder}"
+ libc="${libc}" features="${features}" "${clh_builder}"
info "Install static cloud-hypervisor"
mkdir -p "${destdir}/opt/kata/bin/"
- sudo install -D --owner root --group root --mode 0744 cloud-hypervisor/cloud-hypervisor "${destdir}/opt/kata/bin/cloud-hypervisor"
+ sudo install -D --owner root --group root --mode 0744 cloud-hypervisor/cloud-hypervisor "${destdir}/opt/kata/bin/cloud-hypervisor${suffix}"
+}
+
+# Install static cloud-hypervisor asset
+install_clh() {
+ if [[ "${ARCH}" == "x86_64" ]]; then
+ features="mshv,tdx"
+ else
+ features=""
+ fi
+
+ install_clh_helper "musl" "${features}"
+}
+
+# Install static cloud-hypervisor-glibc asset
+install_clh_glibc() {
+ if [[ "${ARCH}" == "x86_64" ]]; then
+ features="mshv"
+ else
+ features=""
+ fi
+
+ install_clh_helper "gnu" "${features}" "-glibc"
}
# Install static virtiofsd asset
install_virtiofsd() {
- install_cached_component \
+ install_cached_tarball_component \
"virtiofsd" \
"${jenkins_url}/job/kata-containers-main-virtiofsd-$(uname -m)/${cached_artifacts_path}" \
"$(get_from_kata_deps "externals.virtiofsd.version")-$(get_from_kata_deps "externals.virtiofsd.toolchain")" \
@@ -925,7 +952,7 @@ install_shimv2() {
local RUST_VERSION="$(get_from_kata_deps "languages.rust.meta.newest-version")"
local shim_v2_version="${shim_v2_last_commit}-${protocols_last_commit}-${runtime_rs_last_commit}-${GO_VERSION}-${RUST_VERSION}"
- install_cached_component \
+ install_cached_tarball_component \
"shim-v2" \
"${jenkins_url}/job/kata-containers-main-shim-v2-$(uname -m)/${cached_artifacts_path}" \
"${shim_v2_version}" \
@@ -1065,7 +1092,7 @@ handle_build() {
cloud-hypervisor) install_clh ;;
- cloud-hypervisor-glibc) ;;
+ cloud-hypervisor-glibc) install_clh_glibc ;;
firecracker) install_firecracker ;;
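
After the rename, all of the install_* helpers follow the same cache-or-build shape: try install_cached_tarball_component first and fall through to a source build on any failure. A stripped-down sketch of that pattern (the function bodies here are illustrative, with the download and verification logic elided):

```bash
# Cache-or-build pattern used by the install_* helpers above.
install_cached_tarball_component() {
	[ "${USE_CACHE:-yes}" == "yes" ] || return 1
	# ...fetch the cached tarball and verify its version, or return 1...
	return 1
}

install_example_component() {
	install_cached_tarball_component "example-component" && return 0
	echo "cache miss: building example-component from source"
}
```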
diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh
index 6bb660198..09d27cc65 100644
--- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh
+++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh
@@ -64,6 +64,15 @@ function install_artifacts() {
chmod +x /opt/kata/bin/*
[ -d /opt/kata/runtime-rs/bin ] && \
chmod +x /opt/kata/runtime-rs/bin/*
+
+ # Allow Mariner to use custom configuration.
+ if [ "${HOST_OS:-}" == "cbl-mariner" ]; then
+ config_path="/opt/kata/share/defaults/kata-containers/configuration-clh.toml"
+ clh_path="/opt/kata/bin/cloud-hypervisor-glibc"
+ sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd", "kernel"]|' "${config_path}"
+ sed -i -E "s|(valid_hypervisor_paths) = .+|\1 = [\"${clh_path}\"]|" "${config_path}"
+ sed -i -E "s|(path) = \".+/cloud-hypervisor\"|\1 = \"${clh_path}\"|" "${config_path}"
+ fi
}
function wait_till_node_is_ready() {
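
To make the Mariner-specific edits concrete, here is their effect on a minimal configuration snippet (the input values are made up; the sed expressions are the ones from install_artifacts above):

```bash
# Demonstrate the three sed edits on a throwaway copy of the CLH config.
config=$(mktemp)
cat > "${config}" <<EOF
path = "/opt/kata/bin/cloud-hypervisor"
valid_hypervisor_paths = ["/opt/kata/bin/cloud-hypervisor"]
enable_annotations = ["enable_iommu"]
EOF

clh_path="/opt/kata/bin/cloud-hypervisor-glibc"
sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd", "kernel"]|' "${config}"
sed -i -E "s|(valid_hypervisor_paths) = .+|\1 = [\"${clh_path}\"]|" "${config}"
sed -i -E "s|(path) = \".+/cloud-hypervisor\"|\1 = \"${clh_path}\"|" "${config}"
cat "${config}"
```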
diff --git a/tools/packaging/kernel/build-kernel.sh b/tools/packaging/kernel/build-kernel.sh
index fef85503a..35c54741f 100755
--- a/tools/packaging/kernel/build-kernel.sh
+++ b/tools/packaging/kernel/build-kernel.sh
@@ -243,25 +243,21 @@ get_kernel_frag_path() {
if [[ "${gpu_vendor}" != "" ]];then
info "Add kernel config for GPU due to '-g ${gpu_vendor}'"
- local gpu_configs="$(ls ${gpu_path}/${gpu_vendor}.conf)"
- all_configs="${all_configs} ${gpu_configs}"
# If conf_guest is set we need to update the CONFIG_LOCALVERSION
# to match the suffix created in install_kata
# -nvidia-gpu-{snp|tdx}, the linux headers will be named the very
# same if build with make deb-pkg for TDX or SNP.
+ local gpu_configs=$(mktemp).conf
+ local gpu_subst_configs="${gpu_path}/${gpu_vendor}.${arch_target}.conf.in"
if [[ "${conf_guest}" != "" ]];then
- local gpu_cc_configs=$(mktemp).conf
- local gpu_subst_configs="$(ls ${gpu_path}/${gpu_vendor}.conf.in)"
-
export CONF_GUEST_SUFFIX="-${conf_guest}"
- envsubst <${gpu_subst_configs} >${gpu_cc_configs}
- unset CONF_GUEST_SUFFIX
-
- all_configs="${all_configs} ${gpu_cc_configs}"
else
- local gpu_configs="$(ls ${gpu_path}/${gpu_vendor}.conf)"
- all_configs="${all_configs} ${gpu_configs}"
+ export CONF_GUEST_SUFFIX=""
fi
+ envsubst <${gpu_subst_configs} >${gpu_configs}
+ unset CONF_GUEST_SUFFIX
+
+ all_configs="${all_configs} ${gpu_configs}"
fi
if [ "${MEASURED_ROOTFS}" == "yes" ]; then
diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in
new file mode 100644
index 000000000..8cb9cf511
--- /dev/null
+++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in
@@ -0,0 +1,29 @@
+# Support for loading modules.
+# It is used to support loading GPU drivers.
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+
+# CRYPTO_FIPS requires this config when loading modules is enabled.
+CONFIG_MODULE_SIG=y
+
+# Linux kernel version suffix
+CONFIG_LOCALVERSION="-nvidia-gpu${CONF_GUEST_SUFFIX}"
+
+# Newer NVIDIA drivers need additional symbols
+CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y
+CONFIG_MEMORY_FAILURE=y
+
+# VFIO/IOMMU settings
+CONFIG_MMU_NOTIFIER=y
+CONFIG_IOASID=y
+CONFIG_IOMMU_IO_PGTABLE=y
+CONFIG_IOMMU_IO_PGTABLE_LPAE=y
+CONFIG_IOMMU_SVA=y
+CONFIG_ARM_SMMU_V3=y
+CONFIG_ARM_SMMU_V3_SVA=y
+
+# CC related configs
+CONFIG_CRYPTO_ECC=y
+CONFIG_CRYPTO_ECDH=y
+CONFIG_CRYPTO_ECDSA=y
diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf b/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf
deleted file mode 100644
index 883c0f3af..000000000
--- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf
+++ /dev/null
@@ -1,14 +0,0 @@
-# Support mmconfig PCI config space access.
-# It's used to enable the MMIO access method for PCIe devices.
-CONFIG_PCI_MMCONFIG=y
-
-# Support for loading modules.
-# It is used to support loading GPU drivers.
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-
-# CRYPTO_FIPS requires this config when loading modules is enabled.
-CONFIG_MODULE_SIG=y
-
-# Linux kernel version suffix
-CONFIG_LOCALVERSION="-nvidia-gpu"
diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in
similarity index 62%
rename from tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in
rename to tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in
index 73cce6173..6ef830aab 100644
--- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in
+++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in
@@ -12,3 +12,14 @@ CONFIG_MODULE_SIG=y
# Linux kernel version suffix
CONFIG_LOCALVERSION="-nvidia-gpu${CONF_GUEST_SUFFIX}"
+
+# Newer NVIDIA drivers need additional symbols
+CONFIG_X86_MCE=y
+CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y
+CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y
+CONFIG_MEMORY_FAILURE=y
+
+# CC related configs
+CONFIG_CRYPTO_ECC=y
+CONFIG_CRYPTO_ECDH=y
+CONFIG_CRYPTO_ECDSA=y
diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version
index d5e55fec1..8cccbbf4a 100644
--- a/tools/packaging/kernel/kata_config_version
+++ b/tools/packaging/kernel/kata_config_version
@@ -1 +1 @@
-109cc
+109cc+
diff --git a/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh b/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
index 975a517a1..f381897bc 100755
--- a/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
+++ b/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
@@ -76,12 +76,12 @@ build_clh_from_source() {
if [ -n "${features}" ]; then
info "Build cloud-hypervisor enabling the following features: ${features}"
- ./scripts/dev_cli.sh build --release --libc musl --features "${features}"
+ ./scripts/dev_cli.sh build --release --libc "${libc}" --features "${features}"
else
- ./scripts/dev_cli.sh build --release --libc musl
+ ./scripts/dev_cli.sh build --release --libc "${libc}"
fi
rm -f cloud-hypervisor
- cp build/cargo_target/$(uname -m)-unknown-linux-musl/release/cloud-hypervisor .
+ cp build/cargo_target/$(uname -m)-unknown-linux-${libc}/release/cloud-hypervisor .
popd
}
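
Since build_clh_from_source now reads libc from its environment, callers choose the toolchain per invocation, as install_clh_helper does; a standalone glibc build would look roughly like this (a sketch, assuming the script takes no further arguments):

```bash
# Illustrative standalone invocation; install_clh_helper sets these same
# variables when building the -glibc asset.
libc="gnu" features="mshv" \
	./tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
```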
diff --git a/tools/packaging/static-build/ovmf/build.sh b/tools/packaging/static-build/ovmf/build.sh
index 16e4292b8..d9eef8781 100755
--- a/tools/packaging/static-build/ovmf/build.sh
+++ b/tools/packaging/static-build/ovmf/build.sh
@@ -37,17 +37,17 @@ fi
[ -n "$ovmf_repo" ] || die "failed to get ovmf repo"
if [ "${ovmf_build}" == "x86_64" ]; then
- [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.x86_64.version")
- [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.x86_64.package")
- [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.x86_64.package_output_dir")
+ [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.x86_64.version")
+ [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.x86_64.package")
+ [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.x86_64.package_output_dir")
elif [ "${ovmf_build}" == "sev" ]; then
- [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.sev.version")
- [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.sev.package")
- [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.sev.package_output_dir")
+ [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.sev.version")
+ [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.sev.package")
+ [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.sev.package_output_dir")
elif [ "${ovmf_build}" == "tdx" ]; then
- [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.tdx.version")
- [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.tdx.package")
- [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.tdx.package_output_dir")
+ [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.tdx.version")
+ [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.tdx.package")
+ [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.tdx.package_output_dir")
fi
[ -n "$ovmf_version" ] || die "failed to get ovmf version or commit"
@@ -56,8 +56,8 @@ fi
sudo docker pull ${container_image} || \
(sudo docker build -t "${container_image}" "${script_dir}" && \
- # No-op unless PUSH_TO_REGISTRY is exported as "yes"
- push_to_registry "${container_image}")
+ # No-op unless PUSH_TO_REGISTRY is exported as "yes"
+ push_to_registry "${container_image}")
sudo docker run --rm -i -v "${repo_root_dir}:${repo_root_dir}" \
-w "${PWD}" \