diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml index b5c7584fe..f0f606850 100644 --- a/.github/workflows/build-kata-static-tarball-amd64.yaml +++ b/.github/workflows/build-kata-static-tarball-amd64.yaml @@ -13,14 +13,15 @@ on: required: false type: string default: no + commit-hash: + required: false + type: string jobs: build-asset: runs-on: ubuntu-latest strategy: matrix: - stage: - - ${{ inputs.stage }} asset: - cloud-hypervisor - cloud-hypervisor-glibc @@ -46,9 +47,11 @@ jobs: - shim-v2 - tdvf - virtiofsd + stage: + - ${{ inputs.stage }} exclude: - - stage: release - asset: cloud-hypervisor-glibc + - asset: cloud-hypervisor-glibc + stage: release steps: - name: Login to Kata Containers quay.io if: ${{ inputs.push-to-registry == 'yes' }} @@ -60,7 +63,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} fetch-depth: 0 # This is needed in order to keep the commit ids history - name: Build ${{ matrix.asset }} @@ -88,7 +91,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-artifacts uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/build-kata-static-tarball-arm64.yaml b/.github/workflows/build-kata-static-tarball-arm64.yaml index 1fc981733..2ad97a0ba 100644 --- a/.github/workflows/build-kata-static-tarball-arm64.yaml +++ b/.github/workflows/build-kata-static-tarball-arm64.yaml @@ -9,6 +9,9 @@ on: required: false type: string default: no + commit-hash: + required: false + type: string jobs: build-asset: @@ -41,7 +44,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} fetch-depth: 0 # This is needed in order to keep the commit ids history - name: Build ${{ matrix.asset }} run: | @@ -72,7 +75,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-artifacts uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/build-kata-static-tarball-s390x.yaml b/.github/workflows/build-kata-static-tarball-s390x.yaml index 58186ab8c..cf2831033 100644 --- a/.github/workflows/build-kata-static-tarball-s390x.yaml +++ b/.github/workflows/build-kata-static-tarball-s390x.yaml @@ -9,6 +9,9 @@ on: required: false type: string default: no + commit-hash: + required: false + type: string jobs: build-asset: @@ -37,7 +40,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} fetch-depth: 0 # This is needed in order to keep the commit ids history - name: Build ${{ matrix.asset }} run: | @@ -69,7 +72,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-artifacts uses: actions/download-artifact@v3 with: diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml new file mode 100644 index 000000000..9a47ce0e4 --- /dev/null +++ b/.github/workflows/ci-nightly.yaml @@ -0,0 +1,14 @@ +name: Kata Containers Nightly CI +on: + schedule: + - cron: '0 0 * * *' + workflow_dispatch: + +jobs: + kata-containers-ci-on-push: + uses: ./.github/workflows/ci.yaml + with: + commit-hash: ${{ github.sha }} + pr-number: "nightly" + tag: ${{ github.sha }}-nightly + secrets: inherit diff --git a/.github/workflows/ci-on-push.yaml 
b/.github/workflows/ci-on-push.yaml index 9f7c82eaf..6d4cc7fc0 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -12,65 +12,14 @@ on: - synchronize - reopened - labeled - + paths-ignore: + - 'docs/**' jobs: - build-kata-static-tarball-amd64: + kata-containers-ci-on-push: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml + uses: ./.github/workflows/ci.yaml with: - tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }} - - publish-kata-deploy-payload-amd64: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: build-kata-static-tarball-amd64 - uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml - with: - tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }} - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 + commit-hash: ${{ github.event.pull_request.head.sha }} + pr-number: ${{ github.event.pull_request.number }} + tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }} secrets: inherit - - run-k8s-tests-on-aks: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-aks.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 - secrets: inherit - - run-k8s-tests-on-sev: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-sev.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 - - run-k8s-tests-on-snp: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-snp.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 - - run-k8s-tests-on-tdx: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: publish-kata-deploy-payload-amd64 - uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml - with: - registry: ghcr.io - repo: ${{ github.repository_owner }}/kata-deploy-ci - tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64 - - run-metrics-tests: - if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} - needs: build-kata-static-tarball-amd64 - uses: ./.github/workflows/run-metrics.yaml - with: - tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 000000000..faec7fca4 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,76 @@ +name: Run the Kata Containers CI +on: + workflow_call: + inputs: + commit-hash: + required: true + type: string + pr-number: + required: true + type: string + tag: + required: true + type: string + +jobs: + build-kata-static-tarball-amd64: + 
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml + with: + tarball-suffix: -${{ inputs.tag }} + commit-hash: ${{ inputs.commit-hash }} + + publish-kata-deploy-payload-amd64: + needs: build-kata-static-tarball-amd64 + uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml + with: + tarball-suffix: -${{ inputs.tag }} + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + secrets: inherit + + run-k8s-tests-on-aks: + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-aks.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + pr-number: ${{ inputs.pr-number }} + secrets: inherit + + run-k8s-tests-on-sev: + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-sev.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + + run-k8s-tests-on-snp: + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-snp.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + + run-k8s-tests-on-tdx: + needs: publish-kata-deploy-payload-amd64 + uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml + with: + registry: ghcr.io + repo: ${{ github.repository_owner }}/kata-deploy-ci + tag: ${{ inputs.tag }}-amd64 + commit-hash: ${{ inputs.commit-hash }} + + run-metrics-tests: + needs: build-kata-static-tarball-amd64 + uses: ./.github/workflows/run-metrics.yaml + with: + tarball-suffix: -${{ inputs.tag }} + commit-hash: ${{ inputs.commit-hash }} diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index 97bb309b1..871d73388 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -9,18 +9,21 @@ jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml with: + commit-hash: ${{ github.sha }} push-to-registry: yes secrets: inherit build-assets-arm64: uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml with: + commit-hash: ${{ github.sha }} push-to-registry: yes secrets: inherit build-assets-s390x: uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml with: + commit-hash: ${{ github.sha }} push-to-registry: yes secrets: inherit @@ -28,6 +31,7 @@ jobs: needs: build-assets-amd64 uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml with: + commit-hash: ${{ github.sha }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-amd64 @@ -37,6 +41,7 @@ jobs: needs: build-assets-arm64 uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml with: + commit-hash: ${{ github.sha }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-arm64 @@ -46,6 +51,7 @@ jobs: needs: build-assets-s390x uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml with: + commit-hash: ${{ github.sha }} registry: quay.io repo: kata-containers/kata-deploy-ci tag: kata-containers-s390x diff --git a/.github/workflows/publish-kata-deploy-payload-amd64.yaml b/.github/workflows/publish-kata-deploy-payload-amd64.yaml index 91c7a0612..b5ba900d8 100644 --- a/.github/workflows/publish-kata-deploy-payload-amd64.yaml +++ 
b/.github/workflows/publish-kata-deploy-payload-amd64.yaml @@ -14,6 +14,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: kata-payload: @@ -21,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 diff --git a/.github/workflows/publish-kata-deploy-payload-arm64.yaml b/.github/workflows/publish-kata-deploy-payload-arm64.yaml index c4fd32477..6c35ed8a3 100644 --- a/.github/workflows/publish-kata-deploy-payload-arm64.yaml +++ b/.github/workflows/publish-kata-deploy-payload-arm64.yaml @@ -14,6 +14,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: kata-payload: @@ -25,7 +28,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 diff --git a/.github/workflows/publish-kata-deploy-payload-s390x.yaml b/.github/workflows/publish-kata-deploy-payload-s390x.yaml index 2a0ea8071..ee7fa3fd7 100644 --- a/.github/workflows/publish-kata-deploy-payload-s390x.yaml +++ b/.github/workflows/publish-kata-deploy-payload-s390x.yaml @@ -14,6 +14,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: kata-payload: @@ -25,7 +28,7 @@ jobs: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c553ca514..a50313fd0 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -72,8 +72,7 @@ jobs: - uses: actions/checkout@v3 - name: install hub run: | - HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//') - wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \ + wget -q -O- https://github.com/mislav/hub/releases/download/v2.14.2/hub-linux-amd64-2.14.2.tgz | \ tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub - name: download-artifacts-amd64 diff --git a/.github/workflows/run-k8s-tests-on-aks.yaml b/.github/workflows/run-k8s-tests-on-aks.yaml index a39c2bbcd..d8658270a 100644 --- a/.github/workflows/run-k8s-tests-on-aks.yaml +++ b/.github/workflows/run-k8s-tests-on-aks.yaml @@ -11,6 +11,12 @@ on: tag: required: true type: string + pr-number: + required: true + type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -31,13 +37,13 @@ jobs: DOCKER_REGISTRY: ${{ inputs.registry }} DOCKER_REPO: ${{ inputs.repo }} DOCKER_TAG: ${{ inputs.tag }} - GH_PR_NUMBER: ${{ github.event.pull_request.number }} + GH_PR_NUMBER: ${{ inputs.pr-number }} KATA_HOST_OS: ${{ matrix.host_os }} KATA_HYPERVISOR: ${{ matrix.vmm }} steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Download Azure CLI run: bash tests/integration/gha-run.sh install-azure-cli diff --git a/.github/workflows/run-k8s-tests-on-sev.yaml b/.github/workflows/run-k8s-tests-on-sev.yaml index 52ab7f955..3fc4ca835 100644 --- a/.github/workflows/run-k8s-tests-on-sev.yaml +++ b/.github/workflows/run-k8s-tests-on-sev.yaml @@ -11,6 +11,9 @@ on: tag: required: true type: string + commit-hash: + required: false + 
type: string jobs: run-k8s-tests: @@ -29,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Run tests timeout-minutes: 30 diff --git a/.github/workflows/run-k8s-tests-on-snp.yaml b/.github/workflows/run-k8s-tests-on-snp.yaml index 535c6de6d..8aa1763d2 100644 --- a/.github/workflows/run-k8s-tests-on-snp.yaml +++ b/.github/workflows/run-k8s-tests-on-snp.yaml @@ -11,6 +11,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -29,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Run tests timeout-minutes: 30 diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml index 886b1c026..ccbc16db7 100644 --- a/.github/workflows/run-k8s-tests-on-tdx.yaml +++ b/.github/workflows/run-k8s-tests-on-tdx.yaml @@ -11,6 +11,9 @@ on: tag: required: true type: string + commit-hash: + required: false + type: string jobs: run-k8s-tests: @@ -29,7 +32,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: Run tests timeout-minutes: 30 diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml index 5e06c3074..92a5f8af9 100644 --- a/.github/workflows/run-metrics.yaml +++ b/.github/workflows/run-metrics.yaml @@ -5,16 +5,25 @@ on: tarball-suffix: required: false type: string + commit-hash: + required: false + type: string jobs: run-metrics: + strategy: + fail-fast: true + matrix: + vmm: ['clh', 'qemu'] + max-parallel: 1 runs-on: metrics env: GOPATH: ${{ github.workspace }} + KATA_HYPERVISOR: ${{ matrix.vmm }} steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.head.sha }} + ref: ${{ inputs.commit-hash }} - name: get-kata-tarball uses: actions/download-artifact@v3 @@ -25,8 +34,25 @@ jobs: - name: Install kata run: bash tests/metrics/gha-run.sh install-kata kata-artifacts - - name: run launch times on qemu - run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu + - name: run launch times test + run: bash tests/metrics/gha-run.sh run-test-launchtimes - - name: run launch times on clh - run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh + - name: run memory foot print test + run: bash tests/metrics/gha-run.sh run-test-memory-usage + + - name: run memory usage inside container test + run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container + + - name: run blogbench test + run: bash tests/metrics/gha-run.sh run-test-blogbench + + - name: make metrics tarball ${{ matrix.vmm }} + run: bash tests/metrics/gha-run.sh make-tarball-results + + - name: archive metrics results ${{ matrix.vmm }} + uses: actions/upload-artifact@v3 + with: + name: metrics-artifacts-${{ matrix.vmm }} + path: results-${{ matrix.vmm }}.tar.gz + retention-days: 1 + if-no-files-found: error diff --git a/.github/workflows/static-checks-dragonball.yaml b/.github/workflows/static-checks-dragonball.yaml index 9c8ae694b..61e3fe2c4 100644 --- a/.github/workflows/static-checks-dragonball.yaml +++ b/.github/workflows/static-checks-dragonball.yaml @@ -23,7 +23,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }} run: | ./ci/install_rust.sh - PATH=$PATH:"$HOME/.cargo/bin" + echo PATH="$HOME/.cargo/bin:$PATH" >> $GITHUB_ENV - name: Run Unit Test if: ${{ 
!contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
        run: |
diff --git a/README.md b/README.md
index 7ea956d9d..78a62179c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml)
+[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) [![Kata Containers Nightly CI](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml)
 
 # Kata Containers
diff --git a/docs/design/README.md b/docs/design/README.md
index 2fe93b5f6..63d52c655 100644
--- a/docs/design/README.md
+++ b/docs/design/README.md
@@ -17,6 +17,7 @@ Kata Containers design documents:
 - [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
 - [Design for direct-assigned volume](direct-blk-device-assignment.md)
 - [Design for core-scheduling](core-scheduling.md)
+- [Virtualization Reference Architecture](kata-vra.md)
 
 ---
 
 - [Design proposals](proposals)
diff --git a/docs/design/kata-vra.md b/docs/design/kata-vra.md
new file mode 100644
index 000000000..ba53c3371
--- /dev/null
+++ b/docs/design/kata-vra.md
@@ -0,0 +1,434 @@
+# Virtualization Reference Architecture
+
+## Subject to Change | © 2022 by NVIDIA Corporation. All rights reserved. | For test and development only
+
+Before digging deeper into the virtualization reference architecture, let's
+first look at the various GPUDirect use cases in the following table. We're
+distinguishing between two top-tier use cases where the devices are (1)
+passthrough and (2) virtualized, where a VM gets assigned a virtual function
+(VF) and not the physical function (PF). A combination of PF and VF would also
+be possible.
+
+| Device #1 (passthrough) | Device #2 (passthrough) | P2P Compatibility and Mode |
+| ------------------------- | ----------------------- | -------------------------------------------- |
+| GPU PF | GPU PF | GPUDirect P2P |
+| GPU PF | NIC PF | GPUDirect RDMA |
+| MIG-slice | MIG-slice | _No GPUDirect P2P_ |
+| MIG-slice | NIC PF | GPUDirect RDMA |
+| **Device #1 (virtualized)** | **Device #2 (virtualized)** | **P2P Compatibility and Mode** |
+| Time-slice vGPU VF | Time-slice vGPU VF | _No GPUDirect P2P but NVLINK P2P available_ |
+| Time-slice vGPU VF | NIC VF | GPUDirect RDMA |
+| MIG-slice vGPU | MIG-slice vGPU | _No GPUDirect P2P_ |
+| MIG-slice vGPU | NIC VF | GPUDirect RDMA |
+
+In a virtualized environment, we have several distinct features that may prevent
+peer-to-peer (P2P) communication between two endpoints in a PCI Express
+topology. The IOMMU translates IO virtual addresses (IOVA) to physical addresses
+(PA). Each device behind an IOMMU has its own IOVA memory space; usually no two
+devices share the same IOVA memory space, but it is up to the hypervisor or OS
+how it chooses to map devices to IOVA spaces. Any PCI Express DMA transaction
+will use IOVAs, which the IOMMU must translate. By default, all the traffic is
+routed to the root complex and not issued directly to the peer device.
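+
+As a side note, the IOMMU groups the host kernel builds can be inspected
+through sysfs; the snippet below is a minimal sketch using standard Linux
+sysfs paths, not Kata-specific tooling:
+
+```sh
+# List every device per IOMMU group on the host; devices in the same
+# group can only be assigned to a VM together.
+for group in /sys/kernel/iommu_groups/*; do
+    echo "IOMMU group ${group##*/}:"
+    ls "$group/devices"
+done
+```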
+
+An IOMMU can be used to isolate and protect devices even if virtualization is
+not used; since devices can only access memory regions that are mapped for
+them, a DMA from one device to another is not possible. DPDK uses the IOMMU to
+have better isolation between devices; another benefit is that the IOVA space
+can be represented as contiguous memory even if the PA space is heavily
+scattered.
+
+In the case of virtualization, the IOMMU is responsible for isolating the device
+and memory between VMs for safe device assignment without compromising the host
+and other guest OSes. Without an IOMMU, any device can access the entire system
+and perform DMA transactions _anywhere_.
+
+The second feature is ACS (Access Control Services), which controls which
+devices are allowed to communicate with one another and thus avoids improper
+routing of packets, irrespective of whether the IOMMU is enabled or not.
+
+When the IOMMU is enabled, ACS is normally configured to force all PCI Express
+DMA to go through the root complex so the IOMMU can translate it, which impacts
+performance between peers through higher latency and reduced bandwidth.
+
+A way to avoid the performance hit is to enable Address Translation Services
+(ATS). ATS-capable endpoints can prefetch IOVA -> PA translations from the IOMMU
+and then perform DMA transactions directly to another endpoint. Hypervisors
+enable this by enabling ATS in such endpoints, configuring ACS to enable Direct
+Translated P2P, and configuring the IOMMU to allow Address Translation requests.
+
+Another important factor is that the NVIDIA driver stack will use the PCI
+Express topology of the system it is running on to determine whether the
+hardware is capable of supporting P2P. The driver stack qualifies specific
+chipsets and PCI Express switches for use with GPUDirect P2P. In virtual
+environments, the PCI Express topology is flattened and obfuscated to present a
+uniform environment to the software inside the VM, which breaks the GPUDirect
+P2P use case.
+
+On a bare metal machine, the driver stack groups GPUs into cliques that can
+perform GPUDirect P2P communication, excluding peer mappings where P2P
+communication is not possible, most prominently if GPUs are attached to
+different CPU sockets.
+
+CPUs and local memory banks are referred to as NUMA nodes. In a two-socket
+server, each of the CPUs has a local memory bank for a total of two NUMA nodes.
+Some servers provide the ability to configure additional NUMA nodes per CPU,
+which means a CPU socket can have two NUMA nodes (some servers support four
+NUMA nodes per socket) with local memory banks and L3 NUMA domains for improved
+performance.
+
+One of the current solutions is that the hypervisor provides additional topology
+information that the driver stack can pick up to enable GPUDirect P2P between
+GPUs, even if the virtualized environment does not directly expose it. The PCI
+Express virtual P2P approval capability structure in the PCI configuration space
+is entirely emulated by the hypervisor for passthrough GPU devices.
+
+A clique ID is provided, where GPUs with the same clique ID belong to a group of
+GPUs capable of P2P communication.
+
+On vSphere, Azure, and other CSPs, the hypervisor lays down a `topologies.xml`
+which NCCL can pick up to deduce the right P2P level[^1]. NCCL is leveraging
+InfiniBand (IB) and/or Unified Communication X (UCX) for communication, and
+GPUDirect P2P and GPUDirect RDMA should just work in this case.
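+
+As a hedged illustration, NCCL's documented environment variables can be used
+to inspect or override what it deduced; the variable names below are NCCL's
+own, while the chosen values and the application name are placeholders:
+
+```sh
+# Example override: only allow P2P between peers behind the same
+# PCI Express switch.
+NCCL_P2P_LEVEL=PXB ./my-nccl-app
+
+# Dump the topology NCCL detected (e.g. one derived from a
+# hypervisor-provided topologies.xml) for inspection.
+NCCL_DEBUG=INFO NCCL_TOPO_DUMP_FILE=/tmp/nccl-topo.xml ./my-nccl-app
+```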
+The only caveat is that software or applications that do not use the XML file
+to deduce the topology will fail to enable GPUDirect
+([`nccl-p2p-level`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-p2p-level)).
+
+## Hypervisor PCI Express Topology
+
+To enable every part of the accelerator stack, we propose a virtualized
+reference architecture to enable GPUDirect P2P and GPUDirect RDMA for any
+hypervisor. The idea is split into two parts to enable the right PCI Express
+topology. The first part builds upon extending the PCI Express virtual P2P
+approval capability structure to every device that wants to do P2P in some way
+and groups devices by clique ID. The other part involves replicating a subset of
+the host topology so that applications running in the VM do not need to read
+additional information and enable the P2P capability like in the bare-metal use
+case described above. The driver stack can then deduce automatically whether the
+topology presented in the VM is capable of P2P communication.
+
+We will work with the following host topology in the following sections. It is
+a system with two converged DPUs, each having an `A100X` GPU and two `ConnectX-6`
+network ports connected to the downstream ports of a PCI Express switch.
+
+```sh
++-00.0-[d8-df]----00.0-[d9-df]--+-00.0-[da-db]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+                                |               +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+                                |               \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
+                                \-01.0-[dc-df]----00.0-[dd-df]----08.0-[de-df]----00.0 NVIDIA Corporation GA100 [A100X]
+
++-00.0-[3b-42]----00.0-[3c-42]--+-00.0-[3d-3e]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+                                |               +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
+                                |               \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
+                                \-01.0-[3f-42]----00.0-[40-42]----08.0-[41-42]----00.0 NVIDIA Corporation GA100 [A100X]
+```
+
+The path through the PCI Express switch highlighted above is the optimal and
+preferred path for efficient P2P communication.
+
+## PCI Express Virtual P2P Approval Capability
+
+Most of the time, the PCI Express topology is flattened and obfuscated to ensure
+easy migration of the VM image between different physical hardware topologies.
+In Kata, we can configure the hypervisor to use PCI Express root ports to
+hotplug the VFIO devices one is passing through. A user can select how many PCI
+Express root ports to allocate depending on how many devices are passed through.
+A recent addition to Kata will detect the right number of PCI Express devices
+that need hotplugging and bail out if the number of root ports is insufficient.
+In Kata, we do not automatically increase the number of root ports; we want the
+user to be in full control of the topology.
+
+```toml
+# /etc/kata-containers/configuration.toml
+
+# VFIO devices are hotplugged on a bridge by default.
+# Enable hot-plugging on the root bus. This may be required for devices with
+# a large PCI bar, as this is a current limitation with hot-plugging on
+# a bridge.
+# Default "bridge-port"
+hotplug_vfio = "root-port"
+
+# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
+# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
+# The value means the number of pcie_root_port
+# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
+# Default 0
+pcie_root_port = 8
+```
+
+VFIO devices are hotplugged on a PCIe-PCI bridge by default. Hotplug of PCI
+Express devices is only supported on PCI Express root or downstream ports. With
+this configuration set, if we start up a Kata container, we can inspect our
+topology and see the allocated PCI Express root ports and the hotplugged
+devices.
+
+```sh
+$ lspci -tv
+ -[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+            +-01.0 Red Hat, Inc. Virtio console
+            +-02.0 Red Hat, Inc. Virtio SCSI
+            +-03.0 Red Hat, Inc. Virtio RNG
+            +-04.0-[01]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+            +-05.0-[02]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+            +-06.0-[03]----00.0 NVIDIA Corporation Device 20b8
+            +-07.0-[04]----00.0 NVIDIA Corporation Device 20b8
+            +-08.0-[05]--
+            +-09.0-[06]--
+            +-0a.0-[07]--
+            +-0b.0-[08]--
+            +-0c.0 Red Hat, Inc. Virtio socket
+            +-0d.0 Red Hat, Inc. Virtio file system
+            +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+            +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
+            \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+For devices with huge BARs (Base Address Registers) like the GPU, we need to
+configure the PCI Express root port properly and allocate enough memory for
+mapping; we have added a heuristic to Kata to deduce the right settings so that
+the BARs can be mapped correctly. This functionality is added to
+[`nvidia/go-nvlib`](https://gitlab.com/nvidia/cloud-native/go-nvlib), which is
+now part of Kata.
+
+```sh
+$ sudo dmesg | grep BAR
+[    0.179960] pci 0000:00:04.0: BAR 7: assigned [io  0x1000-0x1fff]
+[    0.179962] pci 0000:00:05.0: BAR 7: assigned [io  0x2000-0x2fff]
+[    0.179963] pci 0000:00:06.0: BAR 7: assigned [io  0x3000-0x3fff]
+[    0.179964] pci 0000:00:07.0: BAR 7: assigned [io  0x4000-0x4fff]
+[    0.179966] pci 0000:00:08.0: BAR 7: assigned [io  0x5000-0x5fff]
+[    0.179967] pci 0000:00:09.0: BAR 7: assigned [io  0x6000-0x6fff]
+[    0.179968] pci 0000:00:0a.0: BAR 7: assigned [io  0x7000-0x7fff]
+[    0.179969] pci 0000:00:0b.0: BAR 7: assigned [io  0x8000-0x8fff]
+[    2.115912] pci 0000:01:00.0: BAR 0: assigned [mem 0x13000000000-0x13001ffffff 64bit pref]
+[    2.116203] pci 0000:01:00.0: BAR 2: assigned [mem 0x13002000000-0x130027fffff 64bit pref]
+[    2.683132] pci 0000:02:00.0: BAR 0: assigned [mem 0x12000000000-0x12001ffffff 64bit pref]
+[    2.683419] pci 0000:02:00.0: BAR 2: assigned [mem 0x12002000000-0x120027fffff 64bit pref]
+[    2.959155] pci 0000:03:00.0: BAR 1: assigned [mem 0x11000000000-0x117ffffffff 64bit pref]
+[    2.959345] pci 0000:03:00.0: BAR 3: assigned [mem 0x11800000000-0x11801ffffff 64bit pref]
+[    2.959523] pci 0000:03:00.0: BAR 0: assigned [mem 0xf9000000-0xf9ffffff]
+[    2.966119] pci 0000:04:00.0: BAR 1: assigned [mem 0x10000000000-0x107ffffffff 64bit pref]
+[    2.966295] pci 0000:04:00.0: BAR 3: assigned [mem 0x10800000000-0x10801ffffff 64bit pref]
+[    2.966472] pci 0000:04:00.0: BAR 0: assigned [mem 0xf7000000-0xf7ffffff]
+```
+
+The NVIDIA driver stack in this case would refuse to do P2P communication since
+(1) the topology is not what it expects, and (2) we do not have a qualified
+chipset.
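+
+From inside the VM, one hypothetical way to check what the driver concluded is
+the topology matrix it reports; `nvidia-smi topo -m` is a standard NVIDIA tool
+invocation, but the exact output naturally depends on the guest:
+
+```sh
+# Print how the driver classifies each device pair (PIX, PXB, PHB, SYS, ...);
+# without P2P approval information, peers will show paths that traverse
+# the (virtual) root complex.
+nvidia-smi topo -m
+```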
+Since our P2P devices are not connected to a PCI Express switch port, we need
+to provide additional information to support the P2P functionality. One way of
+providing such meta information would be to annotate the container; most of the
+settings in Kata's configuration file can be overridden via annotations, but
+this limits flexibility, and a user would need to update all the containers
+they want to run with Kata. The goal is to make such things as transparent as
+possible, so we also introduced
+[CDI](https://github.com/container-orchestrated-devices/container-device-interface)
+(Container Device Interface) to Kata. CDI is a
+[specification](https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md)
+for container runtimes to support third-party devices.
+
+As written before, we can provide a clique ID for the devices that belong
+together and are capable of doing P2P. This information is provided to the
+hypervisor, which will set up things in the VM accordingly. Suppose the user
+wanted to do GPUDirect RDMA with the first GPU and the NIC that reside on the
+same DPU; one could provide the specification telling the hypervisor that they
+belong to the same clique.
+
+```yaml
+# /etc/cdi/nvidia.yaml
+cdiVersion: 0.4.0
+kind: nvidia.com/gpu
+devices:
+- name: gpu0
+  annotations:
+    bdf: "41:00.0"
+    clique-id: "0"
+  containerEdits:
+    deviceNodes:
+    - path: "/dev/vfio/71"
+
+# /etc/cdi/mellanox.yaml
+cdiVersion: 0.4.0
+kind: mellanox.com/nic
+devices:
+- name: nic0
+  annotations:
+    bdf: "3d:00.0"
+    clique-id: "0"
+    attach-pci: "true"
+  containerEdits:
+    deviceNodes:
+    - path: "/dev/vfio/66"
+```
+
+Since this setting is bound to the device and not the container, we do not need
+to alter the container, just allocate the right resource, and GPUDirect RDMA
+will be set up correctly. Rather than exposing them separately, an idea would be
+to expose a GPUDirect RDMA device via NFD (Node Feature Discovery) that combines
+both of them; this way, we could make sure that the right pair is allocated and
+used; more on Kubernetes deployment in the next section.
+
+The GPU driver stack is leveraging the PCI Express virtual P2P approval
+capability, but the NIC stack does not use it yet. One of the action items is
+to enable MOFED to read the P2P approval capability and enable ATS and ACS
+settings as described above.
+
+This way, we could enable GPUDirect P2P and GPUDirect RDMA on any topology
+presented to the VM application. It is the responsibility of the administrator
+or infrastructure engineer to provide the right information either via
+annotations or a CDI specification.
+
+## Host Topology Replication
+
+The other way to represent the PCI Express topology in the VM is to replicate a
+subset of the topology needed to support the P2P use case inside the VM. Similar
+to the configuration for the root ports, we can easily configure the usage of
+PCI Express switch ports to hotplug the devices.
+
+```toml
+# /etc/kata-containers/configuration.toml
+
+# VFIO devices are hotplugged on a bridge by default.
+# Enable hot plugging on the root bus. This may be required for devices with
+# a large PCI bar, as this is a current limitation with hot plugging on
+# a bridge.
+# Default "bridge-port"
+hotplug_vfio = "switch-port"
+
+# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
+# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
+# The value means the number of pcie_switch_port
+# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
+# Default 0
+pcie_switch_port = 8
+```
+
+Each device that is passed through is attached to a PCI Express downstream port
+as illustrated below. We can even replicate the host's two-DPU topology with
+added metadata through the CDI. Most of the time, a container only needs one
+pair of GPU and NIC for GPUDirect RDMA. This is more of a showcase of what we
+can do with the power of Kata and CDI. One could even think of adding groups of
+devices that support P2P, even from different CPU sockets or NUMA nodes, into
+one container; indeed, in the listing below the first group comes from NUMA
+node 0 and the second group from NUMA node 1. Since they are grouped correctly,
+P2P would be enabled naturally inside a group, i.e., a clique.
+
+```sh
+$ lspci -tv
+ -[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+            +-01.0 Red Hat, Inc. Virtio console
+            +-02.0 Red Hat, Inc. Virtio SCSI
+            +-03.0 Red Hat, Inc. Virtio RNG
+            +-04.0-[01-04]----00.0-[02-04]--+-00.0-[03]----00.0 NVIDIA Corporation Device 20b8
+            |                               \-01.0-[04]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
+            +-05.0-[05-08]----00.0-[06-08]--+-00.0-[07]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
+            |                               \-01.0-[08]----00.0 NVIDIA Corporation Device 20b8
+            +-06.0 Red Hat, Inc. Virtio socket
+            +-07.0 Red Hat, Inc. Virtio file system
+            +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+            +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
+            \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+The configuration of using either the root port or switch port can be applied
+on a per-container or per-Pod basis, meaning we can switch PCI Express
+topologies on each run of an application.
+
+## Hypervisor Resource Limits
+
+Every hypervisor will have resource limits in terms of how many PCI Express root
+ports, switch ports, or bridge ports can be created, especially with devices
+that need to reserve a 4K IO range per the PCI specification. Each instance of a
+root or switch port will consume 4K of a very limited IO capacity; 64K is the
+maximum.
+
+Simple math (64K / 4K = 16) brings us to the conclusion that we can have a
+maximum of 16 PCI Express root ports or 16 PCI Express switch ports in QEMU if
+devices with IO BARs are used in the PCI Express hierarchy.
+
+Additionally, one can have 32 slots on the PCI root bus and a maximum of 256
+slots for the complete PCI(e) topology.
+
+Per default, QEMU will attach a multi-function device in the last slot on the
+PCI root bus:
+
+```sh
+            +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+            +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
+            \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+Kata will additionally add `virtio-xxx-pci` devices (consuming 5 slots) plus a
+PCIe-PCI bridge (1 slot) and a DRAM controller (1 slot), meaning that per
+default eight slots are already used. This leaves us 24 slots for adding other
+devices to the root bus.
+
+The problem that arises here is a customer use case with recent RTX GPUs and
+Kata. The user wanted to pass through eight of these GPUs into one container
+and ran into issues.
+The problem is that those cards often consist of four individual device nodes:
+a GPU, an audio device, and two USB controller devices (some cards have a
+USB-C output).
+
+These devices are grouped into one IOMMU group. Since one needs to pass through
+the complete IOMMU group into the VM, we would need to allocate 32 PCI Express
+root ports or 32 PCI Express switch ports (8 cards x 4 devices), which is
+technically impossible due to the resource limits outlined above. Since all the
+devices appear as PCI Express devices, we need to hotplug those into a root or
+switch port.
+
+The solution to this problem is leveraging CDI. For each device, add the
+information whether it is going to be hotplugged as a PCI Express or PCI device,
+which results in either using a PCI Express root/switch port or an ordinary PCI
+bridge. PCI bridges are not affected by the limited IO range. This way, the GPU
+is attached as a PCI Express device to a root/switch port and the other three
+PCI devices to a PCI bridge, leaving enough resources to create the needed PCI
+Express root/switch ports. For example, we're going to attach the GPUs to a PCI
+Express root port and the NICs to a PCI bridge.
+
+```yaml
+# /etc/cdi/mellanox.yaml
+cdiVersion: 0.4.0
+kind: mellanox.com/nic
+devices:
+- name: nic0
+  annotations:
+    bdf: "3d:00.0"
+    clique-id: "0"
+    attach-pci: "true"
+  containerEdits:
+    deviceNodes:
+    - path: "/dev/vfio/66"
+- name: nic1
+  annotations:
+    bdf: "3d:00.1"
+    clique-id: "1"
+    attach-pci: "true"
+  containerEdits:
+    deviceNodes:
+    - path: "/dev/vfio/67"
+```
+
+The configuration is set to use eight root ports for the GPUs and attach the
+NICs to a PCI bridge that is connected to a PCI Express-to-PCI bridge, which is
+the preferred way of introducing a PCI topology in a PCI Express machine.
+
+```sh
+$ lspci -tv
+-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+           +-01.0 Red Hat, Inc. Virtio console
+           +-02.0 Red Hat, Inc. Virtio SCSI
+           +-03.0 Red Hat, Inc. Virtio RNG
+           +-04.0-[01]----00.0 NVIDIA Corporation Device 20b8
+           +-05.0-[02]----00.0 NVIDIA Corporation Device 20b8
+           +-06.0-[03]--
+           +-07.0-[04]--
+           +-08.0-[05]--
+           +-09.0-[06]--
+           +-0a.0-[07]--
+           +-0b.0-[08]--
+           +-0c.0-[09-0a]----00.0-[0a]--+-00.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
+           |                            \-01.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
+           +-0d.0 Red Hat, Inc. Virtio socket
+           +-0e.0 Red Hat, Inc. Virtio file system
+           +-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+           +-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
+           \-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
+```
+
+The PCI devices will each consume one of the 256 slots available in the
+complete PCI(e) topology, leaving the scarce IO resources for the PCI Express
+devices that need them.
diff --git a/src/agent/rustjail/src/cgroups/fs/mod.rs b/src/agent/rustjail/src/cgroups/fs/mod.rs
index 80c31b617..90cfe3c0a 100644
--- a/src/agent/rustjail/src/cgroups/fs/mod.rs
+++ b/src/agent/rustjail/src/cgroups/fs/mod.rs
@@ -39,11 +39,9 @@ use std::path::Path;
 
 const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online";
 
-// Convenience macro to obtain the scope logger
-macro_rules! sl {
-    () => {
-        slog_scope::logger().new(o!("subsystem" => "cgroups"))
-    };
+// Convenience function to obtain the scope logger.
+fn sl() -> slog::Logger {
+    slog_scope::logger().new(o!("subsystem" => "cgroups"))
 }
 
 macro_rules!
get_controller_or_return_singular_none { @@ -82,7 +80,7 @@ impl CgroupManager for Manager { fn set(&self, r: &LinuxResources, update: bool) -> Result<()> { info!( - sl!(), + sl(), "cgroup manager set resources for container. Resources input {:?}", r ); @@ -120,7 +118,7 @@ impl CgroupManager for Manager { // set devices resources set_devices_resources(&self.cgroup, &r.devices, res); - info!(sl!(), "resources after processed {:?}", res); + info!(sl(), "resources after processed {:?}", res); // apply resources self.cgroup.apply(res)?; @@ -197,7 +195,7 @@ impl CgroupManager for Manager { if guest_cpuset.is_empty() { return Ok(()); } - info!(sl!(), "update_cpuset_path to: {}", guest_cpuset); + info!(sl(), "update_cpuset_path to: {}", guest_cpuset); let h = cgroups::hierarchies::auto(); let root_cg = h.root_control_group(); @@ -205,12 +203,12 @@ impl CgroupManager for Manager { let root_cpuset_controller: &CpuSetController = root_cg.controller_of().unwrap(); let path = root_cpuset_controller.path(); let root_path = Path::new(path); - info!(sl!(), "root cpuset path: {:?}", &path); + info!(sl(), "root cpuset path: {:?}", &path); let container_cpuset_controller: &CpuSetController = self.cgroup.controller_of().unwrap(); let path = container_cpuset_controller.path(); let container_path = Path::new(path); - info!(sl!(), "container cpuset path: {:?}", &path); + info!(sl(), "container cpuset path: {:?}", &path); let mut paths = vec![]; for ancestor in container_path.ancestors() { @@ -219,7 +217,7 @@ impl CgroupManager for Manager { } paths.push(ancestor); } - info!(sl!(), "parent paths to update cpuset: {:?}", &paths); + info!(sl(), "parent paths to update cpuset: {:?}", &paths); let mut i = paths.len(); loop { @@ -233,7 +231,7 @@ impl CgroupManager for Manager { .to_str() .unwrap() .trim_start_matches(root_path.to_str().unwrap()); - info!(sl!(), "updating cpuset for parent path {:?}", &r_path); + info!(sl(), "updating cpuset for parent path {:?}", &r_path); let cg = new_cgroup(cgroups::hierarchies::auto(), r_path)?; let cpuset_controller: &CpuSetController = cg.controller_of().unwrap(); cpuset_controller.set_cpus(guest_cpuset)?; @@ -241,7 +239,7 @@ impl CgroupManager for Manager { if !container_cpuset.is_empty() { info!( - sl!(), + sl(), "updating cpuset for container path: {:?} cpuset: {}", &container_path, container_cpuset @@ -276,7 +274,7 @@ fn set_network_resources( network: &LinuxNetwork, res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set network"); + info!(sl(), "cgroup manager set network"); // set classid // description can be found at https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/net_cls.html @@ -303,7 +301,7 @@ fn set_devices_resources( device_resources: &[LinuxDeviceCgroup], res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set devices"); + info!(sl(), "cgroup manager set devices"); let mut devices = vec![]; for d in device_resources.iter() { @@ -332,7 +330,7 @@ fn set_hugepages_resources( hugepage_limits: &[LinuxHugepageLimit], res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set hugepage"); + info!(sl(), "cgroup manager set hugepage"); let mut limits = vec![]; let hugetlb_controller = cg.controller_of::(); @@ -346,7 +344,7 @@ fn set_hugepages_resources( limits.push(hr); } else { warn!( - sl!(), + sl(), "{} page size support cannot be verified, dropping requested limit", l.page_size ); } @@ -359,7 +357,7 @@ fn set_block_io_resources( blkio: &LinuxBlockIo, res: &mut cgroups::Resources, ) { - info!(sl!(), "cgroup manager set 
block io"); + info!(sl(), "cgroup manager set block io"); res.blkio.weight = blkio.weight; res.blkio.leaf_weight = blkio.leaf_weight; @@ -387,13 +385,13 @@ fn set_block_io_resources( } fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> { - info!(sl!(), "cgroup manager set cpu"); + info!(sl(), "cgroup manager set cpu"); let cpuset_controller: &CpuSetController = cg.controller_of().unwrap(); if !cpu.cpus.is_empty() { if let Err(e) = cpuset_controller.set_cpus(&cpu.cpus) { - warn!(sl!(), "write cpuset failed: {:?}", e); + warn!(sl(), "write cpuset failed: {:?}", e); } } @@ -424,7 +422,7 @@ fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> { } fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool) -> Result<()> { - info!(sl!(), "cgroup manager set memory"); + info!(sl(), "cgroup manager set memory"); let mem_controller: &MemController = cg.controller_of().unwrap(); if !update { @@ -493,7 +491,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool } fn set_pids_resources(cg: &cgroups::Cgroup, pids: &LinuxPids) -> Result<()> { - info!(sl!(), "cgroup manager set pids"); + info!(sl(), "cgroup manager set pids"); let pid_controller: &PidController = cg.controller_of().unwrap(); let v = if pids.limit > 0 { MaxValue::Value(pids.limit) @@ -965,7 +963,7 @@ pub fn get_paths() -> Result> { for l in fs::read_to_string(PATHS)?.lines() { let fl: Vec<&str> = l.split(':').collect(); if fl.len() != 3 { - info!(sl!(), "Corrupted cgroup data!"); + info!(sl(), "Corrupted cgroup data!"); continue; } @@ -986,7 +984,7 @@ pub fn get_mounts(paths: &HashMap) -> Result = p[1].split(' ').collect(); if post.len() != 3 { - warn!(sl!(), "can't parse {} line {:?}", MOUNTS, l); + warn!(sl(), "can't parse {} line {:?}", MOUNTS, l); continue; } diff --git a/src/agent/rustjail/src/cgroups/notifier.rs b/src/agent/rustjail/src/cgroups/notifier.rs index 9f91b3584..5260a3d3f 100644 --- a/src/agent/rustjail/src/cgroups/notifier.rs +++ b/src/agent/rustjail/src/cgroups/notifier.rs @@ -16,11 +16,9 @@ use inotify::{Inotify, WatchMask}; use tokio::io::AsyncReadExt; use tokio::sync::mpsc::{channel, Receiver}; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "cgroups_notifier")) - }; +// Convenience function to obtain the scope logger. 
+fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "cgroups_notifier")) } pub async fn notify_oom(cid: &str, cg_dir: String) -> Result> { @@ -38,7 +36,7 @@ pub async fn notify_oom(cid: &str, cg_dir: String) -> Result> { fn get_value_from_cgroup(path: &Path, key: &str) -> Result { let content = fs::read_to_string(path)?; info!( - sl!(), + sl(), "get_value_from_cgroup file: {:?}, content: {}", &path, &content ); @@ -67,11 +65,11 @@ async fn register_memory_event_v2( let event_control_path = Path::new(&cg_dir).join(memory_event_name); let cgroup_event_control_path = Path::new(&cg_dir).join(cgroup_event_name); info!( - sl!(), + sl(), "register_memory_event_v2 event_control_path: {:?}", &event_control_path ); info!( - sl!(), + sl(), "register_memory_event_v2 cgroup_event_control_path: {:?}", &cgroup_event_control_path ); @@ -82,8 +80,8 @@ async fn register_memory_event_v2( // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?; - info!(sl!(), "ev_wd: {:?}", ev_wd); - info!(sl!(), "cg_wd: {:?}", cg_wd); + info!(sl(), "ev_wd: {:?}", ev_wd); + info!(sl(), "cg_wd: {:?}", cg_wd); let (sender, receiver) = channel(100); let containere_id = containere_id.to_string(); @@ -97,17 +95,17 @@ async fn register_memory_event_v2( while let Some(event_or_error) = stream.next().await { let event = event_or_error.unwrap(); info!( - sl!(), + sl(), "container[{}] get event for container: {:?}", &containere_id, &event ); // info!("is1: {}", event.wd == wd1); - info!(sl!(), "event.wd: {:?}", event.wd); + info!(sl(), "event.wd: {:?}", event.wd); if event.wd == ev_wd { let oom = get_value_from_cgroup(&event_control_path, "oom_kill"); if oom.unwrap_or(0) > 0 { let _ = sender.send(containere_id.clone()).await.map_err(|e| { - error!(sl!(), "send containere_id failed, error: {:?}", e); + error!(sl(), "send containere_id failed, error: {:?}", e); }); return; } @@ -171,13 +169,13 @@ async fn register_memory_event( let mut buf = [0u8; 8]; match eventfd_stream.read(&mut buf).await { Err(err) => { - warn!(sl!(), "failed to read from eventfd: {:?}", err); + warn!(sl(), "failed to read from eventfd: {:?}", err); return; } Ok(_) => { let content = fs::read_to_string(path.clone()); info!( - sl!(), + sl(), "cgroup event for container: {}, path: {:?}, content: {:?}", &containere_id, &path, @@ -193,7 +191,7 @@ async fn register_memory_event( } let _ = sender.send(containere_id.clone()).await.map_err(|e| { - error!(sl!(), "send containere_id failed, error: {:?}", e); + error!(sl(), "send containere_id failed, error: {:?}", e); }); } }); diff --git a/src/agent/rustjail/src/container.rs b/src/agent/rustjail/src/container.rs index b1d7499cd..2964ae377 100644 --- a/src/agent/rustjail/src/container.rs +++ b/src/agent/rustjail/src/container.rs @@ -1596,10 +1596,8 @@ mod tests { use tempfile::tempdir; use test_utils::skip_if_not_root; - macro_rules! 
sl { - () => { - slog_scope::logger() - }; + fn sl() -> slog::Logger { + slog_scope::logger() } #[test] @@ -1854,7 +1852,7 @@ mod tests { let _ = new_linux_container_and_then(|mut c: LinuxContainer| { c.processes.insert( 1, - Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap(), + Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap(), ); let p = c.get_process("123"); assert!(p.is_ok(), "Expecting Ok, Got {:?}", p); @@ -1881,7 +1879,7 @@ mod tests { let (c, _dir) = new_linux_container(); let ret = c .unwrap() - .start(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap()) + .start(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap()) .await; assert!(ret.is_err(), "Expecting Err, Got {:?}", ret); } @@ -1891,7 +1889,7 @@ mod tests { let (c, _dir) = new_linux_container(); let ret = c .unwrap() - .run(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap()) + .run(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap()) .await; assert!(ret.is_err(), "Expecting Err, Got {:?}", ret); } diff --git a/src/agent/rustjail/src/process.rs b/src/agent/rustjail/src/process.rs index cdecae130..0e7fe73ef 100644 --- a/src/agent/rustjail/src/process.rs +++ b/src/agent/rustjail/src/process.rs @@ -161,7 +161,7 @@ impl Process { pub fn notify_term_close(&mut self) { let notify = self.term_exit_notifier.clone(); - notify.notify_waiters(); + notify.notify_one(); } pub fn close_stdin(&mut self) { diff --git a/src/agent/src/device.rs b/src/agent/src/device.rs index f292299dc..2b8af8229 100644 --- a/src/agent/src/device.rs +++ b/src/agent/src/device.rs @@ -26,11 +26,9 @@ use oci::{LinuxDeviceCgroup, LinuxResources, Spec}; use protocols::agent::Device; use tracing::instrument; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "device")) - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "device")) } const VM_ROOTFS: &str = "/"; @@ -78,7 +76,7 @@ where { let syspci = Path::new(&syspci); let drv = drv.as_ref(); - info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv); + info!(sl(), "rebind_pci_driver: {} => {:?}", dev, drv); let devpath = syspci.join("devices").join(dev.to_string()); let overridepath = &devpath.join("driver_override"); @@ -606,7 +604,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - let host_minor = specdev.minor; info!( - sl!(), + sl(), "update_spec_devices() updating device"; "container_path" => &specdev.path, "type" => &specdev.r#type, @@ -659,7 +657,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) - if let Some(update) = res_updates.get(&(host_type.as_str(), host_major, host_minor)) { info!( - sl!(), + sl(), "update_spec_devices() updating resource"; "type" => &host_type, "host_major" => host_major, @@ -923,7 +921,7 @@ pub async fn add_devices( #[instrument] async fn add_device(device: &Device, sandbox: &Arc>) -> Result { // log before validation to help with debugging gRPC protocol version differences. 
- info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}", + info!(sl(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}", device.id, device.type_, device.vm_path, device.container_path, device.options); if device.type_.is_empty() { diff --git a/src/agent/src/image_rpc.rs b/src/agent/src/image_rpc.rs index 571994a0d..d9cd45b84 100644 --- a/src/agent/src/image_rpc.rs +++ b/src/agent/src/image_rpc.rs @@ -38,11 +38,9 @@ const KATA_CC_IMAGE_WORK_DIR: &str = "/run/image/"; const KATA_CC_PAUSE_BUNDLE: &str = "/pause_bundle"; const CONFIG_JSON: &str = "config.json"; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger() - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "cgroups")) } pub struct ImageService { @@ -57,18 +55,17 @@ impl ImageService { env::set_var("CC_IMAGE_WORK_DIR", KATA_CC_IMAGE_WORK_DIR); let mut image_client = ImageClient::default(); - let image_policy_file = &AGENT_CONFIG.read().await.image_policy_file; + let image_policy_file = &AGENT_CONFIG.image_policy_file; if !image_policy_file.is_empty() { image_client.config.file_paths.sigstore_config = image_policy_file.clone(); } - let simple_signing_sigstore_config = - &AGENT_CONFIG.read().await.simple_signing_sigstore_config; + let simple_signing_sigstore_config = &AGENT_CONFIG.simple_signing_sigstore_config; if !simple_signing_sigstore_config.is_empty() { image_client.config.file_paths.sigstore_config = simple_signing_sigstore_config.clone(); } - let image_registry_auth_file = &AGENT_CONFIG.read().await.image_registry_auth_file; + let image_registry_auth_file = &AGENT_CONFIG.image_registry_auth_file; if !image_registry_auth_file.is_empty() { image_client.config.file_paths.auth_file = image_registry_auth_file.clone(); } @@ -88,7 +85,7 @@ impl ImageService { return Err(anyhow!("Pause image not present in rootfs")); } - info!(sl!(), "use guest pause image cid {:?}", cid); + info!(sl(), "use guest pause image cid {:?}", cid); let pause_bundle = Path::new(CONTAINER_BASE).join(cid); let pause_rootfs = pause_bundle.join("rootfs"); let pause_config = pause_bundle.join(CONFIG_JSON); @@ -159,12 +156,12 @@ impl ImageService { async fn pull_image(&self, req: &image::PullImageRequest) -> Result { env::set_var("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH); - let https_proxy = &AGENT_CONFIG.read().await.https_proxy; + let https_proxy = &AGENT_CONFIG.https_proxy; if !https_proxy.is_empty() { env::set_var("HTTPS_PROXY", https_proxy); } - let no_proxy = &AGENT_CONFIG.read().await.no_proxy; + let no_proxy = &AGENT_CONFIG.no_proxy; if !no_proxy.is_empty() { env::set_var("NO_PROXY", no_proxy); } @@ -179,7 +176,7 @@ impl ImageService { return Ok(image.to_owned()); } - let aa_kbc_params = &AGENT_CONFIG.read().await.aa_kbc_params; + let aa_kbc_params = &AGENT_CONFIG.aa_kbc_params; if !aa_kbc_params.is_empty() { match self.attestation_agent_started.compare_exchange_weak( false, @@ -188,22 +185,21 @@ impl ImageService { Ordering::SeqCst, ) { Ok(_) => Self::init_attestation_agent()?, - Err(_) => info!(sl!(), "Attestation Agent already running"), + Err(_) => info!(sl(), "Attestation Agent already running"), } } // If the attestation-agent is being used, then enable the authenticated credentials support info!( - sl!(), + sl(), "image_client.config.auth set to: {}", !aa_kbc_params.is_empty() ); 
self.image_client.lock().await.config.auth = !aa_kbc_params.is_empty(); // Read enable signature verification from the agent config and set it in the image_client - let enable_signature_verification = - &AGENT_CONFIG.read().await.enable_signature_verification; + let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification; info!( - sl!(), + sl(), "enable_signature_verification set to: {}", enable_signature_verification ); self.image_client.lock().await.config.security_validate = *enable_signature_verification; @@ -215,7 +211,7 @@ impl ImageService { let decrypt_config = format!("provider:attestation-agent:{}", aa_kbc_params); - info!(sl!(), "pull image {:?}, bundle path {:?}", cid, bundle_path); + info!(sl(), "pull image {:?}, bundle path {:?}", cid, bundle_path); // Image layers will store at KATA_CC_IMAGE_WORK_DIR, generated bundles // with rootfs and config.json will store under CONTAINER_BASE/cid. let res = self @@ -228,13 +224,13 @@ impl ImageService { match res { Ok(image) => { info!( - sl!(), + sl(), "pull and unpack image {:?}, cid: {:?}, with image-rs succeed. ", image, cid ); } Err(e) => { error!( - sl!(), + sl(), "pull and unpack image {:?}, cid: {:?}, with image-rs failed with {:?}. ", image, cid, diff --git a/src/agent/src/main.rs b/src/agent/src/main.rs index 1b6b5fe42..a869e5afa 100644 --- a/src/agent/src/main.rs +++ b/src/agent/src/main.rs @@ -65,7 +65,7 @@ use tokio::{ io::AsyncWrite, sync::{ watch::{channel, Receiver}, - Mutex, RwLock, + Mutex, }, task::JoinHandle, }; @@ -84,12 +84,11 @@ cfg_if! { const NAME: &str = "kata-agent"; lazy_static! { - static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new( + static ref AGENT_CONFIG: AgentConfig = // Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig // clap::Parser::parse() greedily process all command line input including cargo test parameters, // so should only be used inside main. - AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap() - )); + AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap(); } #[derive(Parser)] @@ -182,13 +181,13 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> { lazy_static::initialize(&AGENT_CONFIG); - init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?; + init_agent_as_init(&logger, AGENT_CONFIG.unified_cgroup_hierarchy)?; drop(logger_async_guard); } else { lazy_static::initialize(&AGENT_CONFIG); } - let config = AGENT_CONFIG.read().await; + let config = &AGENT_CONFIG; let log_vport = config.log_vport as u32; let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone())); @@ -201,7 +200,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> { let (logger, logger_async_guard) = logging::create_logger(NAME, "agent", config.log_level, writer); - announce(&logger, &config); + announce(&logger, config); // This variable is required as it enables the global (and crucially static) logger, // which is required to satisfy the lifetime constraints of the auto-generated gRPC code.
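Dropping the `RwLock` around `AGENT_CONFIG` is sound because the config is parsed once from `/proc/cmdline` and never mutated afterwards; that is why every `AGENT_CONFIG.read().await` call site in this patch collapses into a plain field access. A minimal sketch of the resulting pattern, with hypothetical fields standing in for the real `AgentConfig`:

```rust
use lazy_static::lazy_static;

struct AgentConfig {
    unified_cgroup_hierarchy: bool,
    container_pipe_size: i32,
}

impl AgentConfig {
    // Stand-in for AgentConfig::from_cmdline(); the values are made up.
    fn parse() -> Self {
        AgentConfig {
            unified_cgroup_hierarchy: false,
            container_pipe_size: 0,
        }
    }
}

lazy_static! {
    // Initialized on first access and immutable afterwards, so readers
    // need neither a lock nor an async context.
    static ref AGENT_CONFIG: AgentConfig = AgentConfig::parse();
}

fn main() {
    // Was: AGENT_CONFIG.read().await.container_pipe_size
    let pipe_size = AGENT_CONFIG.container_pipe_size;
    println!("container pipe size: {}", pipe_size);
}
```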
@@ -229,7 +228,7 @@ async fn real_main() -> std::result::Result<(), Box> { let span_guard = root_span.enter(); // Start the sandbox and wait for its ttRPC server to end - start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?; + start_sandbox(&logger, config, init_mode, &mut tasks, shutdown_rx.clone()).await?; // Install a NOP logger for the remainder of the shutdown sequence // to ensure any log calls made by local crates using the scope logger diff --git a/src/agent/src/metrics.rs b/src/agent/src/metrics.rs index a5522c0eb..d7fc4d12b 100644 --- a/src/agent/src/metrics.rs +++ b/src/agent/src/metrics.rs @@ -15,11 +15,9 @@ use tracing::instrument; const NAMESPACE_KATA_AGENT: &str = "kata_agent"; const NAMESPACE_KATA_GUEST: &str = "kata_guest"; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "metrics")) - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "metrics")) } lazy_static! { @@ -139,7 +137,7 @@ fn update_agent_metrics() -> Result<()> { Ok(p) => p, Err(e) => { // FIXME: return Ok for all errors? - warn!(sl!(), "failed to create process instance: {:?}", e); + warn!(sl(), "failed to create process instance: {:?}", e); return Ok(()); } @@ -160,7 +158,7 @@ fn update_agent_metrics() -> Result<()> { // io match me.io() { Err(err) => { - info!(sl!(), "failed to get process io stat: {:?}", err); + info!(sl(), "failed to get process io stat: {:?}", err); } Ok(io) => { set_gauge_vec_proc_io(&AGENT_IO_STAT, &io); @@ -169,7 +167,7 @@ fn update_agent_metrics() -> Result<()> { match me.stat() { Err(err) => { - info!(sl!(), "failed to get process stat: {:?}", err); + info!(sl(), "failed to get process stat: {:?}", err); } Ok(stat) => { set_gauge_vec_proc_stat(&AGENT_PROC_STAT, &stat); @@ -177,7 +175,7 @@ fn update_agent_metrics() -> Result<()> { } match me.status() { - Err(err) => error!(sl!(), "failed to get process status: {:?}", err), + Err(err) => error!(sl(), "failed to get process status: {:?}", err), Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status), } @@ -189,7 +187,7 @@ fn update_guest_metrics() { // try get load and task info match procfs::LoadAverage::new() { Err(err) => { - info!(sl!(), "failed to get guest LoadAverage: {:?}", err); + info!(sl(), "failed to get guest LoadAverage: {:?}", err); } Ok(load) => { GUEST_LOAD @@ -209,7 +207,7 @@ fn update_guest_metrics() { // try to get disk stats match procfs::diskstats() { Err(err) => { - info!(sl!(), "failed to get guest diskstats: {:?}", err); + info!(sl(), "failed to get guest diskstats: {:?}", err); } Ok(diskstats) => { for diskstat in diskstats { @@ -221,7 +219,7 @@ fn update_guest_metrics() { // try to get vm stats match procfs::vmstat() { Err(err) => { - info!(sl!(), "failed to get guest vmstat: {:?}", err); + info!(sl(), "failed to get guest vmstat: {:?}", err); } Ok(vmstat) => { for (k, v) in vmstat { @@ -233,7 +231,7 @@ fn update_guest_metrics() { // cpu stat match procfs::KernelStats::new() { Err(err) => { - info!(sl!(), "failed to get guest KernelStats: {:?}", err); + info!(sl(), "failed to get guest KernelStats: {:?}", err); } Ok(kernel_stats) => { set_gauge_vec_cpu_time(&GUEST_CPU_TIME, "total", &kernel_stats.total); @@ -246,7 +244,7 @@ fn update_guest_metrics() { // try to get net device stats match procfs::net::dev_status() { Err(err) => { - info!(sl!(), "failed to get guest net::dev_status: {:?}", err); + info!(sl(), "failed to 
get guest net::dev_status: {:?}", err); } Ok(devs) => { // netdev: map[string]procfs::net::DeviceStatus @@ -259,7 +257,7 @@ fn update_guest_metrics() { // get statistics about memory from /proc/meminfo match procfs::Meminfo::new() { Err(err) => { - info!(sl!(), "failed to get guest Meminfo: {:?}", err); + info!(sl(), "failed to get guest Meminfo: {:?}", err); } Ok(meminfo) => { set_gauge_vec_meminfo(&GUEST_MEMINFO, &meminfo); diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 9049cd5c2..4e7429e49 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -118,33 +118,25 @@ const ERR_NO_SANDBOX_PIDNS: &str = "Sandbox does not have sandbox_pidns"; // not available. const IPTABLES_RESTORE_WAIT_SEC: u64 = 5; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger() - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger() } -// Convenience macro to wrap an error and response to ttrpc client -macro_rules! ttrpc_error { - ($code:path, $err:expr $(,)?) => { - get_rpc_status($code, format!("{:?}", $err)) - }; +// Convenience function to wrap an error and response to ttrpc client +fn ttrpc_error(code: ttrpc::Code, err: impl std::fmt::Debug) -> ttrpc::Error { + get_rpc_status(code, format!("{:?}", err)) } -macro_rules! is_allowed { - ($req:ident) => { - if !AGENT_CONFIG - .read() - .await - .is_allowed_endpoint($req.descriptor_dyn().name()) - { - return Err(ttrpc_error!( - ttrpc::Code::UNIMPLEMENTED, - format!("{} is blocked", $req.descriptor_dyn().name()), - )); - } - }; +fn is_allowed(req: &impl MessageDyn) -> ttrpc::Result<()> { + if !AGENT_CONFIG.is_allowed_endpoint(req.descriptor_dyn().name()) { + Err(ttrpc_error( + ttrpc::Code::UNIMPLEMENTED, + format!("{} is blocked", req.descriptor_dyn().name()), + )) + } else { + Ok(()) + } } #[derive(Clone, Debug)] @@ -207,14 +199,14 @@ impl AgentService { let mut oci = match oci_spec.as_mut() { Some(spec) => rustjail::grpc_to_oci(spec), None => { - error!(sl!(), "no oci spec in the create container request!"); + error!(sl(), "no oci spec in the create container request!"); return Err(anyhow!(nix::Error::EINVAL)); } }; - info!(sl!(), "receive createcontainer, spec: {:?}", &oci); + info!(sl(), "receive createcontainer, spec: {:?}", &oci); info!( - sl!(), + sl(), "receive createcontainer, storages: {:?}", &req.storages ); @@ -237,9 +229,9 @@ impl AgentService { let dev_major_minor = format!("{}:{}", specdev.major, specdev.minor); if specdev.path == TRUSTED_STORAGE_DEVICE { - let data_integrity = AGENT_CONFIG.read().await.data_integrity; + let data_integrity = AGENT_CONFIG.data_integrity; info!( - sl!(), + sl(), "trusted_store device major:min {}, enable data integrity {}", dev_major_minor, data_integrity.to_string() @@ -260,7 +252,7 @@ impl AgentService { // here, the agent will rely on rustjail (using the oci.Mounts // list) to bind mount all of them inside the container. let m = add_storages( - sl!(), + sl(), req.storages.to_vec(), self.sandbox.clone(), Some(req.container_id.clone()), @@ -308,33 +300,33 @@ impl AgentService { }; let mut ctr: LinuxContainer = - LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl!())?; + LinuxContainer::new(cid.as_str(), CONTAINER_BASE, opts, &sl())?; - let pipe_size = AGENT_CONFIG.read().await.container_pipe_size; + let pipe_size = AGENT_CONFIG.container_pipe_size; let p = if let Some(p) = oci.process { - Process::new(&sl!(), &p, cid.as_str(), true, pipe_size)? 
+ Process::new(&sl(), &p, cid.as_str(), true, pipe_size)? } else { - info!(sl!(), "no process configurations!"); + info!(sl(), "no process configurations!"); return Err(anyhow!(nix::Error::EINVAL)); }; // if starting container failed, we will do some rollback work // to ensure no resources are leaked. if let Err(err) = ctr.start(p).await { - error!(sl!(), "failed to start container: {:?}", err); + error!(sl(), "failed to start container: {:?}", err); if let Err(e) = ctr.destroy().await { - error!(sl!(), "failed to destroy container: {:?}", e); + error!(sl(), "failed to destroy container: {:?}", e); } if let Err(e) = remove_container_resources(&mut s, &cid) { - error!(sl!(), "failed to remove container resources: {:?}", e); + error!(sl(), "failed to remove container resources: {:?}", e); } return Err(err); } s.update_shared_pidns(&ctr)?; s.add_container(ctr); - info!(sl!(), "created container!"); + info!(sl(), "created container!"); Ok(()) } @@ -431,7 +423,7 @@ impl AgentService { let cid = req.container_id.clone(); let exec_id = req.exec_id.clone(); - info!(sl!(), "do_exec_process cid: {} eid: {}", cid, exec_id); + info!(sl(), "do_exec_process cid: {} eid: {}", cid, exec_id); let s = self.sandbox.clone(); let mut sandbox = s.lock().await; @@ -444,9 +436,9 @@ impl AgentService { // Apply any necessary corrections for PCI addresses update_env_pci(&mut process.Env, &sandbox.pcimap)?; - let pipe_size = AGENT_CONFIG.read().await.container_pipe_size; + let pipe_size = AGENT_CONFIG.container_pipe_size; let ocip = rustjail::process_grpc_to_oci(&process); - let p = Process::new(&sl!(), &ocip, exec_id.as_str(), false, pipe_size)?; + let p = Process::new(&sl(), &ocip, exec_id.as_str(), false, pipe_size)?; let ctr = sandbox .get_container(&cid) @@ -464,7 +456,7 @@ impl AgentService { let s = self.sandbox.clone(); info!( - sl!(), + sl(), "signal process"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -486,7 +478,7 @@ impl AgentService { match p.signal(sig) { Err(Errno::ESRCH) => { info!( - sl!(), + sl(), "signal encounter ESRCH, continue"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -502,7 +494,7 @@ impl AgentService { if eid.is_empty() { // eid is empty, signal all the remaining processes in the container cgroup info!( - sl!(), + sl(), "signal all the remaining processes"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -510,7 +502,7 @@ impl AgentService { if let Err(err) = self.freeze_cgroup(&cid, FreezerState::Frozen).await { warn!( - sl!(), + sl(), "freeze cgroup failed"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -523,7 +515,7 @@ impl AgentService { let res = unsafe { libc::kill(*pid, sig) }; if let Err(err) = Errno::result(res).map(drop) { warn!( - sl!(), + sl(), "signal failed"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -534,7 +526,7 @@ impl AgentService { } if let Err(err) = self.freeze_cgroup(&cid, FreezerState::Thawed).await { warn!( - sl!(), + sl(), "unfreeze cgroup failed"; "container-id" => cid.clone(), "exec-id" => eid.clone(), @@ -579,7 +571,7 @@ impl AgentService { let (exit_send, mut exit_recv) = tokio::sync::mpsc::channel(100); info!( - sl!(), + sl(), "wait process"; "container-id" => cid.clone(), "exec-id" => eid.clone() @@ -596,9 +588,9 @@ impl AgentService { }; if let Some(mut exit_rx) = exit_rx { - info!(sl!(), "cid {} eid {} waiting for exit signal", &cid, &eid); + info!(sl(), "cid {} eid {} waiting for exit signal", &cid, &eid); while exit_rx.changed().await.is_ok() {} - info!(sl!(), "cid {} eid {} 
received exit signal", &cid, &eid); + info!(sl(), "cid {} eid {} received exit signal", &cid, &eid); } let mut sandbox = s.lock().await; @@ -673,16 +665,15 @@ impl AgentService { let cid = req.container_id; let eid = req.exec_id; - let term_exit_notifier; + let mut term_exit_notifier = Arc::new(tokio::sync::Notify::new()); let reader = { let s = self.sandbox.clone(); let mut sandbox = s.lock().await; let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?; - term_exit_notifier = p.term_exit_notifier.clone(); - if p.term_master.is_some() { + term_exit_notifier = p.term_exit_notifier.clone(); p.get_reader(StreamType::TermMaster) } else if stdout { if p.parent_stdout.is_some() { @@ -727,7 +718,7 @@ impl AgentService { .join(container_id) .join(CONFIG_JSON); debug!( - sl!(), + sl(), "Image bundle config path: {:?}", image_oci_config_path ); @@ -772,9 +763,9 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CreateContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "create_container", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_create_container(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -785,9 +776,9 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::StartContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "start_container", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_start_container(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -798,10 +789,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::RemoveContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "remove_container", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_remove_container(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -812,9 +803,9 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ExecProcessRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "exec_process", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_exec_process(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -825,9 +816,9 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::SignalProcessRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "signal_process", req); - is_allowed!(req); + is_allowed(&req)?; match self.do_signal_process(req).await { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(_) => Ok(Empty::new()), } } @@ -838,10 +829,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::WaitProcessRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "wait_process", req); - is_allowed!(req); + is_allowed(&req)?; self.do_wait_process(req) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn update_container( @@ -850,7 +841,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = 
req.container_id.clone(); let res = req.resources; @@ -858,7 +849,7 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = s.lock().await; let ctr = sandbox.get_container(&cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) @@ -870,7 +861,7 @@ impl agent_ttrpc::AgentService for AgentService { let oci_res = rustjail::resources_grpc_to_oci(res); match ctr.set(oci_res) { Err(e) => { - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } Ok(_) => return Ok(resp), @@ -886,20 +877,20 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::StatsContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "stats_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id; let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; let ctr = sandbox.get_container(&cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) })?; ctr.stats() - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn pause_container( @@ -908,20 +899,20 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::PauseContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "pause_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id(); let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; let ctr = sandbox.get_container(cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) })?; ctr.pause() - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -932,20 +923,20 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ResumeContainerRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "resume_container", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id(); let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; let ctr = sandbox.get_container(cid).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "invalid container id".to_string(), ) })?; ctr.resume() - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -956,16 +947,16 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::RemoveStaleVirtiofsShareMountsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "remove_stale_virtiofs_share_mounts", req); - is_allowed!(req); + is_allowed(&req)?; let mount_infos = parse_mount_table("/proc/self/mountinfo") - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; for m in &mount_infos { if m.mount_point.starts_with(KATA_GUEST_SHARE_DIR) { // stat the mount point, virtiofs daemon will remove the stale cache and release the fds if the mount point doesn't exist any more. 
// More details in https://github.com/kata-containers/kata-containers/issues/6455#issuecomment-1477137277 match stat::stat(Path::new(&m.mount_point)) { - Ok(_) => info!(sl!(), "stat {} success", m.mount_point), - Err(e) => info!(sl!(), "stat {} failed: {}", m.mount_point, e), + Ok(_) => info!(sl(), "stat {} success", m.mount_point), + Err(e) => info!(sl(), "stat {} failed: {}", m.mount_point, e), } } } @@ -978,10 +969,10 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::WriteStreamRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; self.do_write_stream(req) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn read_stdout( @@ -989,10 +980,10 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::ReadStreamRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; self.do_read_stream(req, true) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn read_stderr( @@ -1000,10 +991,10 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::ReadStreamRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; self.do_read_stream(req, false) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e)) + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e)) } async fn close_stdin( @@ -1012,7 +1003,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CloseStdinRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "close_stdin", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id.clone(); let eid = req.exec_id; @@ -1022,7 +1013,7 @@ impl agent_ttrpc::AgentService for AgentService { let p = sandbox .find_container_process(cid.as_str(), eid.as_str()) .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, format!("invalid argument: {:?}", e), ) @@ -1039,7 +1030,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::TtyWinResizeRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "tty_win_resize", req); - is_allowed!(req); + is_allowed(&req)?; let cid = req.container_id.clone(); let eid = req.exec_id.clone(); @@ -1048,7 +1039,7 @@ impl agent_ttrpc::AgentService for AgentService { let p = sandbox .find_container_process(cid.as_str(), eid.as_str()) .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::UNAVAILABLE, format!("invalid argument: {:?}", e), ) @@ -1065,11 +1056,11 @@ impl agent_ttrpc::AgentService for AgentService { let err = libc::ioctl(fd, TIOCSWINSZ, &win); Errno::result(err).map(drop).map_err(|e| { - ttrpc_error!(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e)) + ttrpc_error(ttrpc::Code::INTERNAL, format!("ioctl error: {:?}", e)) })?; } } else { - return Err(ttrpc_error!(ttrpc::Code::UNAVAILABLE, "no tty".to_string())); + return Err(ttrpc_error(ttrpc::Code::UNAVAILABLE, "no tty".to_string())); } Ok(Empty::new()) @@ -1081,10 +1072,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateInterfaceRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_interface", req); - is_allowed!(req); + is_allowed(&req)?; let interface = req.interface.into_option().ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "empty update interface request".to_string(), ) @@ -1097,7 +1088,7 @@ impl agent_ttrpc::AgentService for AgentService { 
.update_interface(&interface) .await .map_err(|e| { - ttrpc_error!(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e)) + ttrpc_error(ttrpc::Code::INTERNAL, format!("update interface: {:?}", e)) })?; Ok(interface) @@ -1109,10 +1100,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateRoutesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_routes", req); - is_allowed!(req); + is_allowed(&req)?; let new_routes = req.routes.into_option().map(|r| r.Routes).ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "empty update routes request".to_string(), ) @@ -1121,14 +1112,14 @@ impl agent_ttrpc::AgentService for AgentService { let mut sandbox = self.sandbox.lock().await; sandbox.rtnl.update_routes(new_routes).await.map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to update routes: {:?}", e), ) })?; let list = sandbox.rtnl.list_routes().await.map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to list routes after update: {:?}", e), ) @@ -1146,11 +1137,11 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::UpdateEphemeralMountsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "update_mounts", req); - is_allowed!(req); + is_allowed(&req)?; - match update_ephemeral_mounts(sl!(), req.storages.to_vec(), self.sandbox.clone()).await { + match update_ephemeral_mounts(sl(), req.storages.to_vec(), self.sandbox.clone()).await { Ok(_) => Ok(Empty::new()), - Err(e) => Err(ttrpc_error!( + Err(e) => Err(ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to update mounts: {:?}", e), )), @@ -1163,9 +1154,9 @@ impl agent_ttrpc::AgentService for AgentService { req: GetIPTablesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_iptables", req); - is_allowed!(req); + is_allowed(&req)?; - info!(sl!(), "get_ip_tables: request received"); + info!(sl(), "get_ip_tables: request received"); // the binary could exists in either /usr/sbin or /sbin // here check both of the places and return the one exists @@ -1190,8 +1181,8 @@ impl agent_ttrpc::AgentService for AgentService { ..Default::default() }), Err(e) => { - warn!(sl!(), "failed to run {}: {:?}", cmd, e.kind()); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + warn!(sl(), "failed to run {}: {:?}", cmd, e.kind()); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } } } @@ -1202,9 +1193,9 @@ impl agent_ttrpc::AgentService for AgentService { req: SetIPTablesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "set_iptables", req); - is_allowed!(req); + is_allowed(&req)?; - info!(sl!(), "set_ip_tables request received"); + info!(sl(), "set_ip_tables request received"); // the binary could exists in both /usr/sbin and /sbin // here check both of the places and return the one exists @@ -1233,8 +1224,8 @@ impl agent_ttrpc::AgentService for AgentService { { Ok(child) => child, Err(e) => { - warn!(sl!(), "failure to spawn {}: {:?}", cmd, e.kind()); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + warn!(sl(), "failure to spawn {}: {:?}", cmd, e.kind()); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; @@ -1242,9 +1233,9 @@ impl agent_ttrpc::AgentService for AgentService { Some(si) => si, None => { println!("failed to get stdin from child"); - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, - "failed to take stdin from child".to_string() + "failed to take stdin from child".to_string(), )); } }; @@ -1254,12 +1245,12 @@ impl agent_ttrpc::AgentService for 
AgentService { let _ = match stdin.write_all(&req.data) { Ok(o) => o, Err(e) => { - warn!(sl!(), "error writing stdin: {:?}", e.kind()); + warn!(sl(), "error writing stdin: {:?}", e.kind()); return; } }; if tx.send(1).is_err() { - warn!(sl!(), "stdin writer thread receiver dropped"); + warn!(sl(), "stdin writer thread receiver dropped"); }; }); @@ -1267,16 +1258,16 @@ impl agent_ttrpc::AgentService for AgentService { .await .is_err() { - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, - "timeout waiting for stdin writer to complete".to_string() + "timeout waiting for stdin writer to complete".to_string(), )); } if handle.await.is_err() { - return Err(ttrpc_error!( + return Err(ttrpc_error( ttrpc::Code::INTERNAL, - "stdin writer thread failure".to_string() + "stdin writer thread failure".to_string(), )); } @@ -1284,24 +1275,24 @@ impl agent_ttrpc::AgentService for AgentService { Ok(o) => o, Err(e) => { warn!( - sl!(), + sl(), "failure waiting for spawned {} to complete: {:?}", cmd, e.kind() ); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } }; if !output.status.success() { - warn!(sl!(), "{} failed: {:?}", cmd, output.stderr); - return Err(ttrpc_error!( + warn!(sl(), "{} failed: {:?}", cmd, output.stderr); + return Err(ttrpc_error( ttrpc::Code::INTERNAL, format!( "{} failed: {:?}", cmd, String::from_utf8_lossy(&output.stderr) - ) + ), )); } @@ -1317,7 +1308,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ListInterfacesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "list_interfaces", req); - is_allowed!(req); + is_allowed(&req)?; let list = self .sandbox @@ -1327,7 +1318,7 @@ impl agent_ttrpc::AgentService for AgentService { .list_interfaces() .await .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to list interfaces: {:?}", e), ) @@ -1345,7 +1336,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ListRoutesRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "list_routes", req); - is_allowed!(req); + is_allowed(&req)?; let list = self .sandbox @@ -1354,7 +1345,7 @@ impl agent_ttrpc::AgentService for AgentService { .rtnl .list_routes() .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, format!("list routes: {:?}", e)))?; Ok(protocols::agent::Routes { Routes: list, @@ -1368,7 +1359,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CreateSandboxRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "create_sandbox", req); - is_allowed!(req); + is_allowed(&req)?; { let sandbox = self.sandbox.clone(); @@ -1383,7 +1374,7 @@ impl agent_ttrpc::AgentService for AgentService { if !req.guest_hook_path.is_empty() { let _ = s.add_hooks(&req.guest_hook_path).map_err(|e| { error!( - sl!(), + sl(), "add guest hook {} failed: {:?}", req.guest_hook_path, e ); }); @@ -1394,24 +1385,24 @@ impl agent_ttrpc::AgentService for AgentService { } for m in req.kernel_modules.iter() { - load_kernel_module(m).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + load_kernel_module(m).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; } s.setup_shared_namespaces() .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; } - match add_storages(sl!(), req.storages.to_vec(), self.sandbox.clone(), None).await { + match add_storages(sl(), req.storages.to_vec(), 
self.sandbox.clone(), None).await { Ok(m) => { let sandbox = self.sandbox.clone(); let mut s = sandbox.lock().await; s.mounts = m } - Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), }; - match setup_guest_dns(sl!(), req.dns.to_vec()) { + match setup_guest_dns(sl(), req.dns.to_vec()) { Ok(_) => { let sandbox = self.sandbox.clone(); let mut s = sandbox.lock().await; @@ -1421,7 +1412,7 @@ impl agent_ttrpc::AgentService for AgentService { .iter() .map(|dns| s.network.set_dns(dns.to_string())); } - Err(e) => return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), }; Ok(Empty::new()) @@ -1433,7 +1424,7 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::DestroySandboxRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "destroy_sandbox", req); - is_allowed!(req); + is_allowed(&req)?; let s = Arc::clone(&self.sandbox); let mut sandbox = s.lock().await; @@ -1442,7 +1433,7 @@ impl agent_ttrpc::AgentService for AgentService { sandbox .destroy() .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; // Close get_oom_event connection, // otherwise it will block the shutdown of ttrpc. sandbox.event_tx.take(); @@ -1451,13 +1442,13 @@ impl agent_ttrpc::AgentService for AgentService { .sender .take() .ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, "failed to get sandbox sender channel".to_string(), ) })? .send(1) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1468,14 +1459,14 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::AddARPNeighborsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "add_arp_neighbors", req); - is_allowed!(req); + is_allowed(&req)?; let neighs = req .neighbors .into_option() .map(|n| n.ARPNeighbors) .ok_or_else(|| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INVALID_ARGUMENT, "empty add arp neighbours request".to_string(), ) @@ -1488,7 +1479,7 @@ impl agent_ttrpc::AgentService for AgentService { .add_arp_neighbors(neighs) .await .map_err(|e| { - ttrpc_error!( + ttrpc_error( ttrpc::Code::INTERNAL, format!("Failed to add ARP neighbours: {:?}", e), ) @@ -1502,14 +1493,14 @@ impl agent_ttrpc::AgentService for AgentService { ctx: &TtrpcContext, req: protocols::agent::OnlineCPUMemRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; let s = Arc::clone(&self.sandbox); let sandbox = s.lock().await; trace_rpc_call!(ctx, "online_cpu_mem", req); sandbox .online_cpu_memory(&req) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1520,10 +1511,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::ReseedRandomDevRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "reseed_random_dev", req); - is_allowed!(req); + is_allowed(&req)?; random::reseed_rng(req.data.as_slice()) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1534,9 +1525,9 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::GuestDetailsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_guest_details", req); - is_allowed!(req); + is_allowed(&req)?; - info!(sl!(), "get guest details!"); + info!(sl(), "get guest details!"); let mut resp = 
GuestDetailsResponse::new(); // to get memory block size match get_memory_info( @@ -1550,8 +1541,8 @@ impl agent_ttrpc::AgentService for AgentService { resp.support_mem_hotplug_probe = v; } Err(e) => { - info!(sl!(), "fail to get memory info!"); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + info!(sl(), "fail to get memory info!"); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, e)); } } @@ -1568,10 +1559,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::MemHotplugByProbeRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "mem_hotplug_by_probe", req); - is_allowed!(req); + is_allowed(&req)?; do_mem_hotplug_by_probe(&req.memHotplugProbeAddr) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1582,10 +1573,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::SetGuestDateTimeRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "set_guest_date_time", req); - is_allowed!(req); + is_allowed(&req)?; do_set_guest_date_time(req.Sec, req.Usec) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1596,9 +1587,9 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::CopyFileRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "copy_file", req); - is_allowed!(req); + is_allowed(&req)?; - do_copy_file(&req).map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + do_copy_file(&req).map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1609,10 +1600,10 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::GetMetricsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_metrics", req); - is_allowed!(req); + is_allowed(&req)?; match get_metrics(&req) { - Err(e) => Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)), + Err(e) => Err(ttrpc_error(ttrpc::Code::INTERNAL, e)), Ok(s) => { let mut metrics = Metrics::new(); metrics.set_metrics(s); @@ -1626,7 +1617,7 @@ impl agent_ttrpc::AgentService for AgentService { _ctx: &TtrpcContext, req: protocols::agent::GetOOMEventRequest, ) -> ttrpc::Result { - is_allowed!(req); + is_allowed(&req)?; let sandbox = self.sandbox.clone(); let s = sandbox.lock().await; let event_rx = &s.event_rx.clone(); @@ -1635,7 +1626,7 @@ impl agent_ttrpc::AgentService for AgentService { drop(sandbox); if let Some(container_id) = event_rx.recv().await { - info!(sl!(), "get_oom_event return {}", &container_id); + info!(sl(), "get_oom_event return {}", &container_id); let mut resp = OOMEvent::new(); resp.container_id = container_id; @@ -1643,7 +1634,7 @@ impl agent_ttrpc::AgentService for AgentService { return Ok(resp); } - Err(ttrpc_error!(ttrpc::Code::INTERNAL, "")) + Err(ttrpc_error(ttrpc::Code::INTERNAL, "")) } async fn get_volume_stats( @@ -1652,9 +1643,9 @@ impl agent_ttrpc::AgentService for AgentService { req: VolumeStatsRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "get_volume_stats", req); - is_allowed!(req); + is_allowed(&req)?; - info!(sl!(), "get volume stats!"); + info!(sl(), "get volume stats!"); let mut resp = VolumeStatsResponse::new(); let mut condition = VolumeCondition::new(); @@ -1665,8 +1656,8 @@ impl agent_ttrpc::AgentService for AgentService { condition.message = String::from("OK"); } Err(e) => { - info!(sl!(), "failed to open the volume"); - return Err(ttrpc_error!(ttrpc::Code::INTERNAL, e)); + info!(sl(), "failed to open the volume"); + return Err(ttrpc_error(ttrpc::Code::INTERNAL, 
e)); } }; @@ -1675,12 +1666,12 @@ impl agent_ttrpc::AgentService for AgentService { // to get volume capacity stats get_volume_capacity_stats(&req.volume_guest_path) .map(|u| usage_vec.push(u)) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; // to get volume inode stats get_volume_inode_stats(&req.volume_guest_path) .map(|u| usage_vec.push(u)) - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; resp.usage = usage_vec; resp.volume_condition = MessageField::some(condition); @@ -1693,11 +1684,11 @@ impl agent_ttrpc::AgentService for AgentService { req: protocols::agent::AddSwapRequest, ) -> ttrpc::Result { trace_rpc_call!(ctx, "add_swap", req); - is_allowed!(req); + is_allowed(&req)?; do_add_swap(&self.sandbox, &req) .await - .map_err(|e| ttrpc_error!(ttrpc::Code::INTERNAL, e))?; + .map_err(|e| ttrpc_error(ttrpc::Code::INTERNAL, e))?; Ok(Empty::new()) } @@ -1724,7 +1715,7 @@ impl health_ttrpc::Health for HealthService { _ctx: &TtrpcContext, req: protocols::health::CheckRequest, ) -> ttrpc::Result { - info!(sl!(), "version {:?}", req); + info!(sl(), "version {:?}", req); let mut rep = protocols::health::VersionCheckResponse::new(); rep.agent_version = AGENT_VERSION.to_string(); rep.grpc_version = API_VERSION.to_string(); @@ -1745,17 +1736,17 @@ fn get_memory_info( match fs::read_to_string(block_size_path) { Ok(v) => { if v.is_empty() { - warn!(sl!(), "file {} is empty", block_size_path); + warn!(sl(), "file {} is empty", block_size_path); return Err(anyhow!(ERR_INVALID_BLOCK_SIZE)); } size = u64::from_str_radix(v.trim(), 16).map_err(|_| { - warn!(sl!(), "failed to parse the str {} to hex", size); + warn!(sl(), "failed to parse the str {} to hex", size); anyhow!(ERR_INVALID_BLOCK_SIZE) })?; } Err(e) => { - warn!(sl!(), "memory block size error: {:?}", e.kind()); + warn!(sl(), "memory block size error: {:?}", e.kind()); if e.kind() != std::io::ErrorKind::NotFound { return Err(anyhow!(e)); } @@ -1767,7 +1758,7 @@ fn get_memory_info( match stat::stat(hotplug_probe_path) { Ok(_) => plug = true, Err(e) => { - warn!(sl!(), "hotplug memory error: {:?}", e); + warn!(sl(), "hotplug memory error: {:?}", e); match e { nix::Error::ENOENT => plug = false, _ => return Err(anyhow!(e)), @@ -1873,7 +1864,7 @@ pub async fn start( .register_service(hservice) .register_service(iservice); - info!(sl!(), "ttRPC server started"; "address" => server_address); + info!(sl(), "ttRPC server started"; "address" => server_address); Ok(server) } @@ -1948,7 +1939,7 @@ fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result<()> { for m in cmounts.iter() { if let Err(err) = sandbox.unset_and_remove_sandbox_storage(m) { error!( - sl!(), + sl(), "failed to unset_and_remove_sandbox_storage for container {}, error: {:?}", cid, err @@ -1984,7 +1975,7 @@ fn is_signal_handled(proc_status_file: &str, signum: u32) -> bool { return fs::metadata(proc_status_file).is_ok(); } else if signum > 64 { // Ensure invalid signum won't break bit shift logic - warn!(sl!(), "received invalid signum {}", signum); + warn!(sl(), "received invalid signum {}", signum); return false; } else { (signum - 1).into() @@ -1994,7 +1985,7 @@ fn is_signal_handled(proc_status_file: &str, signum: u32) -> bool { let file = match File::open(proc_status_file) { Ok(f) => f, Err(_) => { - warn!(sl!(), "failed to open file {}", proc_status_file); + warn!(sl(), "failed to open file {}", proc_status_file); return false; } }; @@ 
-2174,7 +2165,7 @@ pub fn setup_bundle(cid: &str, spec: &mut Spec) -> Result { let rootfs_exists = Path::new(&rootfs_path).exists(); info!( - sl!(), + &sl(), "The rootfs_path is {:?} and exists: {}", rootfs_path, rootfs_exists ); @@ -2186,7 +2177,7 @@ pub fn setup_bundle(cid: &str, spec: &mut Spec) -> Result { "bind", MsFlags::MS_BIND, "", - &sl!(), + &sl(), )?; } @@ -2222,7 +2213,7 @@ fn load_kernel_module(module: &protocols::agent::KernelModule) -> Result<()> { } info!( - sl!(), + sl(), "load_kernel_module {}: {:?}", module.name, module.parameters ); @@ -3008,7 +2999,7 @@ OtherField:other for cmd in iptables_cmd_list { if !check_command(cmd) { warn!( - sl!(), + sl(), "one or more commands for ip tables test are missing, skip it" ); return; diff --git a/src/agent/src/tracer.rs b/src/agent/src/tracer.rs index bad4a6f50..1199b601c 100644 --- a/src/agent/src/tracer.rs +++ b/src/agent/src/tracer.rs @@ -69,7 +69,7 @@ macro_rules! trace_rpc_call { propagator.extract(&extract_carrier_from_ttrpc($ctx)) }); - info!(sl!(), "rpc call from shim to agent: {:?}", $name); + info!(sl(), "rpc call from shim to agent: {:?}", $name); // generate tracing span let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req); diff --git a/src/agent/src/uevent.rs b/src/agent/src/uevent.rs index 5d1f55494..53b7c103d 100644 --- a/src/agent/src/uevent.rs +++ b/src/agent/src/uevent.rs @@ -19,11 +19,9 @@ use tokio::sync::watch::Receiver; use tokio::sync::Mutex; use tracing::instrument; -// Convenience macro to obtain the scope logger -macro_rules! sl { - () => { - slog_scope::logger().new(o!("subsystem" => "uevent")) - }; +// Convenience function to obtain the scope logger. +fn sl() -> slog::Logger { + slog_scope::logger().new(o!("subsystem" => "uevent")) } #[derive(Debug, Default, Clone, PartialEq, Eq)] @@ -120,11 +118,11 @@ pub async fn wait_for_uevent( ) -> Result { let logprefix = format!("Waiting for {:?}", &matcher); - info!(sl!(), "{}", logprefix); + info!(sl(), "{}", logprefix); let mut sb = sandbox.lock().await; for uev in sb.uevent_map.values() { if matcher.is_match(uev) { - info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev); + info!(sl(), "{}: found {:?} in uevent map", logprefix, &uev); return Ok(uev.clone()); } } @@ -139,9 +137,9 @@ pub async fn wait_for_uevent( sb.uevent_watchers.push(Some((Box::new(matcher), tx))); drop(sb); // unlock - info!(sl!(), "{}: waiting on channel", logprefix); + info!(sl(), "{}: waiting on channel", logprefix); - let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout; + let hotplug_timeout = AGENT_CONFIG.hotplug_timeout; let uev = match tokio::time::timeout(hotplug_timeout, rx).await { Ok(v) => v?, @@ -157,7 +155,7 @@ pub async fn wait_for_uevent( } }; - info!(sl!(), "{}: found {:?} on channel", logprefix, &uev); + info!(sl(), "{}: found {:?} on channel", logprefix, &uev); Ok(uev) } diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs index c7c9cb082..91f9406b8 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs @@ -341,7 +341,10 @@ impl DragonballInner { // cannot exceed maximum value if new_vcpus > self.config.cpu_info.default_maxvcpus { - return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus")); + warn!( + sl!(), + "Cannot allocate more vcpus than the max allowed number of vcpus. 
The maximum allowed number of vcpus will be used instead."); + return Ok((current_vcpus, self.config.cpu_info.default_maxvcpus)); } Ok((current_vcpus, new_vcpus)) diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 53eccc52b..2a8b6e600 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -105,7 +105,12 @@ impl InitialSizeManager { hv.cpu_info.default_vcpus = self.resource.vcpu as i32 } if self.resource.mem_mb > 0 { - hv.memory_info.default_memory = self.resource.mem_mb; + // Since the memory overhead introduced by the kata-agent and system + // components eats into the amount of memory the user can actually use, + // we add to default_memory here instead of overriding it. + // (If we overrode default_memory and user applications still used as + // much memory as they originally expected, they could easily OOM.) + hv.memory_info.default_memory += self.resource.mem_mb; } Ok(()) } diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index f17480aba..f74cb9f89 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -17,7 +17,7 @@ import ( "github.com/prometheus/procfs" "github.com/urfave/cli" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" @@ -113,8 +113,8 @@ type HypervisorInfo struct { SocketPath string Msize9p uint32 MemorySlots uint32 - PCIeRootPort uint32 - ColdPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort HotplugVFIOOnRootBus bool Debug bool } @@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { EntropySource: config.HypervisorConfig.EntropySource, SharedFS: config.HypervisorConfig.SharedFS, VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - PCIeRootPort: config.HypervisorConfig.PCIeRootPort, SocketPath: socketPath, }, nil } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index 3760104d0..c8d4d0ea9 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -19,12 +19,12 @@ import ( "testing" "github.com/BurntSushi/toml" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/urfave/cli" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" @@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error { return nil } -func makeRuntimeConfig(prefixDir
string) (configFile string, config oci.RuntimeConfig, err error) { - var coldPlugVFIO hv.PCIePort +func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) { + var hotPlugVFIO config.PCIePort + var coldPlugVFIO config.PCIePort const logPath = "/log/path" hypervisorPath := filepath.Join(prefixDir, "hypervisor") kernelPath := filepath.Join(prefixDir, "kernel") @@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC blockStorageDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) - coldPlugVFIO = hv.NoPort + hotPlugVFIO = config.BridgePort + coldPlugVFIO = config.NoPort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd") @@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, - PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: hypConfig.NumVCPUs, DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs, @@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC return "", oci.RuntimeConfig{}, err } - _, config, err = katautils.LoadConfiguration(configFile, true) + _, ociConfig, err = katautils.LoadConfiguration(configFile, true) if err != nil { return "", oci.RuntimeConfig{}, err } - return configFile, config, nil + return configFile, ociConfig, nil } func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) { @@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - PCIeRootPort: config.HypervisorConfig.PCIeRootPort, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, } diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index e7c115f92..1cc8b54dc 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -131,6 +131,11 @@ default_maxmemory = @DEFMAXMEMSZ@ # Shared file system type: # - virtio-fs (default) # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@" # Path to vhost-user-fs daemon. 
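To make the `initial_size.rs` change above concrete: with a hypothetical `default_memory` of 2048 MiB from the TOML and a pod whose containers request 1024 MiB, the guest now boots with 3072 MiB instead of being clamped down to 1024 MiB, so the agent and system overhead no longer eats into the workload's budget:

```rust
fn main() {
    // Hypothetical numbers; the real values come from the hypervisor
    // config and the pod's resource limits.
    let default_memory_mib: u32 = 2048; // configured default, covers guest overhead
    let workload_mib: u32 = 1024; // sum of the containers' memory limits

    // Before the patch: default_memory = workload_mib (override), which
    // left workloads racing the agent for memory and prone to OOM.
    // After the patch: default_memory += workload_mib.
    let guest_mib = default_memory_mib + workload_mib;
    assert_eq!(guest_mib, 3072);
    println!("guest boots with {} MiB", guest_mib);
}
```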
diff --git a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in index 33574b17d..4861cb1ed 100644 --- a/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in +++ b/src/runtime/config/configuration-qemu-nvidia-gpu.toml.in @@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@" # Path to vhost-user-fs daemon. diff --git a/src/runtime/config/configuration-qemu-sev.toml.in b/src/runtime/config/configuration-qemu-sev.toml.in index 7571cc6d0..db373e6cc 100644 --- a/src/runtime/config/configuration-qemu-sev.toml.in +++ b/src/runtime/config/configuration-qemu-sev.toml.in @@ -186,6 +186,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_SEV_VIRTIOFS@" # Path to vhost-user-fs daemon. @@ -669,4 +674,4 @@ service_offload = @DEFSERVICEOFFLOAD@ # # Keys can be remotely provisioned. The Kata agent fetches them from e.g. # a HTTPS URL: -#provision=https://my-key-broker.foo/tenant/ \ No newline at end of file +#provision=https://my-key-broker.foo/tenant/ diff --git a/src/runtime/config/configuration-qemu-snp.toml.in b/src/runtime/config/configuration-qemu-snp.toml.in index d4fd77a88..f7aa09678 100644 --- a/src/runtime/config/configuration-qemu-snp.toml.in +++ b/src/runtime/config/configuration-qemu-snp.toml.in @@ -184,6 +184,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_SNP_VIRTIOFS@" # Path to vhost-user-fs daemon. 
diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in index 384bea041..e679677c8 100644 --- a/src/runtime/config/configuration-qemu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-tdx.toml.in @@ -172,6 +172,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@" # Path to vhost-user-fs daemon. diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index bc0dcb736..8d34386a3 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ b/src/runtime/config/configuration-qemu.toml.in @@ -206,6 +206,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@ # - virtio-fs (default) # - virtio-9p # - virtio-fs-nydus +# - none +# WARNING: "none" should be carefully used, and only used in very few specific cases, as +# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing +# issues with rotation of secrets, certs, or configurations via kubernetes objects like +# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*. shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@" # Path to vhost-user-fs daemon. @@ -380,8 +385,15 @@ pflashes = [] # Default false #hotplug_vfio_on_root_bus = true +# Enable hot-plugging of VFIO devices to a bridge-port, +# root-port or switch-port. +# The default setting is "no-port" +#hot_plug_vfio = "root-port" + # In a confidential compute environment hot-plugging can compromise -# security. Enable cold-plugging of VFIO devices to a root-port. +# security. +# Enable cold-plugging of VFIO devices to a bridge-port, +# root-port or switch-port. # The default setting is "no-port", which means disabled. 
#cold_plug_vfio = "root-port" diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index e3e8e9369..c24e3ced3 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -20,7 +20,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations" @@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) { assert.Error(err) } -func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) { - var coldPlugVFIO hv.PCIePort +func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) { + var hotPlugVFIO config.PCIePort + var coldPlugVFIO config.PCIePort if dir == "" { return "", fmt.Errorf("BUG: need directory") } @@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err blockDeviceDriver := "virtio-scsi" enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") - coldPlugVFIO = hv.RootPort + hotPlugVFIO = config.BridgePort + coldPlugVFIO = config.RootPort configFileOptions := ktu.RuntimeConfigOptions{ Hypervisor: "qemu", @@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err BlockDeviceDriver: blockDeviceDriver, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, DisableNewNetNs: disableNewNetNs, SharedFS: sharedFS, VirtioFSDaemon: virtioFSdaemon, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, } diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index dee6291ed..ef2a5c4b0 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -81,6 +81,17 @@ const ( // VirtioFSNydus means use nydus for the shared file system VirtioFSNydus = "virtio-fs-nydus" + + // NoSharedFS means *no* shared file system solution will be used + // and files will be copied into the guest system. + // + // WARNING: This should be carefully used, and only used in very few + // specific cases, as any update to the mount will *NOT* be reflected + // during the lifecycle of the pod, causing issues with rotation of + // secrets, certs, or configurations via kubernetes objects like + // configMaps or secrets, as those will be copied into the guest at + // *pod* *creation* *time*. 
+	NoSharedFS = "none"
 )

 const (
@@ -114,14 +125,117 @@ const (
 // SysDevPrefix is static string of /sys/dev
 var SysDevPrefix = "/sys/dev"

-// SysIOMMUPath is static string of /sys/kernel/iommu_groups
-var SysIOMMUPath = "/sys/kernel/iommu_groups"
+// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
+var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"

 // SysBusPciDevicesPath is static string of /sys/bus/pci/devices
 var SysBusPciDevicesPath = "/sys/bus/pci/devices"

 var getSysDevPath = getSysDevPathImpl

+// PCIePortBusPrefix gives us the correct bus naming depending on the port
+// used to hot(cold)-plug the device
+type PCIePortBusPrefix string
+
+const (
+	PCIeRootPortPrefix              PCIePortBusPrefix = "rp"
+	PCIeSwitchPortPrefix            PCIePortBusPrefix = "sw"
+	PCIeSwitchUpstreamPortPrefix    PCIePortBusPrefix = "swup"
+	PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
+	PCIBridgePortPrefix             PCIePortBusPrefix = "bp"
+)
+
+func (p PCIePortBusPrefix) String() string {
+	switch p {
+	case PCIeRootPortPrefix:
+		fallthrough
+	case PCIeSwitchPortPrefix:
+		fallthrough
+	case PCIeSwitchUpstreamPortPrefix:
+		fallthrough
+	case PCIeSwitchhDownstreamPortPrefix:
+		fallthrough
+	case PCIBridgePortPrefix:
+		return string(p)
+	}
+	return fmt.Sprintf("<unknown PCIePortBusPrefix: %s>", string(p))
+}
+
+// PCIePort distinguishes only between root and switch ports
+type PCIePort string
+
+const (
+	// RootPort attach VFIO devices to a root-port
+	RootPort PCIePort = "root-port"
+	// SwitchPort attach VFIO devices to a switch-port
+	SwitchPort = "switch-port"
+	// BridgePort is the default
+	BridgePort = "bridge-port"
+	// NoPort is for disabling VFIO hotplug/coldplug
+	NoPort = "no-port"
+	// InvalidPort is for invalid port
+	InvalidPort = "invalid-port"
+)
+
+func (p PCIePort) String() string {
+	switch p {
+	case RootPort:
+		fallthrough
+	case SwitchPort:
+		fallthrough
+	case BridgePort:
+		fallthrough
+	case NoPort:
+		fallthrough
+	case InvalidPort:
+		return string(p)
+	}
+	return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
+}
+
+var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
+	RootPort:   PCIeRootPortPrefix,
+	SwitchPort: PCIeSwitchhDownstreamPortPrefix,
+	BridgePort: PCIBridgePortPrefix,
+}
+
+func (p PCIePort) Invalid() bool {
+	switch p {
+	case RootPort:
+		fallthrough
+	case SwitchPort:
+		fallthrough
+	case BridgePort:
+		fallthrough
+	case NoPort:
+		return false
+	}
+	return true
+}
+
+func (p PCIePort) Valid() bool {
+	switch p {
+	case RootPort:
+		fallthrough
+	case SwitchPort:
+		fallthrough
+	case BridgePort:
+		fallthrough
+	case NoPort:
+		return true
+	}
+	return false
+}
+
+type PCIePortMapping map[string]bool
+
+var (
+	// Each of these structures keeps track of the devices attached to the
+	// different types of PCIe ports. We can deduce the bus number from it
+	// and avoid assigning duplicates.
+	PCIeDevices = map[PCIePort]PCIePortMapping{}
+)
+
 // DeviceInfo is an embedded type that contains device data common to all types of devices.
 type DeviceInfo struct {
 	// DriverOptions is specific options for each device driver
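As a side note, here is a minimal sketch (not part of the patch) of how PCIePortPrefixMapping and the PCIeDevices bookkeeping combine to derive a unique guest bus name, seen from a consuming package such as drivers; deviceBDF is a placeholder string, and the maps are assumed to be initialized the way NewDeviceManager does further down:

	port := config.RootPort
	busIndex := len(config.PCIeDevices[port]) // devices already tracked on this port type
	bus := fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[port], busIndex)
	config.PCIeDevices[port][deviceBDF] = true // record the BDF so no duplicate bus is handed out
	fmt.Println(bus)                           // e.g. "rp0" for the first root-port device

This is exactly the pattern the reworked VFIODevice.Attach uses later in this patch.

@@ -167,6 +281,9 @@ type DeviceInfo struct {
 	// ColdPlug specifies whether the device must be cold plugged (true)
 	// or hot plugged (false)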
 	ColdPlug bool
+
+	// Specifies the PCIe port type to which the device is attached
+	Port PCIePort
 }

 // BlockDrive represents a block storage drive which may be used in case the storage
@@ -268,14 +385,8 @@ const (
 	VFIOAPDeviceMediatedType
 )

-type VFIODev interface {
-	GetID() *string
-	GetType() VFIODeviceType
-	GetSysfsDev() *string
-}
-
-// VFIOPCIDev represents a VFIO PCI device used for hotplugging
-type VFIOPCIDev struct {
+// VFIODev represents a VFIO PCI device used for hotplugging
+type VFIODev struct {
 	// ID is used to identify this drive in the hypervisor options.
 	ID string
@@ -305,44 +416,15 @@ type VFIOPCIDev struct {
 	// IsPCIe specifies device is PCIe or PCI
 	IsPCIe bool
-}
-
-func (d VFIOPCIDev) GetID() *string {
-	return &d.ID
-}
-
-func (d VFIOPCIDev) GetType() VFIODeviceType {
-	return d.Type
-}
-
-func (d VFIOPCIDev) GetSysfsDev() *string {
-	return &d.SysfsDev
-}
-
-type VFIOAPDev struct {
-	// ID is used to identify this drive in the hypervisor options.
-	ID string
-
-	// sysfsdev of VFIO mediated device
-	SysfsDev string

 	// APDevices are the Adjunct Processor devices assigned to the mdev
 	APDevices []string

-	// Type of VFIO device
-	Type VFIODeviceType
-}
+	// Rank identifies a device in an IOMMU group
+	Rank int

-func (d VFIOAPDev) GetID() *string {
-	return &d.ID
-}
-
-func (d VFIOAPDev) GetType() VFIODeviceType {
-	return d.Type
-}
-
-func (d VFIOAPDev) GetSysfsDev() *string {
-	return &d.SysfsDev
+	// Port is the PCIe port type to which the device is attached
+	Port PCIePort
 }

 // RNGDev represents a random number generator device
diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go
index bfffa31a2..8c9055ae2 100644
--- a/src/runtime/pkg/device/drivers/utils.go
+++ b/src/runtime/pkg/device/drivers/utils.go
@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
 	return api.DeviceLogger()
 }

-// Identify PCIe device by reading the size of the PCI config space
+// IsPCIeDevice identifies a PCIe device by reading the size of the PCI config space
 // Plain PCI devices have 256 bytes of config space whereas PCIe devices have 4K
-func isPCIeDevice(bdf string) bool {
+func IsPCIeDevice(bdf string) bool {
 	if len(strings.Split(bdf, ":")) == 2 {
 		bdf = PCIDomain + ":" + bdf
 	}
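A hedged sketch of the unified struct in use (all field values here are illustrative, including the mediated-device path):

	pci := config.VFIODev{ID: "vfio-0", Type: config.VFIOPCIDeviceNormalType, BDF: "0000:02:10.0", IsPCIe: true, Rank: -1, Port: config.RootPort}
	ap := config.VFIODev{ID: "vfio-1", Type: config.VFIOAPDeviceMediatedType, SysfsDev: "/sys/devices/vfio_ap/matrix/<uuid>"}
	fmt.Println(pci.Type, ap.Type) // callers now switch on Type instead of type-asserting an interface

One struct now covers both the PCI and AP variants, which is what lets the reworked Load below drop the interface plumbing.

@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo

 // GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
 // We can reuse this function at various levels, sandbox, container.
-// Only the VFIO module is allowed to do bus assignments, all other modules need to
-// ignore it if used as helper function to get VFIO information.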
-func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) { +func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) { vfioDevs := []*config.VFIODev{} vfioGroup := filepath.Base(device.HostPath) - iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices") + iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices") deviceFiles, err := os.ReadDir(iommuDevicesPath) if err != nil { @@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme // Pass all devices in iommu group for i, deviceFile := range deviceFiles { //Get bdf of device eg 0000:00:1c.0 - deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath) + deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath) if err != nil { return nil, err } @@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - isPCIe := isPCIeDevice(deviceBDF) // Do not directly assign to `vfio` -- need to access field still - vfioPCI := config.VFIOPCIDev{ + vfio = config.VFIODev{ ID: id, Type: vfioDeviceType, BDF: deviceBDF, SysfsDev: deviceSysfsDev, - IsPCIe: isPCIe, + IsPCIe: IsPCIeDevice(deviceBDF), Class: pciClass, + Rank: -1, + Port: device.Port, } - if isPCIe && !ignoreBusAssignment { - vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) - AllPCIeDevs[deviceBDF] = true - } - vfio = vfioPCI + case config.VFIOAPDeviceMediatedType: devices, err := GetAPVFIODevices(deviceSysfsDev) if err != nil { return nil, err } - vfio = config.VFIOAPDev{ + vfio = config.VFIODev{ ID: id, SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 106220dcf..801b1a81f 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -28,14 +28,9 @@ const ( vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id" iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group" vfioDevPath = "/dev/vfio/%s" - pcieRootPortPrefix = "rp" vfioAPSysfsDir = "/sys/devices/vfio_ap" ) -var ( - AllPCIeDevs = map[string]bool{} -) - // VFIODevice is a vfio device meant to be passed to the hypervisor // to be used by the Virtual Machine. 
type VFIODevice struct {
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
 		}
 	}()

-	device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
+	device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
 	if err != nil {
 		return err
 	}
+	for _, vfio := range device.VfioDevs {
+		if vfio.IsPCIe {
+			busIndex := len(config.PCIeDevices[vfio.Port])
+			vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
+			config.PCIeDevices[vfio.Port][vfio.BDF] = true
+		}
+	}

 	coldPlug := device.DeviceInfo.ColdPlug
 	deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
 	for _, dev := range ds.VFIODevs {
 		var vfio config.VFIODev

-		vfioDeviceType := (*device.VfioDevs[0]).GetType()
-		switch vfioDeviceType {
+		switch dev.Type {
 		case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
-			bdf := ""
-			if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
-				bdf = pciDev.BDF
-			}
-			vfio = config.VFIOPCIDev{
-				ID: *(*dev).GetID(),
-				Type: config.VFIODeviceType((*dev).GetType()),
-				BDF: bdf,
-				SysfsDev: *(*dev).GetSysfsDev(),
+			vfio = config.VFIODev{
+				ID: dev.ID,
+				Type: config.VFIODeviceType(dev.Type),
+				BDF: dev.BDF,
+				SysfsDev: dev.SysfsDev,
 			}
 		case config.VFIOAPDeviceMediatedType:
-			vfio = config.VFIOAPDev{
-				ID: *(*dev).GetID(),
-				SysfsDev: *(*dev).GetSysfsDev(),
+			vfio = config.VFIODev{
+				ID: dev.ID,
+				SysfsDev: dev.SysfsDev,
 			}
 		default:
 			deviceLogger().WithError(
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {

 // It should implement GetAttachCount() and DeviceID() as an api.Device implementation;
 // here it shares functions from *GenericDevice so we don't need duplicated code
-func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
+func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
 	sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
 	vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
 	if err != nil {
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
 	switch vfioDeviceType {
 	case config.VFIOPCIDeviceNormalType:
 		// Get bdf of device eg. 0000:00:1c.0
-		deviceBDF = getBDF(deviceFileName)
+		// OLD IMPL: deviceBDF = getBDF(deviceFileName)
+		// The old implementation did not consider the case where
+		// vfio devices are located on different root busses. The
+		// kata-agent handles that case now, so use the full PCI address here.
+		deviceBDF = deviceFileName
 		// Get sysfs path used by cloud-hypervisor
 		deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
 	case config.VFIOPCIDeviceMediatedType:
 		// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
 		sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
 		deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
-		deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
+		deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
 	case config.VFIOAPDeviceMediatedType:
 		sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
 		deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
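A quick, hedged illustration of the two BDF formats now in play, with values taken from the updated unit test below (error handling elided; calls shown from outside the drivers package):

	bdf, _, _, _ := drivers.GetVFIODetails("0000:02:10.0", "")
	fmt.Println(bdf)                            // "0000:02:10.0": the domain-qualified address is kept for the kata-agent
	fmt.Println(drivers.GetBDF("0000:02:10.0")) // "02:10.0": the domain prefix is still stripped

@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {

 // getBDF returns the BDF of pci device
 // Expected input string format is [<domain>]:[<bus>][<slot>].[<func>] eg.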
0000:02:10.0 -func getBDF(deviceSysStr string) string { +func GetBDF(deviceSysStr string) string { tokens := strings.SplitN(deviceSysStr, ":", 2) if len(tokens) == 1 { return "" diff --git a/src/runtime/pkg/device/drivers/vfio_test.go b/src/runtime/pkg/device/drivers/vfio_test.go index 6a1ab61eb..9a03fa030 100644 --- a/src/runtime/pkg/device/drivers/vfio_test.go +++ b/src/runtime/pkg/device/drivers/vfio_test.go @@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) { } data := []testData{ - {"0000:02:10.0", "02:10.0"}, + {"0000:02:10.0", "0000:02:10.0"}, {"0000:0210.0", ""}, {"f79944e4-5a3d-11e8-99ce-", ""}, {"f79944e4-5a3d-11e8-99ce", ""}, @@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) { } for _, d := range data { - deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "") + deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "") switch vfioDeviceType { case config.VFIOPCIDeviceNormalType: diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index baf1209a7..735061d9e 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS dm.blockDriver = config.VirtioSCSI } - drivers.AllPCIeDevs = make(map[string]bool) + config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping) + + config.PCIeDevices[config.RootPort] = make(map[string]bool) + config.PCIeDevices[config.SwitchPort] = make(map[string]bool) + config.PCIeDevices[config.BridgePort] = make(map[string]bool) for _, dev := range devices { dm.devices[dev.DeviceID()] = dev @@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device } if IsVFIO(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil - } else if isVhostUserBlk(devInfo) { + } else if IsVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { devInfo.DriverOptions = make(map[string]string) } diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index 49e339f60..70c76b67d 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) { _, err = os.Create(deviceConfigFile) assert.Nil(t, err) - savedIOMMUPath := config.SysIOMMUPath - config.SysIOMMUPath = tmpDir + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir savedSysBusPciDevicesPath := config.SysBusPciDevicesPath config.SysBusPciDevicesPath = devicesDir defer func() { - config.SysIOMMUPath = savedIOMMUPath + config.SysIOMMUGroupPath = savedIOMMUPath config.SysBusPciDevicesPath = savedSysBusPciDevicesPath }() diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go index e78205d0c..a9e4ee8c6 100644 --- a/src/runtime/pkg/device/manager/utils.go +++ b/src/runtime/pkg/device/manager/utils.go @@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool { } // isVhostUserBlk checks if the device is a VhostUserBlk device. 
-func isVhostUserBlk(devInfo config.DeviceInfo) bool {
+func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
 	return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
 }
diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go
index b57992b3d..6752719dd 100644
--- a/src/runtime/pkg/device/manager/utils_test.go
+++ b/src/runtime/pkg/device/manager/utils_test.go
@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
 	}

 	for _, d := range data {
-		isVhostUserBlk := isVhostUserBlk(
+		isVhostUserBlk := IsVhostUserBlk(
 			config.DeviceInfo{
 				DevType: d.devType,
 				Major: d.major,
diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go
index 5ff258aed..73b6aba44 100644
--- a/src/runtime/pkg/govmm/qemu/qemu.go
+++ b/src/runtime/pkg/govmm/qemu/qemu.go
@@ -123,6 +123,14 @@ const (
 	// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
 	PCIeRootPort DeviceDriver = "pcie-root-port"

+	// PCIeSwitchUpstreamPort is a PCIe switch upstream port.
+	// An upstream port connects to a PCIe Root Port.
+	PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
+
+	// PCIeSwitchDownstreamPort is a PCIe switch downstream port.
+	// PCIe devices can be hot-plugged to the downstream port.
+	PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
+
 	// Loader is the Loader device driver.
 	Loader DeviceDriver = "loader"

@@ -236,6 +244,7 @@ const (
 	// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
 	SecExecGuest ObjectType = "s390-pv-guest"
+	// PEFGuest represents the ppc64le PEF (Protected Execution Facility) object.
 	PEFGuest ObjectType = "pef-guest"
 )
@@ -410,7 +419,6 @@ func (object Object) QemuParams(config *Config) []string {
 		deviceParams = append(deviceParams, string(object.Driver))
 		deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
 		deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
-
 	}

 	if len(deviceParams) > 0 {
@@ -1722,6 +1730,106 @@ func (b PCIeRootPortDevice) Valid() bool {
 	return true
 }

+// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
+type PCIeSwitchUpstreamPortDevice struct {
+	ID  string // format: sup{n}, n>=0
+	Bus string // default is rp0
+}
+
+// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
+func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
+	var qemuParams []string
+	var deviceParams []string
+
+	driver := PCIeSwitchUpstreamPort
+
+	deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
+	deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
+
+	qemuParams = append(qemuParams, "-device")
+	qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
+	return qemuParams
+}
+
+// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
+func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
+	if b.ID == "" {
+		return false
+	}
+	if b.Bus == "" {
+		return false
+	}
+	return true
+}
+
+// PCIeSwitchDownstreamPortDevice is the port connecting to the switch upstream port
+type PCIeSwitchDownstreamPortDevice struct {
+	ID      string // format: sup{n}, n>=0
+	Bus     string // default is rp0
+	Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
+	Slot    string // >=0, default is 0x00
+	// For this to work, patches to QEMU are needed
+	BusReserve string
+	// Pref64 and Pref32 are not allowed to be set simultaneously
+	Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
+	Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
+	MemReserve    string // reserve non-prefetched MMIO aperture, 32-bit *only*
+	IOReserve     string // IO reservation
+
+}
+
+// QemuParams returns the qemu parameters built out of the PCIeSwitchDownstreamPortDevice.
+func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
+	var qemuParams []string
+	var deviceParams []string
+	driver := PCIeSwitchDownstreamPort
+
+	deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
+	deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
+	deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
+	deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
+	if b.BusReserve != "" {
+		deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
+	}
+
+	if b.Pref64Reserve != "" {
+		deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
+	}
+
+	if b.Pref32Reserve != "" {
+		deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
+	}
+
+	if b.MemReserve != "" {
+		deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
+	}
+
+	if b.IOReserve != "" {
+		deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
+	}
+
+	qemuParams = append(qemuParams, "-device")
+	qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
+	return qemuParams
+}
+
+// Valid returns true if the PCIeSwitchDownstreamPortDevice structure is valid and complete.
+func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
+	if b.ID == "" {
+		return false
+	}
+	if b.Bus == "" {
+		return false
+	}
+	if b.Chassis == "" {
+		return false
+	}
+	if b.Slot == "" {
+		return false
+	}
+	return true
+}
+
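To make the generated command line concrete, a short sketch of what these two builders emit (the IDs and bus names are illustrative; passing a nil *Config is fine here since neither builder reads it):

	up := PCIeSwitchUpstreamPortDevice{ID: "swup0", Bus: "rp0"}
	down := PCIeSwitchDownstreamPortDevice{ID: "swdp0", Bus: "swup0", Chassis: "0x01", Slot: "0x00"}
	fmt.Println(up.QemuParams(nil))   // [-device x3130-upstream,id=swup0,bus=rp0]
	fmt.Println(down.QemuParams(nil)) // [-device xio3130-downstream,id=swdp0,bus=swup0,chassis=0x01,slot=0x00]

 // VFIODevice represents a qemu vfio device meant for direct access by guest OS.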
 type VFIODevice struct {
 	// Bus-Device-Function of device
diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go
index 482b7e9e2..f0ba941de 100644
--- a/src/runtime/pkg/hypervisors/hypervisor_state.go
+++ b/src/runtime/pkg/hypervisors/hypervisor_state.go
@@ -5,7 +5,7 @@

 package hypervisors

-import "fmt"
+import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"

 // Bridge is a bridge where devices can be hot plugged
 type Bridge struct {
@@ -28,37 +28,8 @@ type CPUDevice struct {
 	ID string
 }

-// PCIePort distinguish only between root and switch port
-type PCIePort string
-
-const (
-	// RootPort attach VFIO devices to a root-port
-	RootPort PCIePort = "root-port"
-	// SwitchPort attach VFIO devices to a switch-port
-	SwitchPort = "switch-port"
-	// BridgePort is the default
-	BridgePort = "bridge-port"
-	// NoPort is for disabling VFIO hotplug/coldplug
-	NoPort = "no-port"
-)
-
-func (p PCIePort) String() string {
-	switch p {
-	case RootPort:
-		return "root-port"
-	case SwitchPort:
-		return "switch-port"
-	case BridgePort:
-		return "bridge-port"
-	case NoPort:
-		return "no-port"
-	}
-	return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
-}
-
 type HypervisorState struct {
 	BlockIndexMap map[int]struct{}
-	// Type of hypervisor, E.g. qemu/firecracker/acrn.
 	Type string
 	UUID string
@@ -74,7 +45,7 @@ type HypervisorState struct {
 	HotpluggedMemory int
 	VirtiofsDaemonPid int
 	Pid int
-	PCIeRootPort int
-	ColdPlugVFIO PCIePort
+	HotPlugVFIO config.PCIePort
+	ColdPlugVFIO config.PCIePort
 	HotplugVFIOOnRootBus bool
 }
diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go
index 4c8257a40..ec1d85c3a 100644
--- a/src/runtime/pkg/katatestutils/utils.go
+++ b/src/runtime/pkg/katatestutils/utils.go
@@ -14,7 +14,7 @@ import (
 	"strconv"
 	"testing"

-	hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/stretchr/testify/assert"
 )
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
 	JaegerUser string
 	JaegerPassword string
 	PFlash []string
-	PCIeRootPort uint32
-	ColdPlugVFIO hv.PCIePort
+	HotPlugVFIO config.PCIePort
+	ColdPlugVFIO config.PCIePort
 	DefaultVCPUCount uint32
 	DefaultMaxVCPUCount uint32
 	DefaultMemSize uint32
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
 	disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
 	enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
 	hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
-	pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
 	cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
 	msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
 	enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in
index a59ce8a17..f76986876 100644
--- a/src/runtime/pkg/katautils/config-settings.go.in
+++ b/src/runtime/pkg/katautils/config-settings.go.in
@@ -10,7 +10,7 @@

 package katautils

 import (
-	hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
+	config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
 )

 // name is the name of the runtime
@@ -82,7 +82,6 @@
 const defaultEnableDebug bool = false
 const
defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 const defaultHotplugVFIOOnRootBus bool = false -const defaultPCIeRootPort = 0 const defaultEntropySource = "/dev/urandom" const defaultGuestHookPath string = "" const defaultVirtioFSCacheMode = "never" @@ -115,4 +114,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" // Default config file used by stateless systems. var defaultRuntimeConfiguration = "@CONFIG_PATH@" -const defaultColdPlugVFIO = hv.NoPort +const defaultHotPlugVFIO = config.NoPort +const defaultColdPlugVFIO = config.NoPort diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 6bff49cfe..1397954aa 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -20,7 +20,6 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" @@ -79,98 +78,98 @@ type factory struct { } type hypervisor struct { - Path string `toml:"path"` - JailerPath string `toml:"jailer_path"` - Kernel string `toml:"kernel"` - CtlPath string `toml:"ctlpath"` - Initrd string `toml:"initrd"` - Image string `toml:"image"` - RootfsType string `toml:"rootfs_type"` - Firmware string `toml:"firmware"` - FirmwareVolume string `toml:"firmware_volume"` - MachineAccelerators string `toml:"machine_accelerators"` - CPUFeatures string `toml:"cpu_features"` - KernelParams string `toml:"kernel_params"` - MachineType string `toml:"machine_type"` - BlockDeviceDriver string `toml:"block_device_driver"` - EntropySource string `toml:"entropy_source"` - SharedFS string `toml:"shared_fs"` - VirtioFSDaemon string `toml:"virtio_fs_daemon"` - VirtioFSCache string `toml:"virtio_fs_cache"` - VhostUserStorePath string `toml:"vhost_user_store_path"` - FileBackedMemRootDir string `toml:"file_mem_backend"` - GuestHookPath string `toml:"guest_hook_path"` - GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` - SeccompSandbox string `toml:"seccompsandbox"` - GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"` - GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"` - GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"` - SEVCertChainPath string `toml:"sev_cert_chain"` - BlockDeviceAIO string `toml:"block_device_aio"` - RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"` - HypervisorPathList []string `toml:"valid_hypervisor_paths"` - JailerPathList []string `toml:"valid_jailer_paths"` - CtlPathList []string `toml:"valid_ctlpaths"` - VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` - VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` - PFlashList []string `toml:"pflashes"` - VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` - FileBackedMemRootList []string `toml:"valid_file_mem_backends"` - EntropySourceList []string `toml:"valid_entropy_sources"` - EnableAnnotations []string `toml:"enable_annotations"` - RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` - TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` - MemOffset 
uint64 `toml:"memory_offset"` - DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` - DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` - DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` - DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` - DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` - NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` - NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` - NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` - NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` - VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` - VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` - DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` - MemorySize uint32 `toml:"default_memory"` - MemSlots uint32 `toml:"memory_slots"` - DefaultBridges uint32 `toml:"default_bridges"` - Msize9p uint32 `toml:"msize_9p"` - PCIeRootPort uint32 `toml:"pcie_root_port"` - GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"` - SEVGuestPolicy uint32 `toml:"sev_guest_policy"` - SNPGuestPolicy uint64 `toml:"snp_guest_policy"` - RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"` - NumVCPUs int32 `toml:"default_vcpus"` - BlockDeviceCacheSet bool `toml:"block_device_cache_set"` - BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` - BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` - EnableVhostUserStore bool `toml:"enable_vhost_user_store"` - VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` - DisableBlockDeviceUse bool `toml:"disable_block_device_use"` - MemPrealloc bool `toml:"enable_mem_prealloc"` - HugePages bool `toml:"enable_hugepages"` - VirtioMem bool `toml:"enable_virtio_mem"` - IOMMU bool `toml:"enable_iommu"` - IOMMUPlatform bool `toml:"enable_iommu_platform"` - Debug bool `toml:"enable_debug"` - DisableNestingChecks bool `toml:"disable_nesting_checks"` - EnableIOThreads bool `toml:"enable_iothreads"` - DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` - ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"` - DisableVhostNet bool `toml:"disable_vhost_net"` - GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` - ConfidentialGuest bool `toml:"confidential_guest"` - SevSnpGuest bool `toml:"sev_snp_guest"` - GuestSwap bool `toml:"enable_guest_swap"` - Rootless bool `toml:"rootless"` - DisableSeccomp bool `toml:"disable_seccomp"` - DisableSeLinux bool `toml:"disable_selinux"` - DisableGuestSeLinux bool `toml:"disable_guest_selinux"` - LegacySerial bool `toml:"use_legacy_serial"` - GuestPreAttestation bool `toml:"guest_pre_attestation"` + Path string `toml:"path"` + JailerPath string `toml:"jailer_path"` + Kernel string `toml:"kernel"` + CtlPath string `toml:"ctlpath"` + Initrd string `toml:"initrd"` + Image string `toml:"image"` + RootfsType string `toml:"rootfs_type"` + Firmware string `toml:"firmware"` + FirmwareVolume string `toml:"firmware_volume"` + MachineAccelerators string `toml:"machine_accelerators"` + CPUFeatures string `toml:"cpu_features"` + KernelParams string `toml:"kernel_params"` + MachineType string `toml:"machine_type"` + BlockDeviceDriver string `toml:"block_device_driver"` + EntropySource string `toml:"entropy_source"` + SharedFS string `toml:"shared_fs"` + VirtioFSDaemon string `toml:"virtio_fs_daemon"` + 
VirtioFSCache string `toml:"virtio_fs_cache"` + VhostUserStorePath string `toml:"vhost_user_store_path"` + FileBackedMemRootDir string `toml:"file_mem_backend"` + GuestHookPath string `toml:"guest_hook_path"` + GuestMemoryDumpPath string `toml:"guest_memory_dump_path"` + SeccompSandbox string `toml:"seccompsandbox"` + GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"` + GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"` + GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"` + SEVCertChainPath string `toml:"sev_cert_chain"` + BlockDeviceAIO string `toml:"block_device_aio"` + RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"` + HypervisorPathList []string `toml:"valid_hypervisor_paths"` + JailerPathList []string `toml:"valid_jailer_paths"` + CtlPathList []string `toml:"valid_ctlpaths"` + VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"` + VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"` + PFlashList []string `toml:"pflashes"` + VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"` + FileBackedMemRootList []string `toml:"valid_file_mem_backends"` + EntropySourceList []string `toml:"valid_entropy_sources"` + EnableAnnotations []string `toml:"enable_annotations"` + RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"` + TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"` + MemOffset uint64 `toml:"memory_offset"` + DefaultMaxMemorySize uint64 `toml:"default_maxmemory"` + DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"` + DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"` + DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"` + DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"` + NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"` + NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"` + NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"` + NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"` + VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"` + VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"` + DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"` + MemorySize uint32 `toml:"default_memory"` + MemSlots uint32 `toml:"memory_slots"` + DefaultBridges uint32 `toml:"default_bridges"` + Msize9p uint32 `toml:"msize_9p"` + GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"` + SEVGuestPolicy uint32 `toml:"sev_guest_policy"` + SNPGuestPolicy uint64 `toml:"snp_guest_policy"` + RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"` + NumVCPUs int32 `toml:"default_vcpus"` + BlockDeviceCacheSet bool `toml:"block_device_cache_set"` + BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"` + BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"` + EnableVhostUserStore bool `toml:"enable_vhost_user_store"` + VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"` + DisableBlockDeviceUse bool `toml:"disable_block_device_use"` + MemPrealloc bool `toml:"enable_mem_prealloc"` + HugePages bool `toml:"enable_hugepages"` + VirtioMem bool `toml:"enable_virtio_mem"` + IOMMU bool `toml:"enable_iommu"` + IOMMUPlatform bool `toml:"enable_iommu_platform"` + Debug bool `toml:"enable_debug"` + DisableNestingChecks bool `toml:"disable_nesting_checks"` + EnableIOThreads bool `toml:"enable_iothreads"` + DisableImageNvdimm bool 
`toml:"disable_image_nvdimm"` + HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` + ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` + DisableVhostNet bool `toml:"disable_vhost_net"` + GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` + ConfidentialGuest bool `toml:"confidential_guest"` + SevSnpGuest bool `toml:"sev_snp_guest"` + GuestSwap bool `toml:"enable_guest_swap"` + Rootless bool `toml:"rootless"` + DisableSeccomp bool `toml:"disable_seccomp"` + DisableSeLinux bool `toml:"disable_selinux"` + DisableGuestSeLinux bool `toml:"disable_guest_selinux"` + LegacySerial bool `toml:"use_legacy_serial"` + GuestPreAttestation bool `toml:"guest_pre_attestation"` } type runtime struct { @@ -298,12 +297,18 @@ func (h hypervisor) firmware() (string, error) { return ResolvePath(p) } -func (h hypervisor) coldPlugVFIO() hv.PCIePort { +func (h hypervisor) coldPlugVFIO() config.PCIePort { if h.ColdPlugVFIO == "" { return defaultColdPlugVFIO } return h.ColdPlugVFIO } +func (h hypervisor) hotPlugVFIO() config.PCIePort { + if h.HotPlugVFIO == "" { + return defaultHotPlugVFIO + } + return h.HotPlugVFIO +} func (h hypervisor) firmwareVolume() (string, error) { p := h.FirmwareVolume @@ -523,7 +528,7 @@ func (h hypervisor) blockDeviceAIO() (string, error) { } func (h hypervisor) sharedFS() (string, error) { - supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus} + supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS} if h.SharedFS == "" { return config.VirtioFS, nil @@ -838,6 +843,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { KernelPath: kernel, InitrdPath: initrd, ImagePath: image, + RootfsType: rootfsType, FirmwarePath: firmware, FirmwareVolumePath: firmwareVolume, PFlash: pflashes, @@ -880,8 +886,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, + HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), - PCIeRootPort: h.PCIeRootPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, VhostUserStorePath: h.vhostUserStorePath(), @@ -907,7 +913,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { SNPGuestPolicy: h.getSnpGuestPolicy(), SEVCertChainPath: h.SEVCertChainPath, DisableGuestSeLinux: h.DisableGuestSeLinux, - RootfsType: rootfsType, }, nil } @@ -1034,11 +1039,12 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { return vc.HypervisorConfig{}, err } - if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus { - return vc.HypervisorConfig{}, errors.New("clh only support virtio-fs or virtio-fs-nydus") + if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS { + return vc.HypervisorConfig{}, + fmt.Errorf("Cloud Hypervisor does not support %s shared filesystem option", sharedFS) } - if h.VirtioFSDaemon == "" { + if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" { return vc.HypervisorConfig{}, fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS) } @@ -1084,7 +1090,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), - PCIeRootPort: 
h.PCIeRootPort,
+		HotPlugVFIO: h.hotPlugVFIO(),
 		DisableVhostNet: true,
 		GuestHookPath: h.guestHookPath(),
 		VirtioFSExtraArgs: h.VirtioFSExtraArgs,
@@ -1302,6 +1308,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
 		KernelPath: defaultKernelPath,
 		ImagePath: defaultImagePath,
 		InitrdPath: defaultInitrdPath,
+		RootfsType: defaultRootfsType,
 		FirmwarePath: defaultFirmwarePath,
 		FirmwareVolumePath: defaultFirmwareVolumePath,
 		MachineAccelerators: defaultMachineAccelerators,
@@ -1330,9 +1337,10 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
 		Msize9p: defaultMsize9p,
 		HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
 		ColdPlugVFIO: defaultColdPlugVFIO,
-		PCIeRootPort: defaultPCIeRootPort,
+		HotPlugVFIO: defaultHotPlugVFIO,
 		GuestHookPath: defaultGuestHookPath,
 		VhostUserStorePath: defaultVhostUserStorePath,
+		VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
 		VirtioFSCache: defaultVirtioFSCacheMode,
 		DisableImageNvdimm: defaultDisableImageNvdimm,
 		RxRateLimiterMaxRate: defaultRxRateLimiterMaxRate,
@@ -1352,8 +1360,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
 		SEVGuestPolicy: defaultSEVGuestPolicy,
 		SNPGuestPolicy: defaultSNPGuestPolicy,
 		SEVCertChainPath: defaultSEVCertChainPath,
-		VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
-		RootfsType: defaultRootfsType,
 	}
 }

@@ -1711,9 +1717,10 @@ func checkConfig(config oci.RuntimeConfig) error {
 		return err
 	}

+	hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
 	coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
 	machineType := config.HypervisorConfig.HypervisorMachineType
-	if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
+	if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
 		return err
 	}

@@ -1723,18 +1730,32 @@ func checkConfig(config oci.RuntimeConfig) error {

 // checkPCIeConfig ensures the PCIe configuration is valid.
-// Only allow one of the following settings for cold-plug:
-// no-port, root-port, switch-port
-func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
+// Only one of hot-plug or cold-plug may be enabled, set to one of:
+// no-port, bridge-port, root-port, switch-port
+func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
 	// Currently only QEMU q35 supports advanced PCIe topologies
 	// firecracker, dragonball do not have any PCIe support right now
 	if machineType != "q35" {
 		return nil
 	}
-	if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
+
+	if coldPlug != config.NoPort && hotPlug != config.NoPort {
+		return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", hotPlug, coldPlug)
+	}
+	if coldPlug == config.NoPort && hotPlug == config.NoPort {
+		return nil
+	}
+	var port config.PCIePort
+	if coldPlug != config.NoPort {
+		port = coldPlug
+	}
+	if hotPlug != config.NoPort {
+		port = hotPlug
+	}
+	if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
 		return nil
 	}
-	return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
-		vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
+	return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
+		port, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
 }

 // checkNetNsConfig performs sanity checks on disable_new_netns config.
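Taken together, hot_plug_vfio and cold_plug_vfio are now mutually exclusive on q35. A minimal sketch of the resulting contract (checkPCIeConfig is package-private, so these calls are shown for illustration only; the argument order is coldPlug, hotPlug, machineType):

	fmt.Println(checkPCIeConfig(config.RootPort, config.NoPort, "q35"))     // <nil>: cold-plug only
	fmt.Println(checkPCIeConfig(config.NoPort, config.SwitchPort, "q35"))   // <nil>: hot-plug only
	fmt.Println(checkPCIeConfig(config.RootPort, config.BridgePort, "q35")) // error: both set
	fmt.Println(checkPCIeConfig(config.RootPort, config.NoPort, "pc"))      // <nil>: non-q35 machines are not checked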
diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 97f134579..b786ce25f 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -18,8 +18,8 @@ import ( "syscall" "testing" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" @@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error { // createAllRuntimeConfigFiles creates all files necessary to call // loadConfiguration(). -func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) { +func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) { if dir == "" { - return config, fmt.Errorf("BUG: need directory") + return testConfig, fmt.Errorf("BUG: need directory") } if hypervisor == "" { - return config, fmt.Errorf("BUG: need hypervisor") + return testConfig, fmt.Errorf("BUG: need hypervisor") } - var coldPlugVFIO hv.PCIePort + var hotPlugVFIO config.PCIePort + var coldPlugVFIO config.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") kernelParams := "foo=bar xyz" @@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf blockDeviceAIO := "io_uring" enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) - coldPlugVFIO = hv.RootPort + hotPlugVFIO = config.NoPort + coldPlugVFIO = config.BridgePort disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") @@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf BlockDeviceAIO: blockDeviceAIO, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, DefaultVCPUCount: defaultVCPUCount, @@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf configPath := path.Join(dir, "runtime.toml") err = createConfig(configPath, runtimeConfigFileData) if err != nil { - return config, err + return testConfig, err } configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml") @@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf // create a link to the config file err = syscall.Symlink(configPath, configPathLink) if err != nil { - return config, err + return testConfig, err } files := []string{hypervisorPath, kernelPath, imagePath} @@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf // create the resource (which must be >0 bytes) err := WriteFile(file, "foo", testFileMode) if err != nil { - return config, err + return testConfig, err } } @@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf DefaultBridges: defaultBridgesCount, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, + HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, Msize9p: 
defaultMsize9p, MemSlots: defaultMemSlots, @@ -217,10 +218,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf err = SetKernelParams(&runtimeConfig) if err != nil { - return config, err + return testConfig, err } - config = testRuntimeConfig{ + rtimeConfig := testRuntimeConfig{ RuntimeConfig: runtimeConfig, RuntimeConfigFile: configPath, ConfigPath: configPath, @@ -229,7 +230,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf LogPath: logPath, } - return config, nil + return rtimeConfig, nil } // testLoadConfiguration accepts an optional function that can be used @@ -570,6 +571,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { BlockDeviceAIO: defaultBlockDeviceAIO, DisableGuestSeLinux: defaultDisableGuestSeLinux, SNPGuestPolicy: defaultSNPGuestPolicy, + HotPlugVFIO: defaultHotPlugVFIO, ColdPlugVFIO: defaultColdPlugVFIO, } @@ -604,7 +606,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { func TestNewQemuHypervisorConfig(t *testing.T) { dir := t.TempDir() - var coldPlugVFIO hv.PCIePort + var coldPlugVFIO config.PCIePort hypervisorPath := path.Join(dir, "hypervisor") kernelPath := path.Join(dir, "kernel") imagePath := path.Join(dir, "image") @@ -612,8 +614,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) - coldPlugVFIO = hv.RootPort + coldPlugVFIO = config.BridgePort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" defer func() { @@ -632,7 +633,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, ColdPlugVFIO: coldPlugVFIO, RxRateLimiterMaxRate: rxRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate, @@ -688,10 +688,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus) } - if config.PCIeRootPort != pcieRootPort { - t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort) - } - if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate { t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate) } @@ -814,7 +810,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { disableBlock := true enableIOThreads := true hotplugVFIOOnRootBus := true - pcieRootPort := uint32(2) hypervisor := hypervisor{ Path: hypervisorPath, @@ -825,7 +820,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - PCIeRootPort: pcieRootPort, } _, err := newQemuHypervisorConfig(hypervisor) diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index b443a3f88..7881227a1 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -460,6 +460,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } + if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil { + return err + } + if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok { if value != "" { config.HypervisorConfig.HypervisorMachineType = value @@ -515,12 +519,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } - if err := 
newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) { - config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort) - }); err != nil { - return err - } - if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok { if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) { return fmt.Errorf("entropy source %v required from annotation is not valid", value) @@ -583,6 +581,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru return nil } +func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) { + if value == "" { + return config.NoPort, nil + } + port := config.PCIePort(value) + if port.Invalid() { + return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value) + } + return port, nil +} + +func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error { + + var err error + if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok { + if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil { + return err + } + // If hot-plug is specified disable cold-plug and vice versa + sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort + } + if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok { + if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil { + return err + } + // If cold-plug is specified disable hot-plug and vice versa + sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort + } + return nil +} + func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error { if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index ebc7f019c..b27ba7f8e 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) { func TestAddHypervisorAnnotations(t *testing.T) { assert := assert.New(t) - config := vc.SandboxConfig{ + sbConfig := vc.SandboxConfig{ Annotations: make(map[string]string), } @@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"} ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on" - addHypervisorConfigOverrides(ocispec, &config, runtimeConfig) - assert.Exactly(expectedHyperConfig, config.HypervisorConfig) + addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig) + assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1" ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1" @@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" - ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2" + ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort + ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true" ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi" ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true" @@ -668,55 +669,58 @@ func 
TestAddHypervisorAnnotations(t *testing.T) { ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000" ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000" - addAnnotations(ocispec, &config, runtimeConfig) - assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1)) - assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1)) - assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024)) - assert.Equal(config.HypervisorConfig.MemSlots, uint32(20)) - assert.Equal(config.HypervisorConfig.MemOffset, uint64(512)) - assert.Equal(config.HypervisorConfig.VirtioMem, true) - assert.Equal(config.HypervisorConfig.MemPrealloc, true) - assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm") - assert.Equal(config.HypervisorConfig.HugePages, true) - assert.Equal(config.HypervisorConfig.IOMMU, true) - assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi") - assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring") - assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true) - assert.Equal(config.HypervisorConfig.EnableIOThreads, true) - assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true) - assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true) - assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true) - assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs") - assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false") - assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto") - assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"}) - assert.Equal(config.HypervisorConfig.Msize9p, uint32(512)) - assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35") - assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw") - assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off") - assert.Equal(config.HypervisorConfig.DisableVhostNet, true) - assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/") - assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true) - assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true) - assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2)) - assert.Equal(config.HypervisorConfig.IOMMUPlatform, true) - assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864)) - assert.Equal(config.HypervisorConfig.LegacySerial, true) - assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000)) - assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000)) + err := addAnnotations(ocispec, &sbConfig, runtimeConfig) + assert.NoError(err) + + assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1)) + assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1)) + assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024)) + assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20)) + assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512)) + assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true) + assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true) + assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm") + assert.Equal(sbConfig.HypervisorConfig.HugePages, true) + assert.Equal(sbConfig.HypervisorConfig.IOMMU, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi") + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring") + assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true) + assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true) + 
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true) + assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true) + assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs") + assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false") + assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto") + assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"}) + assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512)) + assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35") + assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw") + assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off") + assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true) + assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/") + assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true) + assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true) + assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort)) + assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort)) + assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true) + assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864)) + assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true) + assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000)) + assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000)) // In case an absurd large value is provided, the config value if not over-ridden ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536" - err := addAnnotations(ocispec, &config, runtimeConfig) + err = addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1" - err = addAnnotations(ocispec, &config, runtimeConfig) + err = addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1" ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1" - err = addAnnotations(ocispec, &config, runtimeConfig) + err = addAnnotations(ocispec, &sbConfig, runtimeConfig) assert.Error(err) ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1" diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index ad54a0477..837cbaf54 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -90,7 +90,7 @@ func (a *Acrn) Capabilities(ctx context.Context) types.Capabilities { span, _ := katatrace.Trace(ctx, a.Logger(), "Capabilities", acrnTracingTags, map[string]string{"sandbox_id": a.id}) defer span.End() - return a.arch.capabilities() + return a.arch.capabilities(a.config) } func (a *Acrn) HypervisorConfig() HypervisorConfig { diff --git a/src/runtime/virtcontainers/acrn_arch_base.go b/src/runtime/virtcontainers/acrn_arch_base.go index 77fb8e9e5..0b9ee53cc 100644 --- a/src/runtime/virtcontainers/acrn_arch_base.go +++ b/src/runtime/virtcontainers/acrn_arch_base.go @@ -33,7 +33,7 @@ type acrnArch interface { kernelParameters(debug bool) []Param //capabilities returns the capabilities supported by acrn - capabilities() types.Capabilities + capabilities(config HypervisorConfig) types.Capabilities // memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb memoryTopology(memMb uint64) Memory @@ -361,7 +361,7 @@ func (a *acrnArchBase) 
memoryTopology(memoryMb uint64) Memory { return memory } -func (a *acrnArchBase) capabilities() types.Capabilities { +func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities { var caps types.Capabilities caps.SetBlockDeviceSupport() diff --git a/src/runtime/virtcontainers/acrn_arch_base_test.go b/src/runtime/virtcontainers/acrn_arch_base_test.go index 61db47416..c34974e69 100644 --- a/src/runtime/virtcontainers/acrn_arch_base_test.go +++ b/src/runtime/virtcontainers/acrn_arch_base_test.go @@ -83,8 +83,9 @@ func TestAcrnArchBaseKernelParameters(t *testing.T) { func TestAcrnArchBaseCapabilities(t *testing.T) { assert := assert.New(t) acrnArchBase := newAcrnArchBase() + config := HypervisorConfig{} - c := acrnArchBase.capabilities() + c := acrnArchBase.capabilities(config) assert.True(c.IsBlockDeviceSupported()) assert.True(c.IsBlockDeviceHotplugSupported()) assert.False(c.IsFsSharingSupported()) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index cfd924099..d658e1f77 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -349,6 +349,10 @@ func (clh *cloudHypervisor) createVirtiofsDaemon(sharedPath string) (VirtiofsDae } func (clh *cloudHypervisor) setupVirtiofsDaemon(ctx context.Context) error { + if clh.config.SharedFS == config.NoSharedFS { + return nil + } + if clh.config.SharedFS == config.Virtio9P { return errors.New("cloud-hypervisor only supports virtio based file sharing") } @@ -860,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { defer cancel() // Create the clh device config via the constructor to ensure default values are properly assigned - clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev()) + clhDevice := *chclient.NewDeviceConfig(device.SysfsDev) pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) if err != nil { return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } - clh.devicesIds[*(*device).GetID()] = pciInfo.GetId() + clh.devicesIds[device.ID] = pciInfo.GetId() // clh doesn't use bridges, so the PCI path is simply the slot // number of the device. 
This will break if clh starts using @@ -882,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) } - guestPciPath, err := types.PciPathFromString(tokens[0]) - - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { + if device.Type == config.VFIOAPDeviceMediatedType { return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device) } - pciDevice.GuestPciPath = guestPciPath - *device = pciDevice + + device.GuestPciPath, err = types.PciPathFromString(tokens[0]) return err } @@ -933,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int case BlockDev: deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index) case VfioDev: - deviceID = *devInfo.(config.VFIODev).GetID() + deviceID = devInfo.(*config.VFIODev).ID default: clh.Logger().WithFields(log.Fields{"devInfo": devInfo, "deviceType": devType}).Error("HotplugRemoveDevice: unsupported device") @@ -1210,7 +1211,9 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities clh.Logger().WithField("function", "Capabilities").Info("get Capabilities") var caps types.Capabilities - caps.SetFsSharingSupport() + if clh.config.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } caps.SetBlockDeviceHotplugSupport() return caps } diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index b5c800e95..d617ab4e1 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) { _, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev) assert.NoError(err, "Hotplug remove block device expected no error") - _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev) + _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev) assert.NoError(err, "Hotplug remove vfio block device expected no error") _, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev) @@ -726,3 +726,30 @@ func TestClhSetConfig(t *testing.T) { assert.Equal(clh.config, config) } + +func TestClhCapabilities(t *testing.T) { + assert := assert.New(t) + + hConfig, err := newClhConfig() + assert.NoError(err) + + clh := &cloudHypervisor{} + assert.Equal(clh.config, HypervisorConfig{}) + + hConfig.SharedFS = config.VirtioFS + + err = clh.setConfig(&hConfig) + assert.NoError(err) + + var ctx context.Context + c := clh.Capabilities(ctx) + assert.True(c.IsFsSharingSupported()) + + hConfig.SharedFS = config.NoSharedFS + + err = clh.setConfig(&hConfig) + assert.NoError(err) + + c = clh.Capabilities(ctx) + assert.False(c.IsFsSharingSupported()) +} diff --git a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md index d3071a86f..ca5cb4a1a 100644 --- a/src/runtime/virtcontainers/documentation/api/1.0/api.md +++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md @@ -288,12 +288,12 @@ type HypervisorConfig struct { // root bus instead of a bridge. 
HotplugVFIOOnRootBus bool - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 + // HotPlugVFIO is used to indicate if devices need to be hotplugged on the + // root port, switch, bridge or no port + HotPlugVFIO hv.PCIePort // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the - // root port, switch or no port + // root port, switch, bridge or no port ColdPlugVFIO hv.PCIePort // BootToBeTemplate used to indicate if the VM is created to be a template VM diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index fe0b96fac..f2c86c6a6 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -389,7 +389,6 @@ type HypervisorConfig struct { Gid uint32 SEVGuestPolicy uint32 SNPGuestPolicy uint64 - PCIeRootPort uint32 NumVCPUs uint32 RemoteHypervisorTimeout uint32 IOMMUPlatform bool @@ -420,7 +419,10 @@ type HypervisorConfig struct { DisableSeLinux bool DisableGuestSeLinux bool LegacySerial bool - ColdPlugVFIO hv.PCIePort + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort + VFIODevices []config.DeviceInfo + VhostUserBlkDevices []config.DeviceInfo } // vcpu mapping from vcpu number to thread number diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index b54ad9fe8..3afb2fceb 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -21,6 +21,7 @@ import ( "github.com/docker/go-units" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/api" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/uuid" @@ -1148,7 +1149,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * ContainerPath: dev.ContainerPath, Type: kataVfioPciDevType, Id: groupNum, - Options: nil, + Options: make([]string, len(devList)), } // We always pass the device information to the agent, since @@ -1158,16 +1159,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel { kataDevice.Type = kataVfioPciGuestKernelDevType } + for i, dev := range devList { + if dev.Type == config.VFIOAPDeviceMediatedType { + kataDevice.Type = kataVfioApDevType + kataDevice.Options = dev.APDevices + } else { - if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType { - kataDevice.Type = kataVfioApDevType - kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices - } else { - kataDevice.Options = make([]string, len(devList)) - for i, device := range devList { - pciDevice := (*device).(config.VFIOPCIDev) - kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath) + devBDF := drivers.GetBDF(dev.BDF) + kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath) } + } return kataDevice @@ -1354,7 +1355,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co if _, err = k.sendReq(ctx, req); err != nil { return nil, err } - return buildProcessFromExecID(req.ExecId) } diff --git 
a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index cbba44e60..91ab51ebf 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks, DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, - PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, @@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { DisableNestingChecks: hconf.DisableNestingChecks, DisableImageNvdimm: hconf.DisableImageNvdimm, HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, + HotPlugVFIO: hconf.HotPlugVFIO, ColdPlugVFIO: hconf.ColdPlugVFIO, - PCIeRootPort: hconf.PCIeRootPort, BootToBeTemplate: hconf.BootToBeTemplate, BootFromTemplate: hconf.BootFromTemplate, DisableVhostNet: hconf.DisableVhostNet, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index e4facc6b9..6ca5ee690 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -7,7 +7,7 @@ package persistapi import ( - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/opencontainers/runc/libcontainer/configs" specs "github.com/opencontainers/runtime-spec/specs-go" ) @@ -131,10 +131,6 @@ type HypervisorConfig struct { // Enable SGX. Hardware-based isolation and memory encryption. SGXEPCSize int64 - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 - // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 @@ -199,9 +195,13 @@ type HypervisorConfig struct { // root bus instead of a bridge. HotplugVFIOOnRootBus bool + // HotPlugVFIO is used to indicate if devices need to be hotplugged on the + // root, switch, bridge or no-port + HotPlugVFIO config.PCIePort + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the - // root port or a switch or no-port - ColdPlugVFIO hv.PCIePort + // root, bridge, switch or no-port + ColdPlugVFIO config.PCIePort // BootToBeTemplate used to indicate if the VM is created to be a template VM BootToBeTemplate bool diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index a8bd4c1b2..b9a8dfc10 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -143,9 +143,11 @@ const ( // root bus instead of a bridge. HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port" + // ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged. 
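+ // Accepted values mirror the config.PCIePort constants used elsewhere in + // this patch (no-port, root-port, switch-port, bridge-port); listed here + // for illustration, see pkg/device/config for the authoritative set.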
+ ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio" + + // HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged. + HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio" // EntropySource is a sandbox annotation to specify the path to a host source of // entropy (/dev/random, /dev/urandom or real hardware RNG device) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 6ded5cbe0..43cb78f21 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -66,6 +66,11 @@ const romFile = "" // Default value is false. const defaultDisableModern = false +// A PCIe topology deeper than 5 is already not advisable; just for the sake +// of having enough buffer we limit ourselves to 10 and exit if we reach +// the root bus. +const maxPCIeTopoDepth = 10 + type qmpChannel struct { qmp *govmmQemu.QMP ctx context.Context @@ -76,15 +81,15 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { - UUID string - Bridges []types.Bridge - // HotpluggedCPUs is the list of CPUs that were hot-added + UUID string + HotPlugVFIO config.PCIePort + Bridges []types.Bridge HotpluggedVCPUs []hv.CPUDevice HotpluggedMemory int VirtiofsDaemonPid int - PCIeRootPort int HotplugVFIOOnRootBus bool - ColdPlugVFIO hv.PCIePort + HotplugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } // qemu is a Hypervisor interface implementation for the Linux qemu hypervisor. @@ -207,7 +212,7 @@ func (q *qemu) Capabilities(ctx context.Context) types.Capabilities { span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() - return q.arch.capabilities() + return q.arch.capabilities(q.config) } func (q *qemu) HypervisorConfig() HypervisorConfig { @@ -278,10 +283,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.Logger().Debug("Creating UUID") q.state.UUID = uuid.Generate().String() - + q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus - q.state.PCIeRootPort = int(q.config.PCIeRootPort) + q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, // we have to create it since the sandbox has not created it yet. @@ -727,27 +732,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi } } - // Add PCIe Root Port devices to hypervisor - // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port. - // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt - memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory() - - if hypervisorConfig.PCIeRootPort > 0 { - qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit) - } - - // The default OVMF MMIO aperture is too small for some PCIe devices - // with huge BARs so we need to increase it.
- memSize64bit is in bytes, convert to MB, OVMF expects MB as a string - if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { - pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) - fwCfg := govmmQemu.FwCfg{ - Name: "opt/ovmf/X-PciMmio64Mb", - Str: pciMmio64Mb, + if machine.Type == QemuQ35 || machine.Type == QemuVirt { + if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil { + q.Logger().WithError(err).Errorf("Cannot create PCIe topology") + return err } - qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } - q.qemuConfig = qemuConfig q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath) @@ -773,6 +763,101 @@ func (q *qemu) checkBpfEnabled() { } } +// If a user uses 8 GPUs with 4 devices in each IOMMU Group, that means we need +// to hotplug 32 devices. We do not have enough PCIe root bus slots to +// accomplish this task. Kata will already use some slots for vfio-xxxx-pci +// devices. +// Max PCI slots per root bus is 32 +// Max PCIe root ports is 16 +// Max PCIe switch ports is 16 +// There is only 64kB of IO memory; each root/switch port will consume 4k, hence +// only 16 ports are possible. +func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error { + + // If no-port is set just return; there is no need to add a PCIe Root Port or PCIe Switches + if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 { + return nil + } + + // Add PCIe Root Port or PCIe Switches to the hypervisor + // The pcie.0 bus does not support hot-plug, but a PCIe device can be hot-plugged + // into a PCIe Root Port or PCIe Switch. + // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt + + // Deduce the right values for mem-reserve and pref-64-reserve memory regions + memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory() + + // The default OVMF MMIO aperture is too small for some PCIe devices + // with huge BARs so we need to increase it.
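+ // The enlarged aperture is handed to the firmware through the + // opt/ovmf/X-PciMmio64Mb fw_cfg entry built just below.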
+ // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string + if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { + pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) + fwCfg := govmmQemu.FwCfg{ + Name: "opt/ovmf/X-PciMmio64Mb", + Str: pciMmio64Mb, + } + qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) + } + + // Get the number of hot(cold)-pluggable ports needed from the provided + // VFIO devices and VhostUserBlockDevices + var numOfPluggablePorts uint32 = 0 + for _, dev := range hypervisorConfig.VFIODevices { + var err error + dev.HostPath, err = config.GetHostPath(dev, false, "") + if err != nil { + return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err) + } + devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev) + if err != nil { + return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) + } + for _, vfioDevice := range devicesPerIOMMUGroup { + if drivers.IsPCIeDevice(vfioDevice.BDF) { + numOfPluggablePorts = numOfPluggablePorts + 1 + } + } + } + vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus) + vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) + + numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices) + + // If the number of PCIe root ports > 16 then bail out, otherwise we may + // use up all slots or IO memory on the root bus and vfio-XXX-pci devices, + // which are crucial for Kata, cannot be added. Max slots on the root bus is 32; + // max slots on the complete pci(e) topology is 256 in QEMU + if vfioOnRootPort { + // On Arm the vhost-user-block device is a PCIe device; we need + // to account for it in the number of pluggable ports + if machineType == QemuVirt { + numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices) + } + if numOfPluggablePorts > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceed allowed max of %d", maxPCIeRootPort) + } + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) + return nil + } + if vfioOnSwitchPort { + // On Arm the vhost-user-block device is a PCIe device; we need + // to account for it in the number of pluggable ports + if machineType == QemuVirt { + numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices) + if numOfPluggableRootPorts > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceed allowed max of %d", maxPCIeRootPort) + } + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit) + } + if numOfPluggablePorts > maxPCIeSwitchPort { + return fmt.Errorf("Number of PCIe Switch Ports exceed allowed max of %d", maxPCIeSwitchPort) + } + qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit) + return nil + } + return nil +} + func (q *qemu) vhostFSSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket) } @@ -1612,6 +1697,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri } func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) { + err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID,
vAttr.SocketPath, false, false, vAttr.ReconnectTime) if err != nil { return err } @@ -1629,18 +1715,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V switch machineType { case QemuVirt: - if q.state.PCIeRootPort <= 0 { - return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt") - } - //The addr of a dev corresponds to device:function for PCIe in qemu, starting from 0 //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) - drivers.AllPCIeDevs[devID] = true + bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort])) + config.PCIeDevices[config.RootPort][devID] = true - bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId) + bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) bridgeSlot, err := q.qomGetSlot(bridgeQomPath) if err != nil { return err } @@ -1656,7 +1738,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V return err } - if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil { + if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil { return err } @@ -1770,41 +1852,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) { // Query QMP to find a device's PCI path given its QOM path or ID func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { - // XXX: For now we assume there's exactly one bridge, since - // that's always how we configure qemu from Kata for now. It - // would be good to generalize this to different PCI - // topologies + + var slots []types.PciSlot + devSlot, err := q.qomGetSlot(qemuID) if err != nil { return types.PciPath{}, err } + slots = append(slots, devSlot) - busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus") + // This only works for Q35 and Virt + r, _ := regexp.Compile(`^/machine/.*/pcie.0`) + + var parentPath = qemuID + // We do not want to use a forever loop here; a PCIe topology deeper + // than 5 is already not advisable. Just for the sake of having enough + // buffer we limit ourselves to 10 and leave the loop early if we hit + // the root bus. + for i := 1; i <= maxPCIeTopoDepth; i++ { + parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") + if err != nil { + return types.PciPath{}, err + } + + busQOM, ok := parenBusQOM.(string) + if !ok { + return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %T not a string", qemuID, parenBusQOM) + } + + // If we hit /machine/q35/pcie.0 we're done: this is the root bus + // and we have climbed the complete hierarchy + if r.Match([]byte(busQOM)) { + break + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI parent_bus which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component.
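+ // Hypothetical example: for a bus QOM path like /machine/peripheral/rp0/pcie.1, + // dropping the last component yields /machine/peripheral/rp0, the managing device.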
+ idx := strings.LastIndex(busQOM, "/") + if idx == -1 { + return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) + } + parentBus := busQOM[:idx] + + parentSlot, err := q.qomGetSlot(parentBus) + if err != nil { + return types.PciPath{}, err + } + + // Prepend the slots, since we're climbing the hierarchy + slots = append([]types.PciSlot{parentSlot}, slots...) + parentPath = parentBus + } + return types.PciPathFromSlots(slots...) +} + +func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { + return q.executeVFIODeviceAdd(device) +} + +func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) { + return q.executeVFIODeviceAdd(device) +} + +func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) { + addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI) if err != nil { - return types.PciPath{}, err + return err } - bus, ok := busq.(string) - if !ok { - return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq) - } + defer func() { + if err != nil { + q.arch.removeDeviceFromBridge(device.ID) + } + }() + return q.executePCIVFIODeviceAdd(device, addr, bridge.ID) +} - // `bus` is the QOM path of the QOM bus object, but we need - // the PCI bridge which manages that bus. There doesn't seem - // to be a way to get that other than to simply drop the last - // path component. - idx := strings.LastIndex(bus, "/") - if idx == -1 { - return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus) +func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error { + switch device.Type { + case config.VFIOPCIDeviceNormalType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile) + case config.VFIOPCIDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) + case config.VFIOAPDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + default: + return fmt.Errorf("Incorrect VFIO device type found") } - bridge := bus[:idx] +} - bridgeSlot, err := q.qomGetSlot(bridge) - if err != nil { - return types.PciPath{}, err +func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { + switch device.Type { + case config.VFIOPCIDeviceNormalType: + return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile) + case config.VFIOPCIDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) + case config.VFIOAPDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + default: + return fmt.Errorf("Incorrect VFIO device type found") } - - return types.PciPathFromSlots(bridgeSlot, devSlot) } func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) { @@ -1812,109 +1961,53 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op return err } - devID := *(*device).GetID() - machineType := q.HypervisorConfig().HypervisorMachineType - if op == AddDevice { - buf, _ := json.Marshal(device) q.Logger().WithFields(logrus.Fields{ - "machine-type": machineType, - 
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus, - "pcie-root-port": q.state.PCIeRootPort, - "device-info": string(buf), + "machine-type": q.HypervisorConfig().HypervisorMachineType, + "hot-plug-vfio": q.state.HotPlugVFIO, + "device-info": string(buf), }).Info("Start hot-plug VFIO device") - - // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus - // for pc machine type instead of bridge. This is useful for devices that require - // a large PCI BAR which is a currently a limitation with PCI bridges. - if q.state.HotplugVFIOOnRootBus { - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - // In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port. - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - switch machineType { - case QemuQ35: - if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 { - q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") - pciDevice.Bus = "" - } - default: - pciDevice.Bus = "" - } - *device = pciDevice - - if pciDevice.Type == config.VFIOPCIDeviceNormalType { - err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile) - } else { - err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile) - } - case config.VFIOAPDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev()) - } + // In case MachineType is q35, a PCIe device is hotplugged on + // a PCIe Root Port or alternatively on a PCIe Switch Port + if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt { + device.Bus = "" } else { - addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) + var err error + // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus + // for pc machine type instead of bridge. This is useful for devices that require + // a large PCI BAR which is a currently a limitation with PCI bridges. 
+ if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { + err = q.hotplugVFIODeviceRootPort(ctx, device) + } else if q.state.HotPlugVFIO == config.SwitchPort { + err = q.hotplugVFIODeviceSwitchPort(ctx, device) + } else { + err = q.hotplugVFIODeviceBridgePort(ctx, device) + } if err != nil { return err } - - defer func() { - if err != nil { - q.arch.removeDeviceFromBridge(devID) - } - }() - - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType: - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile) - case config.VFIOPCIDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile) - case config.VFIOAPDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev()) - default: - return fmt.Errorf("Incorrect VFIO device type found") - } - } - if err != nil { - return err - } - - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - // XXX: Depending on whether we're doing root port or - // bridge hotplug, and how the bridge is set up in - // other parts of the code, we may or may not already - // have information about the slot number of the - // bridge and or the device. For simplicity, just - // query both of them back from qemu - guestPciPath, err := q.qomGetPciPath(devID) - pciDevice.GuestPciPath = guestPciPath - *device = pciDevice - return err } + // XXX: Depending on whether we're doing root port or + // bridge hotplug, and how the bridge is set up in + // other parts of the code, we may or may not already + // have information about the slot number of the + // bridge and/or the device. For simplicity, just + // query both of them back from qemu + device.GuestPciPath, err = q.qomGetPciPath(device.ID) return err - } else { - q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") - - if !q.state.HotplugVFIOOnRootBus { - if err := q.arch.removeDeviceFromBridge(devID); err != nil { - return err - } - } - - return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID) } + + q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") + + if !q.state.HotplugVFIOOnRootBus { + if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { + return err + } + } + + return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) + } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { @@ -2612,7 +2705,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin for i := uint32(0); i < number; i++ { devices = append(devices, govmmQemu.PCIeRootPortDevice{ - ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i), + ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i), Bus: bus, Chassis: chassis, Slot: strconv.FormatUint(uint64(i), 10), @@ -2626,6 +2719,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin return devices } +// golangci-lint enforces multi-line comments to be a block comment +// not multiple single line comments ...
+/* pcie.0 bus +// ------------------------------------------------- +// | +// ------------- +// | Root Port | +// ------------- +// -------------------------|------------------------ +// | ----------------- | +// | PCI Express | Upstream Port | | +// | Switch ----------------- | +// | | | | +// | ------------------- ------------------- | +// | | Downstream Port | | Downstream Port | | +// | ------------------- ------------------- | +// -------------|-----------------------|------------ +// ------------- -------------- +// | GPU/ACCEL | | IB/ETH NIC | +// ------------- -------------- +*/ +// genericAppendPCIeSwitchPort adds a PCIe Switch +func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { + + // Q35, Virt have the correct PCIe support, + // hence ignore all other machines + if machineType != QemuQ35 && machineType != QemuVirt { + return devices + } + + // Use a dedicated ID for the root port so we do not clash with already + // existing root ports, adding "s" as the switch prefix + pcieRootPort := govmmQemu.PCIeRootPortDevice{ + ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0), + Bus: defaultBridgeBus, + Chassis: "1", + Slot: strconv.FormatUint(uint64(0), 10), + Multifunction: false, + Addr: "0", + MemReserve: fmt.Sprintf("%dB", memSize32bit), + Pref64Reserve: fmt.Sprintf("%dB", memSize64bit), + } + + devices = append(devices, pcieRootPort) + + pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{ + ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0), + Bus: pcieRootPort.ID, + } + devices = append(devices, pcieSwitchUpstreamPort) + + currentChassis, err := strconv.Atoi(pcieRootPort.Chassis) + if err != nil { + return devices + } + nextChassis := currentChassis + 1 + + for i := uint32(0); i < number; i++ { + + pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{ + ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i), + Bus: pcieSwitchUpstreamPort.ID, + Chassis: fmt.Sprintf("%d", nextChassis), + Slot: strconv.FormatUint(uint64(i), 10), + // TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit), + // TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit), + } + devices = append(devices, pcieSwitchDownstreamPort) + } + + return devices +} + func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) { span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() @@ -2801,7 +2967,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.UUID = q.state.UUID s.HotpluggedMemory = q.state.HotpluggedMemory s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus - s.PCIeRootPort = q.state.PCIeRootPort for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2825,7 +2990,6 @@ func (q *qemu) Load(s hv.HypervisorState) { q.state.HotpluggedMemory = s.HotpluggedMemory q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid - q.state.PCIeRootPort = s.PCIeRootPort for _, bridge := range s.Bridges { q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index d24953e61..4a6449273 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -26,6 +26,7 @@ import (
"google.golang.org/grpc/credentials/insecure" "github.com/intel-go/cpuid" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" ) @@ -182,7 +183,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { return q, nil } -func (q *qemuAmd64) capabilities() types.Capabilities { +func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities { var caps types.Capabilities if q.qemuMachine.Type == QemuQ35 || @@ -191,7 +192,9 @@ func (q *qemuAmd64) capabilities() types.Capabilities { } caps.SetMultiQueueSupport() - caps.SetFsSharingSupport() + if hConfig.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } return caps } @@ -323,6 +326,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware, ReducedPhysBits: 1, }), "", nil case noneProtection: + return devices, firmware, nil default: diff --git a/src/runtime/virtcontainers/qemu_amd64_test.go b/src/runtime/virtcontainers/qemu_amd64_test.go index 850118f69..17a537956 100644 --- a/src/runtime/virtcontainers/qemu_amd64_test.go +++ b/src/runtime/virtcontainers/qemu_amd64_test.go @@ -42,13 +42,14 @@ func TestQemuAmd64BadMachineType(t *testing.T) { func TestQemuAmd64Capabilities(t *testing.T) { assert := assert.New(t) + config := HypervisorConfig{} amd64 := newTestQemu(assert, QemuQ35) - caps := amd64.capabilities() + caps := amd64.capabilities(config) assert.True(caps.IsBlockDeviceHotplugSupported()) amd64 = newTestQemu(assert, QemuMicrovm) - caps = amd64.capabilities() + caps = amd64.capabilities(config) assert.False(caps.IsBlockDeviceHotplugSupported()) } diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 3de1aabc9..ead1a4a65 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -61,7 +61,7 @@ type qemuArch interface { kernelParameters(debug bool) []Param //capabilities returns the capabilities supported by QEMU - capabilities() types.Capabilities + capabilities(config HypervisorConfig) types.Capabilities // bridges sets the number bridges for the machine type bridges(number uint32) @@ -150,6 +150,9 @@ type qemuArch interface { // appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device + // appendPCIeSwitch appends a ioh3420 device to a pcie-root-port + appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device + // append vIOMMU device appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error) @@ -204,7 +207,8 @@ const ( defaultBridgeBus = "pcie.0" defaultPCBridgeBus = "pci.0" maxDevIDSize = 31 - pcieRootPortPrefix = "rp" + maxPCIeRootPort = 16 // Limitation from QEMU + maxPCIeSwitchPort = 16 // Limitation from QEMU ) // This is the PCI start address assigned to the first bridge that @@ -313,11 +317,13 @@ func (q *qemuArchBase) kernelParameters(debug bool) []Param { return params } -func (q *qemuArchBase) capabilities() types.Capabilities { +func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities { var caps types.Capabilities caps.SetBlockDeviceHotplugSupport() caps.SetMultiQueueSupport() - caps.SetFsSharingSupport() + if hConfig.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } return caps } @@ -708,17 
+714,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm } func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device { - pciDevice := vfioDev.(config.VFIOPCIDev) - if pciDevice.BDF == "" { + + if vfioDev.BDF == "" { return devices } devices = append(devices, govmmQemu.VFIODevice{ - BDF: pciDevice.BDF, - VendorID: pciDevice.VendorID, - DeviceID: pciDevice.DeviceID, - Bus: pciDevice.Bus, + BDF: vfioDev.BDF, + VendorID: vfioDev.VendorID, + DeviceID: vfioDev.DeviceID, + Bus: vfioDev.Bus, }, ) @@ -834,6 +840,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit) } +// appendPCIeSwitchPortDevice appends a PCIe Switch with ports +func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { + return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit) +} + +// getBARsMaxAddressableMemory: we need to know the BAR sizes to configure the +// PCIe Root Port or PCIe Downstream Port when attaching a device with huge BARs. func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) { pci := nvpci.New() diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 51c11bd91..75d7de029 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -117,9 +117,16 @@ func TestQemuArchBaseKernelParameters(t *testing.T) { func TestQemuArchBaseCapabilities(t *testing.T) { assert := assert.New(t) qemuArchBase := newQemuArchBase() + hConfig := HypervisorConfig{} + hConfig.SharedFS = config.VirtioFS - c := qemuArchBase.capabilities() + c := qemuArchBase.capabilities(hConfig) assert.True(c.IsBlockDeviceHotplugSupported()) + assert.True(c.IsFsSharingSupported()) + + hConfig.SharedFS = config.NoSharedFS + c = qemuArchBase.capabilities(hConfig) + assert.False(c.IsFsSharingSupported()) } func TestQemuArchBaseBridges(t *testing.T) { @@ -463,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) { }, } - vfDevice := config.VFIOPCIDev{ + vfDevice := config.VFIODev{ BDF: bdf, } @@ -483,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) { }, } - vfDevice := config.VFIOPCIDev{ + vfDevice := config.VFIODev{ BDF: bdf, VendorID: vendorID, DeviceID: deviceID, diff --git a/src/runtime/virtcontainers/qemu_ppc64le.go b/src/runtime/virtcontainers/qemu_ppc64le.go index 2d4010fbc..7d71d72ba 100644 --- a/src/runtime/virtcontainers/qemu_ppc64le.go +++ b/src/runtime/virtcontainers/qemu_ppc64le.go @@ -11,6 +11,7 @@ import ( "fmt" "time" + "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" "github.com/sirupsen/logrus" @@ -97,7 +98,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { return q, nil } -func (q *qemuPPC64le) capabilities() types.Capabilities { +func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities { var caps types.Capabilities // pseries machine type supports hotplugging drives @@ -106,7 +107,9 @@ func (q *qemuPPC64le) capabilities() types.Capabilities { } caps.SetMultiQueueSupport() -
caps.SetFsSharingSupport() + if hConfig.SharedFS != config.NoSharedFS { + caps.SetFsSharingSupport() + } return caps } diff --git a/src/runtime/virtcontainers/qemu_test.go b/src/runtime/virtcontainers/qemu_test.go index bfa348145..418075e26 100644 --- a/src/runtime/virtcontainers/qemu_test.go +++ b/src/runtime/virtcontainers/qemu_test.go @@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) { config6 := newQemuConfig() config6.DisableGuestSeLinux = false - config7 := newQemuConfig() - config7.PCIeRootPort = 1 - config8 := newQemuConfig() config8.EnableVhostUserStore = true config8.HugePages = true @@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) { {config3, false, true}, {config5, false, true}, {config6, false, false}, - {config7, false, true}, {config8, false, true}, {config9, true, false}, {config10, false, true}, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 530713475..a0c078012 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -36,7 +36,6 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" - hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol" exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" @@ -106,15 +105,10 @@ type HypervisorPidKey struct{} // SandboxStatus describes a sandbox status. type SandboxStatus struct { - ContainersStatus []ContainerStatus - - // Annotations allow clients to store arbitrary values, - // for example to add additional status values required - // to support particular specifications. - Annotations map[string]string - + Annotations map[string]string ID string Hypervisor HypervisorType + ContainersStatus []ContainerStatus State types.SandboxState HypervisorConfig HypervisorConfig } @@ -530,6 +524,7 @@ func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Fac return s, nil } +//nolint:gocyclo func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) { span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID}) defer span.End() @@ -630,22 +625,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. - coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort) - var devs []config.DeviceInfo + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) + // Aggregate all the container devices for hot-plug and use them to deduce + // the correct number of ports to reserve for the hypervisor. + hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) + + var vfioDevices []config.DeviceInfo + // vhost-user-block device is a PCIe device in Virt, keep track of it + // for the correct number of PCIe root ports.
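+ // Both aggregated lists are handed to the hypervisor below via + // HypervisorConfig.VFIODevices and HypervisorConfig.VhostUserBlkDevices.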
+ var vhostUserBlkDevices []config.DeviceInfo + for cnt, containers := range sandboxConfig.Containers { for dev, device := range containers.DeviceInfos { - if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) { + + if deviceManager.IsVhostUserBlk(device) { + vhostUserBlkDevices = append(vhostUserBlkDevices, device) + continue + } + isVFIO := deviceManager.IsVFIO(device.ContainerPath) + if hotPlugVFIO && isVFIO { + vfioDevices = append(vfioDevices, device) + sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO + } + if coldPlugVFIO && isVFIO { device.ColdPlug = true - devs = append(devs, device) + device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO + vfioDevices = append(vfioDevices, device) // We need to remove the devices marked for cold-plug // otherwise at the container level the kata-agent // will try to hot-plug them. - infos := sandboxConfig.Containers[cnt].DeviceInfos - infos = append(infos[:dev], infos[dev+1:]...) - sandboxConfig.Containers[cnt].DeviceInfos = infos + sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" } } + var filteredDevices []config.DeviceInfo + for _, device := range containers.DeviceInfos { + if device.ID != "remove-we-are-cold-plugging" { + filteredDevices = append(filteredDevices, device) + } + } + sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices + } + sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices + sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { @@ -660,7 +682,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } - for _, dev := range devs { + for _, dev := range vfioDevices { _, err := s.AddDevice(ctx, dev) if err != nil { s.Logger().WithError(err).Debug("Cannot cold-plug add device") @@ -1723,7 +1745,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { defer span.End() for i := range s.config.Containers { - c, err := newContainer(ctx, s, &s.config.Containers[i]) if err != nil { return err @@ -1742,7 +1763,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { if err := s.updateResources(ctx); err != nil { return err } - if err := s.resourceControllerUpdate(ctx); err != nil { return err } @@ -1754,7 +1774,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { if err := s.storeSandbox(ctx); err != nil { return err } - return nil } @@ -1918,15 +1937,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy // adding a group of VFIO devices for _, dev := range vfioDevices { if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil { - bdf := "" - if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDevice.BDF - } s.Logger(). 
WithFields(logrus.Fields{ "sandbox": s.id, - "vfio-device-ID": (*dev).GetID(), - "vfio-device-BDF": bdf, + "vfio-device-ID": dev.ID, + "vfio-device-BDF": dev.BDF, }).WithError(err).Error("failed to hotplug VFIO device") return err } @@ -1941,6 +1956,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy return err case config.VhostUserBlk: vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice) + if !ok { return fmt.Errorf("device type mismatch, expect device type to be %s", devType) } @@ -1975,15 +1991,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de // remove a group of VFIO devices for _, dev := range vfioDevices { if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil { - bdf := "" - if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDevice.BDF - } s.Logger().WithError(err). WithFields(logrus.Fields{ "sandbox": s.id, - "vfio-device-ID": (*dev).GetID(), - "vfio-device-BDF": bdf, + "vfio-device-ID": dev.ID, + "vfio-device-BDF": dev.BDF, }).Error("failed to hot unplug VFIO device") return err } diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index de3b1885c..90a2af7ee 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) { _, err = os.Create(deviceFile) assert.Nil(t, err) - savedIOMMUPath := config.SysIOMMUPath - config.SysIOMMUPath = tmpDir + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir defer func() { - config.SysIOMMUPath = savedIOMMUPath + config.SysIOMMUGroupPath = savedIOMMUPath }() dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil) diff --git a/tests/common.bash b/tests/common.bash index d4aa44684..090bd7fd9 100644 --- a/tests/common.bash +++ b/tests/common.bash @@ -240,3 +240,17 @@ restart_containerd_service() { clean_env_ctr return 0 } + +# @path_results: path to the input metric-results folder +# @tarball_fname: path and filename to the output tarball +function compress_metrics_results_dir() +{ + local path_results="${1:-results}" + local tarball_fname="${2:-}" + + [ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect." + [ ! -d "${path_results}" ] && die "Missing path to the results folder." + + cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd - + info "tarball generated: ${tarball_fname}" +} diff --git a/tests/integration/gha-run.sh b/tests/integration/gha-run.sh index 103ce2cda..c5cd573d6 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/gha-run.sh @@ -31,13 +31,16 @@ function login_azure() { } function create_cluster() { + # First, ensure that the cluster didn't fail to get cleaned up from a previous run. 
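+    # Ignore any failure here: on a fresh run the cluster does not exist yet +    # and delete_cluster would fail, hence the "|| true" on the call below.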
+ delete_cluster || true + az aks create \ -g "kataCI" \ -n "$(_print_cluster_name)" \ -s "Standard_D4s_v5" \ --node-count 1 \ --generate-ssh-keys \ - $([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku mariner --workload-runtime KataMshvVmIsolation") + $([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku AzureLinux --workload-runtime KataMshvVmIsolation") } function install_bats() { @@ -55,10 +58,28 @@ function get_cluster_credentials() { -n "$(_print_cluster_name)" } +function ensure_yq() { + : "${GOPATH:=${GITHUB_WORKSPACE}}" + export GOPATH + export PATH="${GOPATH}/bin:${PATH}" + INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh" +} + function run_tests() { platform="${1}" + ensure_yq + + # Ensure we're in the default namespace + kubectl config set-context --current --namespace=default + + # Delete any spurious tests namespace that was left behind + kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" + if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS" + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}" + fi cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image" @@ -80,6 +101,10 @@ function run_tests() { sleep 60s fi + # Create a new namespace for the tests and switch to it + kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml + kubectl config set-context --current --namespace=kata-containers-k8s-tests + pushd "${integration_dir}/kubernetes" bash setup.sh bash run_kubernetes_tests.sh @@ -89,6 +114,10 @@ function cleanup() { platform="${1}" + # Switch back to the default namespace and delete the tests one + kubectl config set-context --current --namespace=default + kubectl delete namespace kata-containers-k8s-tests + if [ "${platform}" = "tdx" ]; then deploy_spec="-k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s"" cleanup_spec="-k "${tools_dir}/packaging/kata-deploy/kata-cleanup/overlays/k3s"" @@ -115,11 +144,12 @@ function delete_cluster() { az aks delete \ -g "kataCI" \ -n "$(_print_cluster_name)" \ - --yes \ - --no-wait + --yes } function main() { + export KATA_HOST_OS="${KATA_HOST_OS:-}" + action="${1:-}" case "${action}" in diff --git a/tests/integration/kubernetes/k8s-pod-quota.bats b/tests/integration/kubernetes/k8s-pod-quota.bats index addc37bb3..d9a527725 100644 --- a/tests/integration/kubernetes/k8s-pod-quota.bats +++ b/tests/integration/kubernetes/k8s-pod-quota.bats @@ -14,13 +14,12 @@ setup() { @test "Pod quota" { resource_name="pod-quota" deployment_name="deploymenttest" - namespace="test-quota-ns" # Create the resourcequota kubectl create -f "${pod_config_dir}/resource-quota.yaml" # View information about resourcequota - kubectl get -n "$namespace" resourcequota "$resource_name" \ + kubectl get resourcequota "$resource_name" \ --output=yaml | grep 'pods: "2"' # Create deployment @@ -28,10 +27,9 @@ setup() { # View deployment kubectl wait
--for=condition=Available --timeout=$timeout \ - -n "$namespace" deployment/${deployment_name} + deployment/${deployment_name} } teardown() { - kubectl delete -n "$namespace" deployment "$deployment_name" kubectl delete -f "${pod_config_dir}/resource-quota.yaml" } diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh index 0975ec0d5..db1e16633 100644 --- a/tests/integration/kubernetes/run_kubernetes_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_tests.sh @@ -54,10 +54,6 @@ else ) fi -if [ ${KATA_HOST_OS} == "cbl-mariner" ]; then - exit 0 -fi - # we may need to skip a few test cases when running on non-x86_64 arch arch_config_file="${kubernetes_dir}/filter_out_per_arch/${TARGET_ARCH}.yaml" if [ -f "${arch_config_file}" ]; then diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml index 680577a5f..6341b0b1f 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-custom-dns.yaml @@ -6,7 +6,6 @@ apiVersion: v1 kind: Pod metadata: - namespace: default name: custom-dns-test spec: terminationGracePeriodSeconds: 0 diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml index 672c54e68..90fc28667 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-oom.yaml @@ -8,7 +8,6 @@ apiVersion: v1 kind: Pod metadata: name: pod-oom - namespace: default spec: runtimeClassName: kata restartPolicy: Never diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml index ecdaf5e64..9383349d7 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/pod-quota-deployment.yaml @@ -7,7 +7,6 @@ apiVersion: apps/v1 kind: Deployment metadata: name: deploymenttest - namespace: test-quota-ns spec: selector: matchLabels: diff --git a/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml b/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml index a8d84d9ad..8ae0a1998 100644 --- a/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml +++ b/tests/integration/kubernetes/runtimeclass_workloads/resource-quota.yaml @@ -14,7 +14,6 @@ items: kind: ResourceQuota metadata: name: pod-quota - namespace: test-quota-ns spec: hard: pods: "2" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml b/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml new file mode 100644 index 000000000..916003d13 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/tests-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kata-containers-k8s-tests diff --git a/tests/integration/kubernetes/setup.sh b/tests/integration/kubernetes/setup.sh index 0c3baf2dc..614f38827 100755 --- a/tests/integration/kubernetes/setup.sh +++ b/tests/integration/kubernetes/setup.sh @@ -13,8 +13,24 @@ set_runtime_class() { sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml } +set_kernel_path() { + if [[ 
"${KATA_HOST_OS}" = "cbl-mariner" ]]; then + mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin" + find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \; + fi +} + +set_initrd_path() { + if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then + initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img" + find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \; + fi +} + main() { set_runtime_class + set_kernel_path + set_initrd_path } main "$@" diff --git a/tests/metrics/README.md b/tests/metrics/README.md index 1dd996046..d017ed3fc 100644 --- a/tests/metrics/README.md +++ b/tests/metrics/README.md @@ -55,6 +55,8 @@ For further details see the [time tests documentation](time). Tests that measure the size and overheads of the runtime. Generally this is looking at memory footprint sizes, but could also cover disk space or even CPU consumption. +For further details see the [density tests documentation](density). + ### Networking Tests relating to networking. General items could include: diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml new file mode 100644 index 000000000..562b2c83b --- /dev/null +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -0,0 +1,34 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# This file contains baseline expectations +# for checked results by checkmetrics tool. +# +# values set specifically for packet.com c1.small worker. + +[[metric]] +name = "boot-times" +type = "json" +description = "measure container lifecycle timings" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" +checktype = "mean" +midval = 0.42 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "memory-footprint" +type = "json" +description = "measure memory usage" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"memory-footprint\".Results | .[] | .average.Result" +checktype = "mean" +midval = 2518364.00 +minpercent = 20.0 +maxpercent = 20.0 diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml new file mode 100644 index 000000000..c6bc85147 --- /dev/null +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -0,0 +1,34 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# This file contains baseline expectations +# for checked results by checkmetrics tool. +# +# values set specifically for Equinix m3.small.x86. 
+
+[[metric]]
+name = "boot-times"
+type = "json"
+description = "measure container lifecycle timings"
+# Min and Max values to set a 'range' that
+# the mean of the JSON Results data must fall
+# within (inclusive)
+checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
+checktype = "mean"
+midval = 0.61
+minpercent = 20.0
+maxpercent = 20.0
+
+[[metric]]
+name = "memory-footprint"
+type = "json"
+description = "measure memory usage"
+# Min and Max values to set a 'range' that
+# the mean of the JSON Results data must fall
+# within (inclusive)
+checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
+checktype = "mean"
+midval = 2435844.00
+minpercent = 20.0
+maxpercent = 20.0
diff --git a/tests/metrics/density/README.md b/tests/metrics/density/README.md
new file mode 100644
index 000000000..e07ee18b3
--- /dev/null
+++ b/tests/metrics/density/README.md
@@ -0,0 +1,53 @@
+# Kata Containers density metrics tests
+
+This directory contains a number of tests to help measure container
+memory footprint. Some measures are based around the
+[PSS](https://en.wikipedia.org/wiki/Proportional_set_size) of the runtime
+components, and others look at the system level (`free` and `/proc/meminfo`
+for instance) impact.
+
+## `memory_usage`
+
+This test measures the PSS footprint of the runtime components whilst
+launching a number of small ([BusyBox](https://hub.docker.com/_/busybox/)) containers
+using ctr.
+
+## `fast_footprint`
+
+This test takes system level resource measurements after launching a number of
+containers in parallel and optionally waiting for KSM to settle its memory
+compaction cycles.
+
+The script is quite configurable via environment variables, including:
+
+* Which container workload to run.
+* How many containers to launch.
+* How many containers are launched in parallel.
+* How long to wait until taking the measures.
+
+See the script itself for more details.
+
+This test shares many config options with the `footprint_data` test. Thus, referring
+to the [footprint test documentation](footprint_data.md) may be useful.
+
+> *Note:* If this test finds KSM is enabled on the host, it will wait for KSM
+> to "settle" before taking the final measurement. If your KSM is not configured
+> to process all the allocated VM memory fast enough, the test will hit a timeout
+> and proceed to take the final measurement anyway.
+
+## `footprint_data`
+
+Similar to the `fast_footprint` test, but this test launches the containers
+sequentially and takes a system level measurement between each launch. Thus,
+this test provides finer grained information on system scaling, but takes
+significantly longer to run than the `fast_footprint` test. If you are only
+interested in the final figure or the average impact, you may be better off
+running the `fast_footprint` test.
+
+For more details see the [footprint test documentation](footprint_data.md).
+
+## `memory_usage_inside_container`
+
+Measures the memory statistics *inside* the container. This allows evaluation of
+the overhead that the VM kernel and rootfs impose on the memory that was requested
+by the container co-ordination system, and thus supplied to the VM.
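+
+## Example invocation
+
+As a minimal sketch, the `memory_usage` test takes a container count and a
+settle time (the values below mirror the ones used by the metrics CI entry
+point; the optional `auto` KSM-settle mode is described in the script help):
+
+```
+$ bash tests/metrics/density/memory_usage.sh 20 5
+```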
diff --git a/tests/metrics/density/fast_footprint.sh b/tests/metrics/density/fast_footprint.sh new file mode 100755 index 000000000..9c84f57fc --- /dev/null +++ b/tests/metrics/density/fast_footprint.sh @@ -0,0 +1,433 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# A script to gather memory 'footprint' information as we launch more +# and more containers +# +# The script gathers information about both user and kernel space consumption +# Output is into a .json file, named using some of the config component names +# (such as footprint-busybox.json) + +# Pull in some common, useful, items +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +# Note that all vars that can be set from outside the script (that is, +# passed in the ENV), use the ':-' setting to allow being over-ridden + +# Default sleep, in seconds, to let containers come up and finish their +# initialisation before we take the measures. Some of the larger +# containers can take a number of seconds to get running. +PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}" + +# How long, in seconds, do we wait for KSM to 'settle down', before we +# timeout and just continue anyway. +KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}" + +# How long, in seconds, do we poll for ctr to complete launching all the +# containers? +CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}" + +# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP +# nap +PARALLELISM="${PARALLELISM:-10}" + +### The default config - run a small busybox image +# Define what we will be running (app under test) +# Default is we run busybox, as a 'small' workload +PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}" +PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}" + +### +# which RUNTIME we use is picked up from the env in +# common.bash. You can over-ride by setting RUNTIME in your env + +### +# Define the cutoff checks for when we stop running the test + # Run up to this many containers +NUM_CONTAINERS="${NUM_CONTAINERS:-100}" + # Run until we have consumed this much memory (from MemFree) +MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}" + # Run until we have this much MemFree left +MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}" + +# Tools we need to have installed in order to operate +REQUIRED_COMMANDS="smem awk" + +# If we 'dump' the system caches before we measure then we get less +# noise in the results - they show more what our un-reclaimable footprint is +DUMP_CACHES="${DUMP_CACHES:-1}" + +# Affects the name of the file to store the results in +TEST_NAME="${TEST_NAME:-fast-footprint-busybox}" + +############# end of configurable items ################### + +# vars to remember where we started so we can calc diffs +base_mem_avail=0 +base_mem_free=0 + +# dump the kernel caches, so we get a more precise (or just different) +# view of what our footprint really is. 
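+# (Writing 3 to /proc/sys/vm/drop_caches frees the page cache as well as
+# reclaimable slab objects such as dentries and inodes; it needs root,
+# hence the sudo below.)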
+function dump_caches() {
+	sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
+}
+
+function init() {
+	restart_containerd_service
+
+	check_cmds $REQUIRED_COMMANDS
+	sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
+
+	# Modify the test name if running with KSM enabled
+	check_for_ksm
+
+	# Use the common init func to get to a known state
+	init_env
+
+	# Prepare to start storing results
+	metrics_json_init
+
+	# Store up baseline measures
+	base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+	base_mem_free=$(get_memfree)
+
+	# Store our configuration for this run
+	save_config
+}
+
+save_config(){
+	metrics_json_start_array
+
+	local json="$(cat << EOF
+	{
+		"testname": "${TEST_NAME}",
+		"payload": "${PAYLOAD}",
+		"payload_args": "${PAYLOAD_ARGS}",
+		"payload_sleep": ${PAYLOAD_SLEEP},
+		"ksm_settle_time": ${KSM_WAIT_TIME},
+		"num_containers": ${NUM_CONTAINERS},
+		"parallelism": ${PARALLELISM},
+		"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
+		"min_memory_free": "${MIN_MEMORY_FREE}",
+		"dump_caches": "${DUMP_CACHES}"
+	}
+EOF
+)"
+	metrics_json_add_array_element "$json"
+	metrics_json_end_array "Config"
+}
+
+function cleanup() {
+	# Finish storing the results
+	metrics_json_save
+
+	clean_env_ctr
+}
+
+# helper function to get USS of process in arg1
+function get_proc_uss() {
+	item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
+	((item*=1024))
+	echo $item
+}
+
+# helper function to get PSS of process in arg1
+function get_proc_pss() {
+	item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
+	((item*=1024))
+	echo $item
+}
+
+# Get the PSS for the whole of userspace (all processes)
+# This allows us to see if we had any impact on the rest of the system, for instance
+# containerd grows as we launch containers, so we should account for that in our total
+# memory breakdown
+function grab_all_pss() {
+	item=$(sudo smem -t | tail -1 | awk '{print $5}')
+	((item*=1024))
+
+	local json="$(cat << EOF
+	"all_pss": {
+		"pss": $item,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function grab_user_smem() {
+	# userspace
+	item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
+	((item*=1024))
+
+	local json="$(cat << EOF
+	"user_smem": {
+		"userspace": $item,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function grab_slab() {
+	# Grabbing slab total from meminfo is easier than doing the math
+	# on slabinfo
+	item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
+	((item*=1024))
+
+	local json="$(cat << EOF
+	"slab": {
+		"slab": $item,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function get_memfree() {
+	mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
+	((mem_free*=1024))
+	echo $mem_free
+}
+
+function grab_system() {
+
+	# avail memory, from 'free'
+	local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+	local avail_decr=$((base_mem_avail-avail))
+
+	# cached memory, from 'free'
+	local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
+
+	# free memory from smem
+	local smem_free=$(get_memfree)
+	local free_decr=$((base_mem_free-smem_free))
+
+	# Anon pages
+	local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
+	((anon*=1024))
+
+	# Mapped pages
+	local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
+	((mapped*=1024))
+
+	# Cached
+	local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
+	((meminfo_cached*=1024))
+
+	local json="$(cat << EOF
+	"system": {
+		"avail": $avail,
+		"avail_decr": $avail_decr,
+		"cached": $cached,
+		"smem_free": $smem_free,
+		"free_decr": $free_decr,
+		"anon": $anon,
+		"mapped": $mapped,
+		"meminfo_cached": $meminfo_cached,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function grab_stats() {
+	# If configured, dump the caches so we get a more stable
+	# view of what our static footprint really is
+	if [[ "$DUMP_CACHES" ]] ; then
+		dump_caches
+	fi
+
+	# user space data
+	# PSS taken all userspace
+	grab_all_pss
+	# user as reported by smem
+	grab_user_smem
+
+	# System overview data
+	# System free and cached
+	grab_system
+
+	# kernel data
+	# The 'total kernel space taken' we can work out as:
+	# ktotal = ((free-avail)-user)
+	# So, we don't grab that number from smem, as that is what it does
+	# internally anyhow.
+	# Still try to grab any finer kernel details that we can though
+
+	# totals from slabinfo
+	grab_slab
+
+	metrics_json_close_array_element
+}
+
+function check_limits() {
+	mem_free=$(get_memfree)
+	if ((mem_free <= MIN_MEMORY_FREE)); then
+		echo 1
+		return
+	fi
+
+	mem_consumed=$((base_mem_avail-mem_free))
+	if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
+		echo 1
+		return
+	fi
+
+	echo 0
+}
+
+launch_containers() {
+	local parloops leftovers
+
+	(( parloops=${NUM_CONTAINERS}/${PARALLELISM} ))
+	(( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) ))
+
+	echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} extras"
+
+	containers=()
+
+	local iter n
+	for iter in $(seq 1 $parloops); do
+		echo "Launch iteration ${iter}"
+		for n in $(seq 1 $PARALLELISM); do
+			containers+=($(random_name))
+			sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
+		done
+
+		if [[ $PAYLOAD_SLEEP ]]; then
+			sleep $PAYLOAD_SLEEP
+		fi
+
+		# check if we have hit one of our limits and need to wrap up the tests
+		if (($(check_limits))); then
+			echo "Ran out of resources, check_limits failed"
+			return
+		fi
+	done
+
+	for n in $(seq 1 $leftovers); do
+		containers+=($(random_name))
+		sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
+	done
+}
+
+wait_containers() {
+	local t numcontainers
+	# nap 3s between checks
+	local step=3
+
+	for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do
+
+		numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l)
+
+		if (( numcontainers >= ${NUM_CONTAINERS} )); then
+			echo "All containers now launched (${t}s)"
+			return
+		else
+			echo "Waiting for containers to launch (${numcontainers} at ${t}s)"
+		fi
+		sleep ${step}
+	done
+
+	echo "Timed out waiting for containers to launch (${t}s)"
+	cleanup
+	die "Timed out waiting for containers to launch (${t}s)"
+}
+
+function go() {
+	# Init the json cycle for this save
+	metrics_json_start_array
+
+	# Grab the first set of stats before we run any containers.
+	grab_stats
+
+	launch_containers
+	wait_containers
+
+	if [ $ksm_on == "1" ]; then
+		echo "Waiting for KSM to settle..."
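+		# wait_ksm_settle (lib/common.bash) polls /sys/kernel/mm/ksm between
+		# full scans and returns early once pages_shared moves by 5% or less,
+		# otherwise it gives up after the timeout passed here.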
+		wait_ksm_settle ${KSM_WAIT_TIME}
+	fi
+
+	grab_stats
+
+	# Wrap up the results array
+	metrics_json_end_array "Results"
+}
+
+function show_vars()
+{
+	echo -e "\nEnvironment variables:"
+	echo -e "\tName (default)"
+	echo -e "\t\tDescription"
+	echo -e "\tPAYLOAD (${PAYLOAD})"
+	echo -e "\t\tThe ctr image to run"
+	echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
+	echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
+	echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
+	echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
+	echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})"
+	echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure"
+	echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})"
+	echo -e "\t\tSeconds to poll for ctr to finish launching containers"
+	echo -e "\tPARALLELISM (${PARALLELISM})"
+	echo -e "\t\tNumber of containers we launch in parallel"
+	echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})"
+	echo -e "\t\tThe total number of containers to run"
+	echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
+	echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
+	echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
+	echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
+	echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
+	echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
+	echo -e "\tTEST_NAME (${TEST_NAME})"
+	echo -e "\t\tCan be set to over-ride the default JSON results filename"
+
+}
+
+function help()
+{
+	usage=$(cat << EOF
+Usage: $0 [-h] [options]
+	Description:
+	Launch a series of workloads and take memory metric measurements after
+	each launch.
+	Options:
+		-h,	Help page.
+EOF
+)
+	echo "$usage"
+	show_vars
+}
+
+function main() {
+
+	local OPTIND
+	while getopts "h" opt;do
+		case ${opt} in
+		h)
+			help
+			exit 0;
+			;;
+		esac
+	done
+	shift $((OPTIND-1))
+
+	init
+	go
+	cleanup
+}
+
+main "$@"
diff --git a/tests/metrics/density/footprint_data.md b/tests/metrics/density/footprint_data.md
new file mode 100644
index 000000000..b9ba27fe0
--- /dev/null
+++ b/tests/metrics/density/footprint_data.md
@@ -0,0 +1,87 @@
+# Footprint data script details
+
+The `footprint_data.sh` script runs a number of identical containers sequentially
+via ctr and takes a number of memory related measurements after each
+launch. The script is generally not used in a CI type environment, but is intended
+to be run and analyzed manually.
+
+You can configure the script by setting a number of environment variables.
+
+The following sections list details of the configurable variables, along with a
+small example invocation script.
+
+## Variables
+Environment variables can take effect in two ways.
+
+Some variables affect how the payload is executed. The `RUNTIME` and `PAYLOAD`
+arguments directly affect the payload execution with the following line in
+the script:
+
+`$ ctr run --memory-limit $PAYLOAD_RUNTIME_ARGS --rm --runtime=$CONTAINERD_RUNTIME $PAYLOAD $NAME sh -c $PAYLOAD_ARGS`
+
+Other settings affect how memory footprint is measured and the test termination
+conditions.
+ +| Variable | Function +| -------- | -------- +| `PAYLOAD` | The ctr image to run +| `PAYLOAD_ARGS` | Any arguments passed into the ctr image +| `PAYLOAD_RUNTIME_ARGS` | Any extra arguments passed into the ctr `run` command +| `PAYLOAD_SLEEP` | Seconds to sleep between launch and measurement, to allow settling +| `MAX_NUM_CONTAINERS` | The maximum number of containers to run before terminating +| `MAX_MEMORY_CONSUMED` | The maximum amount of memory to be consumed before terminating +| `MIN_MEMORY_FREE` | The minimum amount of memory allowed to be free before terminating +| `DUMP_CACHES` | A flag to note if the system caches should be dumped before capturing stats +| `DATAFILE` | Can be set to over-ride the default JSON results filename + +## Output files +The names of the JSON files generated by the test are dictated by some of the parameters +the test is utilising. The default filename is generated in the form of: +`footprint-${PAYLOAD}[-ksm].json` + +## Measurements +The test measures, calculates, and stores a number of data items: + +| Item | Description +| ---- | ----------- +| `uss` | USS for all the VM runtime components +| `pss` | PSS for all the VM runtime components +| `all_pss` | PSS of all of userspace - to monitor if we had other impact on the system +| `user_smem` | `smem` "userspace" consumption value +| `avail` | "available" memory from `free` +| `avail_decr` | "available" memory decrease since start of test +| `cached` | "Cached" memory from `/proc/meminfo` +| `smem_free` | Free memory as reported by `smem` +| `free_decr` | Decrease in Free memory reported by `smem` since start of test +| `anon` | `AnonPages` as reported from `/proc/meminfo` +| `mapped` | Mapped pages as reported from `/proc/meminfo` +| `cached` | Cached pages as reported from `/proc/meminfo` +| `slab` | Slab as reported from `/proc/meminfo` + +## Example script +The following script is an example of how to configure the environment variables and +invoke the test script to run a number of different container tests. 
+ +``` +#!/bin/bash + +set -e +set -x + +export MAX_NUM_CONTAINERS=10 +export MAX_MEMORY_CONSUMED=6*1024*1024*1024 + +function run() { + ### + # Define what we will be running (app under test) + # Default is we run busybox, as a 'small' workload + export PAYLOAD="quay.io/prometheus/busybox:latest" + export PAYLOAD_ARGS="tail -f /dev/null" + export PAYLOAD_SLEEP=10 + export PAYLOAD_RUNTIME_ARGS="5120" + sudo -E bash $(pwd)/density/footprint_data.sh +} + +export CONTAINERD_RUNTIME=io.containerd.kata.v2 +run +``` diff --git a/tests/metrics/density/footprint_data.sh b/tests/metrics/density/footprint_data.sh new file mode 100755 index 000000000..f5fc11341 --- /dev/null +++ b/tests/metrics/density/footprint_data.sh @@ -0,0 +1,360 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# A script to gather memory 'footprint' information as we launch more +# and more containers +# +# The script gathers information about both user and kernel space consumption +# Output is into a .json file, named using some of the config component names +# (such as footprint-busybox.json) + +# Pull in some common, useful, items +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run" + +# Note that all vars that can be set from outside the script (that is, +# passed in the ENV), use the ':-' setting to allow being over-ridden + +# Default sleep for 10s to let containers come up and finish their +# initialisation before we take the measures. Some of the larger +# containers can take a number of seconds to get running. +PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}" + +### The default config - run a small busybox image +# Define what we will be running (app under test) +# Default is we run busybox, as a 'small' workload +PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}" +PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}" + +### +# Define the cutoff checks for when we stop running the test + # Run up to this many containers +MAX_NUM_CONTAINERS="${MAX_NUM_CONTAINERS:-10}" + # Run until we have consumed this much memory (from MemFree) +MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-6*1024*1024*1024}" + # Run until we have this much MemFree left +MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}" + +# Tools we need to have installed in order to operate +REQUIRED_COMMANDS="smem awk" + +# If we 'dump' the system caches before we measure then we get less +# noise in the results - they show more what our un-reclaimable footprint is +DUMP_CACHES="${DUMP_CACHES:-1}" + +# Affects the name of the file to store the results in +TEST_NAME="${TEST_NAME:-footprint-busybox}" + +############# end of configurable items ################### + +# vars to remember where we started so we can calc diffs +base_mem_avail=0 +base_mem_free=0 + +# dump the kernel caches, so we get a more precise (or just different) +# view of what our footprint really is. 
+function dump_caches() {
+	sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
+}
+
+function init() {
+	restart_containerd_service
+
+	check_cmds $REQUIRED_COMMANDS
+	sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
+
+	# Modify the test name if running with KSM enabled
+	check_for_ksm
+
+	# Use the common init func to get to a known state
+	init_env
+
+	# Prepare to start storing results
+	metrics_json_init
+
+	# Store up baseline measures
+	base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+	base_mem_free=$(get_memfree)
+
+	# Store our configuration for this run
+	save_config
+}
+
+save_config(){
+	metrics_json_start_array
+
+	local json="$(cat << EOF
+	{
+		"testname": "${TEST_NAME}",
+		"payload": "${PAYLOAD}",
+		"payload_args": "${PAYLOAD_ARGS}",
+		"payload_sleep": ${PAYLOAD_SLEEP},
+		"max_containers": ${MAX_NUM_CONTAINERS},
+		"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
+		"min_memory_free": "${MIN_MEMORY_FREE}",
+		"dump_caches": "${DUMP_CACHES}"
+	}
+EOF
+)"
+	metrics_json_add_array_element "$json"
+	metrics_json_end_array "Config"
+}
+
+function cleanup() {
+	# Finish storing the results
+	metrics_json_save
+
+	clean_env_ctr
+}
+
+# helper function to get USS of process in arg1
+function get_proc_uss() {
+	item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
+	((item*=1024))
+	echo $item
+}
+
+# helper function to get PSS of process in arg1
+function get_proc_pss() {
+	item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
+	((item*=1024))
+	echo $item
+}
+
+# Get the PSS for the whole of userspace (all processes)
+# This allows us to see if we had any impact on the rest of the system, for instance
+# containerd grows as we launch containers, so we should account for that in our total
+# memory breakdown
+function grab_all_pss() {
+	item=$(sudo smem -t | tail -1 | awk '{print $5}')
+	((item*=1024))
+
+	local json="$(cat << EOF
+	"all_pss": {
+		"pss": $item,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function grab_user_smem() {
+	# userspace
+	item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
+	((item*=1024))
+
+	local json="$(cat << EOF
+	"user_smem": {
+		"userspace": $item,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function grab_slab() {
+	# Grabbing slab total from meminfo is easier than doing the math
+	# on slabinfo
+	item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
+	((item*=1024))
+
+	local json="$(cat << EOF
+	"slab": {
+		"slab": $item,
+		"Units": "KB"
+	}
+EOF
+)"
+
+	metrics_json_add_array_fragment "$json"
+}
+
+function get_memfree() {
+	mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
+	((mem_free*=1024))
+	echo $mem_free
+}
+
+function grab_system() {
+	# avail memory, from 'free'
+	local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
+	local avail_decr=$((base_mem_avail-avail))
+
+	# cached memory, from 'free'
+	local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
+
+	# free memory from smem
+	local smem_free=$(get_memfree)
+	local free_decr=$((base_mem_free-smem_free))
+
+	# Anon pages
+	local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
+	((anon*=1024))
+
+	# Mapped pages
+	local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
+	((mapped*=1024))
+
+	# Cached
+	local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
+	((meminfo_cached*=1024))
+
+	local json="$(cat << EOF
+	"system": {
+		"avail": $avail,
+		"avail_decr": $avail_decr,
+		"cached": $cached,
+		"smem_free": $smem_free,
"free_decr": $free_decr, + "anon": $anon, + "mapped": $mapped, + "meminfo_cached": $meminfo_cached, + "Units": "KB" + } +EOF +)" + + metrics_json_add_array_fragment "$json" +} + +function grab_stats() { + # If configured, dump the caches so we get a more stable + # view of what our static footprint really is + if [[ "$DUMP_CACHES" ]] ; then + dump_caches + fi + + # user space data + # PSS taken all userspace + grab_all_pss + # user as reported by smem + grab_user_smem + + # System overview data + # System free and cached + grab_system + + # kernel data + # The 'total kernel space taken' we can work out as: + # ktotal = ((free-avail)-user) + # So, we don't grab that number from smem, as that is what it does + # internally anyhow. + # Still try to grab any finer kernel details that we can though + + # totals from slabinfo + grab_slab + + metrics_json_close_array_element +} + +function check_limits() { + mem_free=$(get_memfree) + if ((mem_free <= MIN_MEMORY_FREE)); then + echo 1 + return + fi + + mem_consumed=$((base_mem_avail-mem_free)) + if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then + echo 1 + return + fi + + echo 0 +} + +function go() { + # Init the json cycle for this save + metrics_json_start_array + + containers=() + + for i in $(seq 1 $MAX_NUM_CONTAINERS); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run --rm --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS + + if [[ $PAYLOAD_SLEEP ]]; then + sleep $PAYLOAD_SLEEP + fi + + grab_stats + + # check if we have hit one of our limits and need to wrap up the tests + if (($(check_limits))); then + # Wrap up the results array + metrics_json_end_array "Results" + return + fi + done + + # Wrap up the results array + metrics_json_end_array "Results" +} + + +function show_vars() +{ + echo -e "\nEvironment variables:" + echo -e "\tName (default)" + echo -e "\t\tDescription" + echo -e "\tPAYLOAD (${PAYLOAD})" + echo -e "\t\tThe ctr image to run" + echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})" + echo -e "\t\tAny extra arguments passed into the ctr 'run' command" + echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})" + echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling" + echo -e "\tMAX_NUM_CONTAINERS (${MAX_NUM_CONTAINERS})" + echo -e "\t\tThe maximum number of containers to run before terminating" + echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})" + echo -e "\t\tThe maximum amount of memory to be consumed before terminating" + echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})" + echo -e "\t\tThe path to the ctr binary (for 'smem' measurements)" + echo -e "\tDUMP_CACHES (${DUMP_CACHES})" + echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats" + echo -e "\tTEST_NAME (${TEST_NAME})" + echo -e "\t\tCan be set to over-ride the default JSON results filename" + +} + +function help() +{ + usage=$(cat << EOF +Usage: $0 [-h] [options] + Description: + Launch a series of workloads and take memory metric measurements after + each launch. + Options: + -h, Help page. 
+EOF
+)
+	echo "$usage"
+	show_vars
+}
+
+function main() {
+
+	local OPTIND
+	while getopts "h" opt;do
+		case ${opt} in
+		h)
+			help
+			exit 0;
+			;;
+		esac
+	done
+	shift $((OPTIND-1))
+
+	init
+	go
+	cleanup
+}
+
+main "$@"
diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh
new file mode 100755
index 000000000..57d2ce3cd
--- /dev/null
+++ b/tests/metrics/density/memory_usage.sh
@@ -0,0 +1,383 @@
+#!/bin/bash
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Description of the test:
+# This test launches a number of containers in idle mode.
+# It will then sleep for a configurable period of time to allow
+# any memory optimisations to 'settle', and then checks the
+# amount of memory used by all the containers to come up with
+# an average (using the PSS measurements)
+# This test uses the smem tool to get the memory used.
+
+set -e
+
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+
+# Busybox image: Choose a small workload image, this is
+# in order to measure the runtime footprint, not the workload
+# footprint.
+IMAGE='quay.io/prometheus/busybox:latest'
+
+CMD='tail -f /dev/null'
+NUM_CONTAINERS="$1"
+WAIT_TIME="$2"
+AUTO_MODE="$3"
+TEST_NAME="memory footprint"
+SMEM_BIN="smem"
+KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
+MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX)
+PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX)
+
+function remove_tmp_file() {
+	rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}"
+}
+
+trap remove_tmp_file EXIT
+
+# Show help about this script
+function help(){
+cat << EOF
+Usage: $0 <num_containers> <wait_time> [auto]
+   Description:
+       <num_containers> : Number of containers to run.
+       <wait_time>      : Time in seconds to wait before taking
+                          metrics.
+       [auto]           : Optional 'auto KSM settle' mode
+                          waits for ksm pages_shared to settle down
+EOF
+}
+
+
+function get_runc_pss_memory(){
+	ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2"
+	get_pss_memory "${ctr_runc_shim_path}"
+}
+
+function get_runc_individual_memory() {
+	runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g')
+
+	# Verify runc process result
+	if [ -z "${runc_process_result}" ];then
+		die "Runc process not found"
+	fi
+
+	read -r -a runc_values <<< "${runc_process_result}"
+
+	metrics_json_start_array
+
+	local json="$(cat << EOF
+	{
+		"runc individual results": [
+			$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
+				printf '%s\n\t\t\t' "${runc_values[i]}"
+			done)
+		]
+	}
+EOF
+)"
+	metrics_json_add_array_element "$json"
+	metrics_json_end_array "Raw results"
+}
+
+# This function measures the PSS average
+# memory of a process.
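+# (PSS, the proportional set size, divides each shared page evenly between
+# the processes mapping it, so summing PSS across processes avoids
+# double-counting shared memory.)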
+function get_pss_memory(){ + ps="$1" + mem_amount=0 + count=0 + avg=0 + + if [ -z "${ps}" ]; then + die "No argument to get_pss_memory()" + fi + + # Save all the processes names + # This will be help us to retrieve raw information + echo "${ps}" >> "${PS_TMP_FILE}" + + data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//') + + # Save all the smem results + # This will help us to retrieve raw information + echo "${data}" >> "${MEM_TMP_FILE}" + + gral_data=$(echo "${data// /+}" | bc) + for i in "${gral_data}"; do + if (( $i > 0 ));then + mem_amount=$(( i + mem_amount )) + (( count++ )) + fi + done + + if (( "${count}" > 0 ));then + avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}") + fi + + echo "${avg}" +} + +function ppid() { + local pid + pid=$(ps -p "${1:-nopid}" -o ppid=) + echo "${pid//[[:blank:]]/}" +} + +# This function measures the PSS average +# memory of virtiofsd. +# It is a special case of get_pss_memory, +# virtiofsd forks itself so, smem sees the process +# two times, this function sum both pss values: +# pss_virtiofsd=pss_fork + pss_parent +function get_pss_memory_virtiofsd() { + mem_amount=0 + count=0 + avg=0 + + virtiofsd_path=${1:-} + if [ -z "${virtiofsd_path}" ]; then + die "virtiofsd_path not provided" + fi + + echo "${virtiofsd_path}" >> "${PS_TMP_FILE}" + + virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}' | head -1) + data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss") + + for p in "${virtiofsd_pids}"; do + parent_pid=$(ppid "${p}") + cmd="$(cat /proc/${p}/cmdline | tr -d '\0')" + cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')" + if [ "${cmd}" != "${cmd_parent}" ]; then + pss_parent=$(printf "%s" "${data}" | grep "\s^${p}" | awk '{print $2}') + + fork=$(pgrep -P "${p}") + + pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}') + pss_process=$((pss_fork + pss_parent)) + + # Save all the smem results + # This will help us to retrieve raw information + echo "${pss_process}" >>"${MEM_TMP_FILE}" + + if ((pss_process > 0)); then + mem_amount=$((pss_process + mem_amount)) + ((count++)) + fi + fi + done + + if (( "${count}" > 0 ));then + avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}") + fi + echo "${avg}" +} + +function get_individual_memory(){ + # Getting all the individual container information + first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g') + + second_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + second_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==2' | sed 's/ /, /g') + + third_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g') + third_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==3' | sed 's/ /, /g') + + read -r -a first_values <<< "${first_process_result}" + read -r -a second_values <<< "${second_process_result}" + read -r -a third_values <<< "${third_process_result}" + + metrics_json_start_array + + local json="$(cat << EOF + { + "${first_process_name} memory": [ + $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do + [ -n "${first_values[i]}" ] && + printf '%s\n\t\t\t' "${first_values[i]}" + done) + ], + "${second_process_name} memory": [ + $(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do + [ -n "${second_values[i]}" ] && + printf '%s\n\t\t\t' "${second_values[i]}" + done) + 
],
+		"${third_process_name} memory": [
+			$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
+				[ -n "${third_values[i]}" ] &&
+				printf '%s\n\t\t\t' "${third_values[i]}"
+			done)
+		]
+	}
+EOF
+)"
+	metrics_json_add_array_element "$json"
+	metrics_json_end_array "Raw results"
+}
+
+# Try to work out the 'average memory footprint' of a container.
+function get_memory_usage(){
+	hypervisor_mem=0
+	virtiofsd_mem=0
+	shim_mem=0
+	memory_usage=0
+
+	containers=()
+
+	info "Creating ${NUM_CONTAINERS} containers"
+	for ((i=1; i<="${NUM_CONTAINERS}"; i++)); do
+		containers+=($(random_name))
+		sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" sh -c "${CMD}"
+	done
+
+	if [ "${AUTO_MODE}" == "auto" ]; then
+		if (( ksm_on != 1 )); then
+			die "KSM not enabled, cannot use auto mode"
+		fi
+
+		echo "Entering KSM settle auto detect mode..."
+		wait_ksm_settle "${WAIT_TIME}"
+	else
+		# If KSM is enabled, then you normally want to sleep long enough to
+		# let it do its work and for the numbers to 'settle'.
+		echo "napping ${WAIT_TIME} s"
+		sleep "${WAIT_TIME}"
+	fi
+
+	metrics_json_start_array
+	# Check the runtime in order to determine which process will
+	# be measured for PSS
+	if [ "${RUNTIME}" == "runc" ]; then
+		runc_workload_mem="$(get_runc_pss_memory)"
+		memory_usage="${runc_workload_mem}"
+
+		local json="$(cat << EOF
+	{
+		"average": {
+			"Result": ${memory_usage},
+			"Units" : "KB"
+		},
+		"runc": {
+			"Result": ${runc_workload_mem},
+			"Units" : "KB"
+		}
+	}
+EOF
+)"
+
+	else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ]
+		# Get PSS memory of VM runtime components.
+		# And check that the smem search has found the process - we get a "0"
+		# back if that procedure fails (such as if a process has changed its name
+		# or is not running when expected to be so)
+		# As an added bonus - this script must be run as root.
+		# If you do not have enough rights, the smem failure to read
+		# the stats will also be trapped.
+ + hypervisor_mem="$(get_pss_memory ${HYPERVISOR_PATH})" + if [ "${hypervisor_mem}" == "0" ]; then + die "Failed to find PSS for ${HYPERVISOR_PATH}" + fi + + virtiofsd_mem="$(get_pss_memory_virtiofsd ${VIRTIOFSD_PATH})" + if [ "${virtiofsd_mem}" == "0" ]; then + echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}" + fi + shim_mem="$(get_pss_memory ${SHIM_PATH})" + if [ "${shim_mem}" == "0" ]; then + die "Failed to find PSS for ${SHIM_PATH}" + fi + + mem_usage="$(bc -l <<< "scale=2; ${hypervisor_mem} +${virtiofsd_mem} + ${shim_mem}")" + memory_usage="${mem_usage}" + + local json="$(cat << EOF + { + "average": { + "Result": ${mem_usage}, + "Units" : "KB" + }, + "qemus": { + "Result": ${hypervisor_mem}, + "Units" : "KB" + }, + "virtiofsds": { + "Result": ${virtiofsd_mem}, + "Units" : "KB" + }, + "shims": { + "Result": ${shim_mem}, + "Units" : "KB" + } + } +EOF +)" + fi + + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" + + clean_env_ctr +} + +function save_config(){ + metrics_json_start_array + + local json="$(cat << EOF + { + "containers": "${NUM_CONTAINERS}", + "ksm": "${ksm_on}", + "auto": "${AUTO_MODE}", + "waittime": "${WAIT_TIME}", + "image": "${IMAGE}", + "command": "${CMD}" + } +EOF + +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Config" +} + +function main(){ + # Verify enough arguments + if [ $# != 2 ] && [ $# != 3 ];then + echo >&2 "error: Not enough arguments [$@]" + help + exit 1 + fi + + #Check for KSM before reporting test name, as it can modify it + check_for_ksm + + init_env + + check_cmds "${SMEM_BIN}" bc + check_images "${IMAGE}" + + if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then + export RUNTIME="kata-runtime" + elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then + export RUNTIME="runc" + else + die "Unknown runtime ${CTR_RUNTIME}" + fi + + metrics_json_init + save_config + get_memory_usage + + if [ "$RUNTIME" == "runc" ]; then + get_runc_individual_memory + elif [ "$RUNTIME" == "kata-runtime" ]; then + get_individual_memory + fi + + metrics_json_save +} + +main "$@" diff --git a/tests/metrics/density/memory_usage_inside_container.sh b/tests/metrics/density/memory_usage_inside_container.sh new file mode 100755 index 000000000..071ded175 --- /dev/null +++ b/tests/metrics/density/memory_usage_inside_container.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +# Description of the test: +# This test launches a busybox container and inside +# memory free, memory available and total memory +# is measured by using /proc/meminfo. + +set -e + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +TEST_NAME="memory footprint inside container" +VERSIONS_FILE="${SCRIPT_PATH}/../../versions.yaml" +IMAGE='quay.io/prometheus/busybox:latest' +CMD="sleep 10; cat /proc/meminfo" +# We specify here in 'k', as that then matches the results we get from the meminfo, +# which makes later direct comparison easier. 
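+# The default of 2048*1024 k is a 2 GiB limit; main() converts it to bytes
+# (MEMSIZE*1024) when passing --memory-limit to ctr.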
+MEMSIZE=${MEMSIZE:-$((2048*1024))}
+
+# this variable determines the number of retries allowed when a test
+# result is considered invalid (a zero or a negative value)
+MAX_FAILED_ATTEMPTS=3
+memtotalAvg=0
+units_memtotal=""
+memfreeAvg=0
+units_memfree=""
+memavailableAvg=0
+units_memavailable=""
+
+# count_iters: the index of the current iteration
+count_iters=0
+
+# valid_result: if the value stored is '1' the result is valid, '0' otherwise
+valid_result=0
+
+parse_results() {
+	local raw_results="${1}"
+
+	# Variables used to sum cumulative values when there are two or more
+	# repetitions, and to compute average results for the 'json' output format.
+	local memtotal_acu="${2:-0}"
+	local memfree_acu="${3:-0}"
+	local memavailable_acu="${4:-0}"
+
+	local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}')
+	units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}')
+
+	local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}')
+	units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}')
+
+	local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}')
+	units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}')
+
+	# check results: if any result is zero or negative, it is considered invalid, and the test will be repeated.
+	if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then
+		MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1))
+		valid_result=0
+		info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
+		return 0
+	fi
+
+	memtotalAvg=$((memtotal+memtotal_acu))
+	memfreeAvg=$((memfree+memfree_acu))
+	memavailableAvg=$((memavailable+memavailable_acu))
+	valid_result=1
+	info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
+}
+
+store_results_json() {
+	metrics_json_start_array
+	memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc)
+	memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc)
+	memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc)
+
+	local json="$(cat << EOF
+	{
+		"memrequest": {
+			"Result" : ${MEMSIZE},
+			"Units" : "Kb"
+		},
+		"memtotal": {
+			"Result" : ${memtotalAvg},
+			"Units" : "${units_memtotal}"
+		},
+		"memfree": {
+			"Result" : ${memfreeAvg},
+			"Units" : "${units_memfree}"
+		},
+		"memavailable": {
+			"Result" : ${memavailableAvg},
+			"Units" : "${units_memavailable}"
+		},
+		"repetitions": {
+			"Result" : ${count_iters}
+		}
+	}
+EOF
+)"
+	metrics_json_add_array_element "$json"
+	metrics_json_end_array "Results"
+	metrics_json_save
+}
+
+function main() {
+	# number of times to run the workload and take the measurements
+	local num_iterations=${1:-1}
+	info "Iterations: $num_iterations"
+
+	# Check tools/commands dependencies
+	cmds=("awk" "ctr")
+	init_env
+	check_cmds "${cmds[@]}"
+	check_images "${IMAGE}"
+	metrics_json_init
+	while [ $count_iters -lt $num_iterations ]; do
+		local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1)
+		parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}"
+
+		# quit if the number of failed attempts exceeds the allowed value.
+		[ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded."
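+		# Only iterations that produced a valid (positive) set of values are
+		# counted; invalid ones are retried and consume a failed attempt instead.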
+ [ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1)) + done + store_results_json + clean_env_ctr +} + +# Parameters +# @1: num_iterations {integer} +main "$@" diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index cdff6c766..5f8bfbf08 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -9,24 +9,28 @@ set -o errexit set -o nounset set -o pipefail -kata_tarball_dir=${2:-kata-artifacts} +kata_tarball_dir="${2:-kata-artifacts}" metrics_dir="$(dirname "$(readlink -f "$0")")" source "${metrics_dir}/../common.bash" +source "${metrics_dir}/lib/common.bash" -create_symbolic_links() { - hypervisor="${1:-qemu}" +declare -r results_dir="${metrics_dir}/results" +declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics" +declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker" + +function create_symbolic_links() { local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml" - local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml" + local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml" - if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then - die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name." + if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then + die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name." fi sudo ln -sf "${source_configuration_file}" "${link_configuration_file}" } # Configures containerd -overwrite_containerd_config() { +function overwrite_containerd_config() { containerd_config="/etc/containerd/config.toml" sudo rm "${containerd_config}" sudo tee "${containerd_config}" << EOF @@ -44,7 +48,7 @@ version = 2 EOF } -install_kata() { +function install_kata() { local kata_tarball="kata-static.tar.xz" declare -r katadir="/opt/kata" declare -r destdir="/" @@ -53,7 +57,7 @@ install_kata() { # Removing previous kata installation sudo rm -rf "${katadir}" - pushd ${kata_tarball_dir} + pushd "${kata_tarball_dir}" sudo tar -xvf "${kata_tarball}" -C "${destdir}" popd @@ -64,17 +68,26 @@ install_kata() { check_containerd_config_for_kata restart_containerd_service + install_checkmetrics } -check_containerd_config_for_kata() { +function install_checkmetrics() { + # Ensure we have the latest checkmetrics + pushd "${checkmetrics_dir}" + make + sudo make install + popd +} + +function check_containerd_config_for_kata() { # check containerd config declare -r line1="default_runtime_name = \"kata\"" declare -r line2="runtime_type = \"io.containerd.kata.v2\"" declare -r num_lines_containerd=2 declare -r containerd_path="/etc/containerd/config.toml" - local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path}) + local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}") - if [ $count_matches = $num_lines_containerd ]; then + if [ "${count_matches}" = "${num_lines_containerd}" ]; then info "containerd ok" else info "overwriting containerd configuration w/ a valid one" @@ -82,21 +95,62 @@ check_containerd_config_for_kata() { fi } +function check_metrics() { + local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml" + checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}" + cm_result=$? 
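+	# checkmetrics exits non-zero when any result in ${results_dir} falls
+	# outside the baseline band defined in the TOML basefile.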
+ if [ "${cm_result}" != 0 ]; then + die "run-metrics-ci: checkmetrics FAILED (${cm_result})" + fi +} + +function make_tarball_results() { + compress_metrics_results_dir "${metrics_dir}/results" "${GITHUB_WORKSPACE}/results-${KATA_HYPERVISOR}.tar.gz" +} + function run_test_launchtimes() { - hypervisor="${1}" + info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor" - info "Running Launch Time test using ${hypervisor} hypervisor" - - create_symbolic_links "${hypervisor}" + create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 } +function run_test_memory_usage() { + info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor" + + create_symbolic_links + bash tests/metrics/density/memory_usage.sh 20 5 + + check_metrics +} + +function run_test_memory_usage_inside_container() { + info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor" + + # ToDo: remove the exit once the metrics workflow is stable + exit 0 + create_symbolic_links + bash tests/metrics/density/memory_usage_inside_container.sh 5 +} + +function run_test_blogbench() { + info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor" + + # ToDo: remove the exit once the metrics workflow is stable + exit 0 + create_symbolic_links + bash tests/metrics/storage/blogbench.sh +} + function main() { action="${1:-}" case "${action}" in install-kata) install_kata ;; - run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;; - run-test-launchtimes-clh) run_test_launchtimes "clh" ;; + make-tarball-results) make_tarball_results ;; + run-test-launchtimes) run_test_launchtimes ;; + run-test-memory-usage) run_test_memory_usage ;; + run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; + run-test-blogbench) run_test_blogbench ;; *) >&2 die "Invalid argument" ;; esac } diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index ad7b2a5c4..0bb31030d 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -47,14 +47,14 @@ quay.io/libpod" # # cmds=(“cmd1” “cmd2”) # check_cmds "${cmds[@]}" -check_cmds() +function check_cmds() { local cmd req_cmds=( "$@" ) for cmd in "${req_cmds[@]}"; do if ! command -v "$cmd" > /dev/null 2>&1; then die "command $cmd not available" fi - echo "command: $cmd: yes" + info "command: $cmd: yes" done } @@ -68,19 +68,20 @@ check_cmds() # # images=(“img1” “img2”) # check_imgs "${images[@]}" -check_images() +function check_images() { local img req_images=( "$@" ) for img in "${req_images[@]}"; do - echo "ctr pull'ing: $img" + info "ctr pull'ing: $img" if ! sudo "${CTR_EXE}" image pull "$img"; then die "Failed to pull image $img" fi - echo "ctr pull'd: $img" + info "ctr pull'd: $img" done } -generate_build_dockerfile() { +function generate_build_dockerfile() +{ local dockerfile="$1" local image="$2" local map_key="$3" @@ -99,14 +100,14 @@ generate_build_dockerfile() { # This function performs a build on the image names # passed in, to ensure that we have the latest changes from # the dockerfiles -build_dockerfile_image() +function build_dockerfile_image() { local image="$1" local dockerfile_path="$2" local dockerfile_dir=${2%/*} if [ -f "$dockerfile_path" ]; then - echo "docker building $image" + info "docker building $image" if ! 
sudo "${DOCKER_EXE}" build --build-arg http_proxy="${http_proxy}" --build-arg https_proxy="${https_proxy}" --label "$image" --tag "${image}" -f "$dockerfile_path" "$dockerfile_dir"; then die "Failed to docker build image $image" fi @@ -119,7 +120,7 @@ build_dockerfile_image() # This function removes the ctr image, builds a new one using a dockerfile # and imports the image from docker to ctr -check_ctr_images() +function check_ctr_images() { local ctr_image="$1" local dockerfile_path="$2" @@ -138,7 +139,7 @@ check_ctr_images() # A one time (per uber test cycle) init that tries to get the # system to a 'known state' as much as possible -metrics_onetime_init() +function metrics_onetime_init() { # The onetime init must be called once, and only once if [ ! -z "$onetime_init_done" ]; then @@ -155,14 +156,14 @@ metrics_onetime_init() # Print a banner to the logs noting clearly which test # we are about to run -test_banner() +function test_banner() { - echo -e "\n===== starting test [$1] =====" + info -e "\n===== starting test [$1] =====" } # Initialization/verification environment. This function makes # minimal steps for metrics/tests execution. -init_env() +function init_env() { test_banner "${TEST_NAME}" @@ -183,7 +184,8 @@ init_env() # This function checks if there are containers or # shim/proxy/hypervisor processes up, if found, they are # killed to start test with clean environment. -kill_processes_before_start() { +function kill_processes_before_start() +{ DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q) [[ -n "${DOCKER_PROCS}" ]] && clean_env @@ -195,26 +197,29 @@ kill_processes_before_start() { # Generate a random name - generally used when creating containers, but can # be used for any other appropriate purpose -random_name() { +function random_name() +{ mktemp -u kata-XXXXXX } -show_system_ctr_state() { - echo "Showing system state:" - echo " --Check containers--" +function show_system_ctr_state() +{ + info "Showing system state:" + info " --Check containers--" sudo "${CTR_EXE}" c list - echo " --Check tasks--" + info " --Check tasks--" sudo "${CTR_EXE}" task list local processes="containerd-shim-kata-v2" for p in ${processes}; do - echo " --pgrep ${p}--" + info " --pgrep ${p}--" pgrep -a ${p} done } -common_init(){ +function common_init() +{ if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then extract_kata_env else @@ -225,17 +230,18 @@ common_init(){ fi } - # Save the current KSM settings so we can restore them later -save_ksm_settings(){ - echo "saving KSM settings" +function save_ksm_settings() +{ + info "saving KSM settings" ksm_stored_run=$(cat ${KSM_ENABLE_FILE}) ksm_stored_pages=$(cat ${KSM_ENABLE_FILE}) ksm_stored_sleep=$(cat ${KSM_ENABLE_FILE}) } -set_ksm_aggressive(){ - echo "setting KSM to aggressive mode" +function set_ksm_aggressive() +{ + info "setting KSM to aggressive mode" # Flip the run off/on to ensure a restart/rescan sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}" sudo bash -c "echo ${KSM_AGGRESIVE_PAGES} > ${KSM_PAGES_FILE}" @@ -245,7 +251,7 @@ set_ksm_aggressive(){ if [ "${KATA_HYPERVISOR}" == "qemu" ]; then # Disable virtio-fs and save whether it was enabled previously set_virtio_out=$(sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-9p) - echo "${set_virtio_out}" + info "${set_virtio_out}" grep -q "already" <<< "${set_virtio_out}" || was_virtio_fs=true; fi } @@ -256,8 +262,9 @@ restore_virtio_fs(){ info "Not restoring virtio-fs since it wasn't enabled previously" } -restore_ksm_settings(){ - 
echo "restoring KSM settings" +function restore_ksm_settings() +{ + info "restoring KSM settings" # First turn off the run to ensure if we are then re-enabling # that any changes take effect sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}" @@ -267,15 +274,17 @@ restore_ksm_settings(){ [ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs } -disable_ksm(){ - echo "disabling KSM" +function disable_ksm() +{ + info "disabling KSM" sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}" [ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs } # See if KSM is enabled. # If so, amend the test name to reflect that -check_for_ksm(){ +function check_for_ksm() +{ if [ ! -f ${KSM_ENABLE_FILE} ]; then return fi @@ -294,7 +303,8 @@ check_for_ksm(){ # a full scan has managed to do few new merges) # # arg1 - timeout in seconds -wait_ksm_settle(){ +function wait_ksm_settle() +{ [[ "$RUNTIME" == "runc" ]] || [[ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]] && return local t pcnt local oldscan=-1 newscan @@ -305,7 +315,7 @@ wait_ksm_settle(){ # Wait some time for KSM to kick in to avoid early dismissal for ((t=0; t<5; t++)); do pages=$(cat "${KSM_PAGES_SHARED}") - [[ "$pages" -ne 0 ]] && echo "Discovered KSM activity" && break + [[ "$pages" -ne 0 ]] && info "Discovered KSM activity" && break sleep 1 done @@ -315,13 +325,13 @@ wait_ksm_settle(){ newscan=$(cat /sys/kernel/mm/ksm/full_scans) newpages=$(cat "${KSM_PAGES_SHARED}") - [[ "$newpages" -eq 0 ]] && echo "No need to wait for KSM to settle" && return + [[ "$newpages" -eq 0 ]] && info "No need to wait for KSM to settle" && return if (( newscan != oldscan )); then - echo -e "\nnew full_scan ($oldscan to $newscan)" + info -e "\nnew full_scan ($oldscan to $newscan)" # Do we have a previous scan to compare with - echo "check pages $oldpages to $newpages" + info "check pages $oldpages to $newpages" if (( oldpages != -1 )); then # avoid divide by zero problems @@ -330,14 +340,14 @@ wait_ksm_settle(){ # abs() pcnt=$(( $pcnt * -1 )) - echo "$oldpages to $newpages is ${pcnt}%" + info "$oldpages to $newpages is ${pcnt}%" if (( $pcnt <= 5 )); then - echo "KSM stabilised at ${t}s" + info "KSM stabilised at ${t}s" return fi else - echo "$oldpages KSM pages... waiting" + info "$oldpages KSM pages... 
waiting" fi fi oldscan=$newscan @@ -347,7 +357,7 @@ wait_ksm_settle(){ fi sleep 1 done - echo "Timed out after ${1}s waiting for KSM to settle" + info "Timed out after ${1}s waiting for KSM to settle" } common_init diff --git a/tests/metrics/storage/blogbench.sh b/tests/metrics/storage/blogbench.sh new file mode 100755 index 000000000..19a960103 --- /dev/null +++ b/tests/metrics/storage/blogbench.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# +# Copyright (c) 2018-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Description of the test: +# This test runs the 'blogbench', and extracts the 'scores' for reads +# and writes +# Note - the scores are *not* normalised for the number of iterations run, +# they are total scores for all iterations (this is the blogbench default output) + +set -e + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +TEST_NAME="blogbench" +IMAGE="docker.io/library/local-blogbench:latest" +DOCKERFILE="${SCRIPT_PATH}/blogbench_dockerfile/Dockerfile" + +# Number of iterations for blogbench to run - note, results are not +# scaled to iterations - more iterations results in bigger results +ITERATIONS="${ITERATIONS:-30}" + +# Directory to run the test on +# This is run inside of the container +TESTDIR="${TESTDIR:-/tmp}" +CMD="blogbench -i ${ITERATIONS} -d ${TESTDIR}" + +function main() { + # Check tools/commands dependencies + cmds=("awk" "docker") + + init_env + check_cmds "${cmds[@]}" + check_ctr_images "${IMAGE}" "${DOCKERFILE}" + metrics_json_init + + local output=$(sudo -E ${CTR_EXE} run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test ${CMD}) + + # Save configuration + metrics_json_start_array + + local frequency=$(echo "${output}" | grep "Frequency" | cut -d "=" -f2 | cut -d ' ' -f2) + local iterations=$(echo "${output}" | grep -w "iterations" | cut -d ' ' -f3) + local spawing_writers=$(echo "${output}" | grep -w "writers" | cut -d ' ' -f2) + local spawing_rewriters=$(echo "${output}" | grep -w "rewriters" | cut -d ' ' -f2) + local spawing_commenters=$(echo "${output}" | grep -w "commenters" | cut -d ' ' -f2) + local spawing_readers=$(echo "${output}" | grep -w "readers" | cut -d ' ' -f2) + + local json="$(cat << EOF + { + "Frequency" : ${frequency}, + "Iterations" : ${iterations}, + "Number of spawing writers" : ${spawing_writers}, + "Number of spawing rewriters" : ${spawing_rewriters}, + "Number of spawing commenters" : ${spawing_commenters}, + "Number of spawing readers" : ${spawing_readers} + } +EOF +)" + metrics_json_add_array_element "${json}" + metrics_json_end_array "Config" + + # Save results + metrics_json_start_array + + local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}') + local reads=$(tail -1 <<< "${output}" | awk '{print $6}') + + # Obtaining other Blogbench results + local -r data=$(echo "${output}" | tail -n +12 | head -n -3) + local nb_blogs=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $1} ' | tr '\t' ',' | sed '$ s/.$//') + local r_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $2} ' | tr '\t' ',' | sed '$ s/.$//') + local w_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $3} ' | tr '\t' ',' | sed '$ s/.$//') + local r_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $4} ' | tr '\t' ',' | sed '$ s/.$//') + local w_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $5} ' | tr '\t' ',' | sed '$ s/.$//') + local r_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $6} ' | tr '\t' ',' | sed '$ s/.$//') + local w_comments=$(echo 
"${data}" | awk ' BEGIN {ORS="\t"} {print $7} ' | tr '\t' ',' | sed '$ s/.$//') + + local json="$(cat << EOF + { + "write": { + "Result" : "${writes}", + "Units" : "items" + }, + "read": { + "Result" : "${reads}", + "Units" : "items" + }, + "Nb blogs": { + "Result" : "${nb_blogs}" + }, + "R articles": { + "Result" : "${r_articles}" + }, + "W articles": { + "Result" : "${w_articles}" + }, + "R pictures": { + "Result" : "${r_pictures}" + }, + "W pictures": { + "Result" : "${w_pictures}" + }, + "R comments": { + "Result" : "${r_comments}" + }, + "W comments": { + "Result" : "${w_comments}" + } + } +EOF +)" + + metrics_json_add_array_element "${json}" + metrics_json_end_array "Results" + metrics_json_save + clean_env_ctr +} + +main "$@" diff --git a/tests/metrics/storage/blogbench_dockerfile/Dockerfile b/tests/metrics/storage/blogbench_dockerfile/Dockerfile new file mode 100644 index 000000000..593063798 --- /dev/null +++ b/tests/metrics/storage/blogbench_dockerfile/Dockerfile @@ -0,0 +1,32 @@ +# Copyright (c) 2018-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Set up an Ubuntu image with 'blogbench' installed + +# Usage: FROM [image name] +# hadolint ignore=DL3007 +FROM docker.io/library/ubuntu:latest + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +# URL for blogbench test and blogbench version +ENV BLOGBENCH_URL "https://download.pureftpd.org/pub/blogbench" +ENV BLOGBENCH_VERSION 1.1 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential curl && \ + apt-get remove -y unattended-upgrades && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/ && \ + curl -OkL "${BLOGBENCH_URL}/blogbench-${BLOGBENCH_VERSION}.tar.gz" && \ + tar xzf "blogbench-${BLOGBENCH_VERSION}.tar.gz" -C / +WORKDIR "/blogbench-${BLOGBENCH_VERSION}" +RUN arch="$(uname -m)" && \ + export arch && \ + ./configure --build="${arch}" && \ + make && \ + make install-strip + +CMD ["/bin/bash"] diff --git a/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in b/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in new file mode 100755 index 000000000..8190d8560 --- /dev/null +++ b/tests/metrics/storage/web-tooling-dockerfile/Dockerfile.in @@ -0,0 +1,28 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Set up an Ubuntu image with 'web tooling' installed + +# Usage: FROM [image name] +# hadolint ignore=DL3007 +FROM @UBUNTU_REGISTRY@/ubuntu:latest + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +# URL for web tooling test +ENV WEB_TOOLING_URL "https://github.com/v8/web-tooling-benchmark" +ENV NODEJS_VERSION "setup_14.x" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential git curl sudo && \ + apt-get remove -y unattended-upgrades && \ + curl -OkL https://deb.nodesource.com/${NODEJS_VERSION} && chmod +x ${NODEJS_VERSION} && ./${NODEJS_VERSION} && \ + apt-get install -y --no-install-recommends nodejs && \ + apt-get clean && rm -rf /var/lib/apt/lists && \ + git clone ${WEB_TOOLING_URL} /web-tooling-benchmark +WORKDIR /web-tooling-benchmark/ +RUN npm install --unsafe-perm + +CMD ["/bin/bash"] diff --git a/tests/metrics/storage/webtooling.sh b/tests/metrics/storage/webtooling.sh new file mode 100755 index 000000000..f82849618 --- /dev/null +++ b/tests/metrics/storage/webtooling.sh @@ -0,0 +1,283 @@ +#!/bin/bash +# +# Copyright (c) 2020-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Description of the test: +# This test runs the 'web tooling 
+
+set -o pipefail
+
+# General env
+SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
+source "${SCRIPT_PATH}/../lib/common.bash"
+NUM_CONTAINERS="${1:-}"
+
+IMAGE="docker.io/library/local-web-tooling:latest"
+DOCKERFILE="${SCRIPT_PATH}/web-tooling-dockerfile/Dockerfile"
+
+# Directory to run the test inside of the container
+TESTDIR="${TESTDIR:-/testdir}"
+file_path="/web-tooling-benchmark"
+file_name="output"
+
+# Directory where the webtooling results are stored
+TMP_DIR=$(mktemp --tmpdir -d webtool.XXXXXXXXXX)
+
+# Options to control the start of the workload using a trigger-file
+dst_dir="/host"
+src_dir=$(mktemp --tmpdir -d webtool.XXXXXXXXXX)
+trigger_file="$RANDOM.txt"
+guest_trigger_file="$dst_dir/$trigger_file"
+host_trigger_file="$src_dir/$trigger_file"
+start_script="webtooling_start.sh"
+
+# CMD points to the script that starts the workload
+CMD="$dst_dir/$start_script"
+MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro"
+PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
+
+# This timeout is related to the amount of time that
+# the webtool benchmark needs to run inside the container
+timeout=600
+INITIAL_NUM_PIDS=1
+
+cpu_period="100000"
+cpu_quota="200000"
+
+function remove_tmp_dir() {
+	rm -rf "$TMP_DIR"
+	rm -rf "$src_dir"
+}
+
+trap remove_tmp_dir EXIT
+
+# Show help about this script
+function help(){
+cat << EOF
+Usage: $0 <count>
+	Description:
+		<count>: Number of containers to run.
EOF
+}
+
+# Script used to launch the workload inside the container
+function create_start_script() {
+	local script="${src_dir}/${start_script}"
+	rm -rf "${script}"
+
+cat << EOF >> "${script}"
+#!/bin/bash
+mkdir -p "${TESTDIR}"
+
+until [ -f ${guest_trigger_file} ]; do
+	sleep 1
+done
+pushd "${file_path}"
+node dist/cli.js > "${file_name}"
EOF
+	chmod +x "${script}"
+}
+
+function verify_task_is_completed_on_all_containers() {
+	local containers=( $(sudo -E "${CTR_EXE}" c list -q) )
+	local sleep_secs=10
+	local max=$(bc <<<"${timeout} / ${sleep_secs}")
+	local wip_list=()
+	local count=1
+	local sum=0
+	local i=""
+
+	while (( ${sum} < ${NUM_CONTAINERS} )); do
+
+		for i in "${containers[@]}"; do
+			# Only check containers that have not completed the workload at this step
+			num_pids=$(sudo -E "${CTR_EXE}" t metrics "${i}" | grep pids.current | xargs | cut -d ' ' -f 2)
+
+			if [ "${num_pids}" -lt "${INITIAL_NUM_PIDS}" ]; then
+				((sum++))
+			else
+				wip_list+=("${i}")
+			fi
+		done
+
+		# Hold the list of containers that are still running the workload
+		containers=(${wip_list[*]})
+		wip_list=()
+
+		info "loop ${count} of ${max}: sleeping for ${sleep_secs} seconds"
+		sleep "${sleep_secs}"
+		((count++))
+	done
+}
+
+function check_containers_are_up() {
+	info "Verify that the containers are running"
+	local containers_launched=0
+	while (( $containers_launched < ${NUM_CONTAINERS} )); do
+		containers_launched="$(sudo -E ${CTR_EXE} t list | grep -c "RUNNING")"
+		sleep 1
+	done
+}
+
+function save_config() {
+	metrics_json_start_array
+
+	local json="$(cat << EOF
+	{
+		"containers": "${NUM_CONTAINERS}",
+		"image": "${IMAGE}",
+		"units": "runs/s"
+	}
EOF
+)"
+	metrics_json_add_array_element "${json}"
+	metrics_json_end_array "Config"
+}
+
+function main() {
+	# Verify enough arguments
+	if [ $# != 1 ]; then
+		echo >&2 "error: Not enough arguments [$@]"
+		help
+		exit 1
+	fi
+
+	local i=0
+	local containers=()
+	local cmds=("docker")
+	local not_started_count=$NUM_CONTAINERS
+
+	restart_containerd_service
+	# Check tools/commands dependencies
+	init_env
+	check_cmds "${cmds[@]}"
+	check_ctr_images "$IMAGE" "$DOCKERFILE"
+	metrics_json_init
+	save_config
+	create_start_script
+	rm -rf "${host_trigger_file}"
+
+	info "Creating ${NUM_CONTAINERS} containers"
+
+	for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do
+		containers+=($(random_name))
+		# The web tooling benchmark needs 2 CPUs to reach full CPU utilization
+		sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --cpu-quota "${cpu_quota}" --cpu-period "${cpu_period}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}"
+		((not_started_count--))
+		info "${not_started_count} remaining containers"
+	done
+
+	# Check that the requested number of containers are running
+	local timeout_launch="10"
+	check_containers_are_up & pid=$!
+	(sleep "${timeout_launch}" && kill -HUP ${pid}) 2>/dev/null & pid_tout=$!
+
+	if wait $pid 2>/dev/null; then
+		pkill -HUP -P "${pid_tout}"
+		wait "${pid_tout}"
+	else
+		warn "Time out exceeded"
+		return 1
+	fi
+
+	# Get the initial number of pids in a single container before the workload starts
+	INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | xargs | cut -d ' ' -f 2)
+	((INITIAL_NUM_PIDS++))
+
+	# Launch webtooling benchmark
+	local pids=()
+	local j=0
+	for i in "${containers[@]}"; do
+		$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD}") &
+		pids[${j}]=$!
+		((j++))
+	done
+
+	# Wait for all pids
+	for pid in ${pids[*]}; do
+		wait "${pid}"
+	done
+
+	touch "${host_trigger_file}"
+	info "All containers are running the workload..."
+
+	# Verify that all containers have completed the assigned task
+	verify_task_is_completed_on_all_containers & pid=$!
+	(sleep "$timeout" && kill -HUP $pid) 2>/dev/null & pid_tout=$!
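+	# Watchdog pattern: the verification job runs in the background while a
+	# second background job sleeps for the timeout and then SIGHUPs it;
+	# whichever finishes first decides between success and a timeout failure.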
+
+	if wait ${pid} 2>/dev/null; then
+		pkill -HUP -P "${pid_tout}"
+		wait "${pid_tout}"
+	else
+		warn "Time out exceeded"
+		return 1
+	fi
+
+	RESULTS_CMD="cat ${file_path}/${file_name}"
+	for i in "${containers[@]}"; do
+		sudo -E "${CTR_EXE}" t exec --exec-id "${RANDOM}" "${i}" sh -c "${RESULTS_CMD}" >> "${TMP_DIR}/results"
+	done
+
+	# Save results
+	metrics_json_start_array
+
+	local output=$(cat "${TMP_DIR}/results")
+	local cut_results="cut -d':' -f2 | sed -e 's/^[ \t]*//'| cut -d ' ' -f1 | tr '\n' ',' | sed 's/.$//'"
+
+	local acorn=$(echo "${output}" | grep -w "acorn" | eval "${cut_results}")
+	local babel=$(echo "${output}" | grep -w "babel" | sed '/babel-minify/d' | eval "${cut_results}")
+	local babel_minify=$(echo "${output}" | grep -w "babel-minify" | eval "${cut_results}")
+	local babylon=$(echo "${output}" | grep -w "babylon" | eval "${cut_results}")
+	local buble=$(echo "${output}" | grep -w "buble" | eval "${cut_results}")
+	local chai=$(echo "${output}" | grep -w "chai" | eval "${cut_results}")
+	local coffeescript=$(echo "${output}" | grep -w "coffeescript" | eval "${cut_results}")
+	local espree=$(echo "${output}" | grep -w "espree" | eval "${cut_results}")
+	local esprima=$(echo "${output}" | grep -w "esprima" | eval "${cut_results}")
+	local jshint=$(echo "${output}" | grep -w "jshint" | eval "${cut_results}")
+	local lebab=$(echo "${output}" | grep -w "lebab" | eval "${cut_results}")
+	local postcss=$(echo "${output}" | grep -w "postcss" | eval "${cut_results}")
+	local prepack=$(echo "${output}" | grep -w "prepack" | eval "${cut_results}")
+	local prettier=$(echo "${output}" | grep -w "prettier" | eval "${cut_results}")
+	local source_map=$(echo "${output}" | grep -w "source-map" | eval "${cut_results}")
+	local terser=$(echo "${output}" | grep -w "terser" | eval "${cut_results}")
+	local typescript=$(echo "${output}" | grep -w "typescript" | eval "${cut_results}")
+	local uglify_js=$(echo "${output}" | grep -w "uglify-js" | eval "${cut_results}")
+	local geometric_mean=$(echo "${output}" | grep -w "Geometric" | eval "${cut_results}")
+	local average_tps=$(echo "${geometric_mean}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l)
+	local tps=$(echo "${average_tps}*${NUM_CONTAINERS}" | bc -l)
+
+	local json="$(cat << EOF
+	{
+		"Acorn" : "${acorn}",
+		"Babel" : "${babel}",
+		"Babel minify" : "${babel_minify}",
+		"Babylon" : "${babylon}",
+		"Buble" : "${buble}",
+		"Chai" : "${chai}",
+		"Coffeescript" : "${coffeescript}",
+		"Espree" : "${espree}",
+		"Esprima" : "${esprima}",
+		"Jshint" : "${jshint}",
+		"Lebab" : "${lebab}",
+		"Postcss" : "${postcss}",
+		"Prepack" : "${prepack}",
+		"Prettier" : "${prettier}",
+		"Source map" : "${source_map}",
+		"Terser" : "${terser}",
+		"Typescript" : "${typescript}",
+		"Uglify js" : "${uglify_js}",
+		"Geometric mean" : "${geometric_mean}",
+		"Average TPS" : "${average_tps}",
+		"TPS" : "${tps}"
+	}
EOF
+)"
+	metrics_json_add_array_element "${json}"
+	metrics_json_end_array "Results"
+	metrics_json_save
+	clean_env_ctr
+}
+
+main "$@"
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh
index 606ce8435..5ceae83bd 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries-in-docker.sh
@@ -71,6 +71,7 @@ docker run \
 	--env TDSHIM_CONTAINER_BUILDER="${TDSHIM_CONTAINER_BUILDER:-}" \
 	--env VIRTIOFSD_CONTAINER_BUILDER="${VIRTIOFSD_CONTAINER_BUILDER:-}" \
 	--env MEASURED_ROOTFS="${MEASURED_ROOTFS:-}" \
+	--env USE_CACHE="${USE_CACHE:-}" \
 	--rm \
 	-w ${script_dir} \
 	build-kata-deploy "${kata_deploy_create}" $@
diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
index 47dc2dbfe..26d64fa07 100755
--- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
+++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh
@@ -48,6 +48,7 @@ readonly cached_artifacts_path="lastSuccessfulBuild/artifact/artifacts"
 
 ARCH=$(uname -m)
 MEASURED_ROOTFS=${MEASURED_ROOTFS:-no}
+USE_CACHE="${USE_CACHE:-"yes"}"
 
 workdir="${WORKDIR:-$PWD}"
 
@@ -87,6 +88,7 @@ options:
 	--build=<asset>	:
 		all
 		cloud-hypervisor
+		cloud-hypervisor-glibc
 		firecracker
 		kernel
 		kernel-dragonball-experimental
@@ -135,7 +137,11 @@ cleanup_and_fail() {
 	return 1
 }
 
-install_cached_component() {
+install_cached_tarball_component() {
+	if [ "${USE_CACHE}" != "yes" ]; then
+		return 1
+	fi
+
 	local component="${1}"
 	local jenkins_build_url="${2}"
 	local current_version="${3}"
@@ -214,7 +220,7 @@ install_cached_cc_shim_v2() {
 	wget "${jenkins_build_url}/root_hash_tdx.txt" -O "shim_v2_root_hash_tdx.txt" || return 1
 	diff "${root_hash_tdx}" "shim_v2_root_hash_tdx.txt" > /dev/null || return 1
 
-	install_cached_component \
+	install_cached_tarball_component \
 		"${component}" \
 		"${jenkins_build_url}" \
 		"${current_version}" \
@@ -227,7 +233,7 @@
 
 # Install static CC cloud-hypervisor asset
 install_cc_clh() {
-	install_cached_component \
+	install_cached_tarball_component \
 		"cloud-hypervisor" \
 		"${jenkins_url}/job/kata-containers-2.0-clh-cc-$(uname -m)/${cached_artifacts_path}" \
 		"$(get_from_kata_deps "assets.hypervisor.cloud_hypervisor.version")" \
@@ -287,7 +293,7 @@ install_cc_image() {
 	local pause_version="$(get_from_kata_deps "externals.pause.version")"
 	local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")"
 
-	install_cached_component \
+	install_cached_tarball_component \
 		"${component}" \
 		"${jenkins}" \
 		"${osbuilder_last_commit}-${guest_image_last_commit}-${initramfs_last_commit}-${agent_last_commit}-${libs_last_commit}-${attestation_agent_version}-${gperf_version}-${libseccomp_version}-${pause_version}-${rust_version}-${image_type}-${AA_KBC}" \
@@ -333,7 +339,7 @@ install_cc_kernel() {
 	local kernel_kata_config_version="$(cat ${repo_root_dir}/tools/packaging/kernel/kata_config_version)"
 
-	install_cached_component \
+	install_cached_tarball_component \
 		"kernel" \
 		"${jenkins_url}/job/kata-containers-2.0-kernel-cc-$(uname -m)/${cached_artifacts_path}" \
 		"${kernel_version}-${kernel_kata_config_version}" \
@@ -355,7 +361,7 @@ install_cc_qemu() {
 	export qemu_repo="$(yq r $versions_yaml assets.hypervisor.qemu.url)"
 	export qemu_version="$(yq r $versions_yaml assets.hypervisor.qemu.version)"
 
-	install_cached_component \
+	install_cached_tarball_component \
 		"QEMU" \
 		"${jenkins_url}/job/kata-containers-2.0-qemu-cc-$(uname -m)/${cached_artifacts_path}" \
 		"${qemu_version}-$(calc_qemu_files_sha256sum)" \
@@ -413,7 +419,7 @@ install_cc_shimv2() {
 
 # Install static CC virtiofsd asset
 install_cc_virtiofsd() {
 	local virtiofsd_version="$(get_from_kata_deps "externals.virtiofsd.version")-$(get_from_kata_deps "externals.virtiofsd.toolchain")"
-	install_cached_component \
+	install_cached_tarball_component \
 		"virtiofsd" \
 		"${jenkins_url}/job/kata-containers-2.0-virtiofsd-cc-$(uname -m)/${cached_artifacts_path}" \
"${virtiofsd_version}" \ @@ -437,7 +443,7 @@ install_cached_kernel_component() { local kernel_kata_config_version="$(cat ${repo_root_dir}/tools/packaging/kernel/kata_config_version)" - install_cached_component \ + install_cached_tarball_component \ "kernel" \ "${jenkins_url}/job/kata-containers-2.0-kernel-${tee}-cc-$(uname -m)/${cached_artifacts_path}" \ "${kernel_version}-${kernel_kata_config_version}" \ @@ -449,7 +455,7 @@ install_cached_kernel_component() { [ "${tee}" == "tdx" ] && return 0 # SEV specific code path - install_cached_component \ + install_cached_tarball_component \ "kernel-modules" \ "${jenkins_url}/job/kata-containers-2.0-kernel-sev-cc-$(uname -m)/${cached_artifacts_path}" \ "${kernel_version}" \ @@ -505,7 +511,7 @@ install_cc_tee_qemu() { export qemu_version="$(yq r $versions_yaml assets.hypervisor.qemu.${tee}.tag)" export tee="${tee}" - install_cached_component \ + install_cached_tarball_component \ "QEMU ${tee}" \ "${jenkins_url}/job/kata-containers-2.0-qemu-${tee}-cc-$(uname -m)/${cached_artifacts_path}" \ "${qemu_version}-$(calc_qemu_files_sha256sum)" \ @@ -523,7 +529,7 @@ install_cc_tdx_qemu() { } install_cc_tdx_td_shim() { - install_cached_component \ + install_cached_tarball_component \ "td-shim" \ "${jenkins_url}/job/kata-containers-2.0-td-shim-cc-$(uname -m)/${cached_artifacts_path}" \ "$(get_from_kata_deps "externals.td-shim.version")-$(get_from_kata_deps "externals.td-shim.toolchain")" \ @@ -543,7 +549,7 @@ install_cc_tee_ovmf() { local component_name="ovmf" local component_version="$(get_from_kata_deps "externals.ovmf.${tee}.version")" [ "${tee}" == "tdx" ] && component_name="tdvf" - install_cached_component \ + install_cached_tarball_component \ "${component_name}" \ "${jenkins_url}/job/kata-containers-2.0-${component_name}-cc-$(uname -m)/${cached_artifacts_path}" \ "${component_version}" \ @@ -583,7 +589,7 @@ install_image() { local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")" local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")" - install_cached_component \ + install_cached_tarball_component \ "${component}" \ "${jenkins}" \ "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-image" \ @@ -616,7 +622,7 @@ install_initrd() { local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")" local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")" - install_cached_component \ + install_cached_tarball_component \ "${component}" \ "${jenkins}" \ "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${initrd_type}" \ @@ -728,12 +734,12 @@ install_kernel_nvidia_gpu() { #Install GPU and SNP enabled kernel asset install_kernel_nvidia_gpu_snp() { - local kernel_url="$(get_from_kata_deps assets.kernel.snp.url)" + local kernel_url="$(get_from_kata_deps assets.kernel.sev.url)" install_kernel_helper \ - "assets.kernel.snp.version" \ + "assets.kernel.sev.version" \ "kernel-nvidia-gpu-snp" \ - "-x snp -g nvidia -u ${kernel_url} -H deb" + "-x sev -g nvidia -u ${kernel_url} -H deb" } #Install GPU and TDX experimental enabled kernel asset @@ -854,31 +860,52 @@ install_firecracker() { sudo install -D --owner root --group root --mode 0744 release-${firecracker_version}-${ARCH}/jailer-${firecracker_version}-${ARCH} "${destdir}/opt/kata/bin/jailer" } -# Install static cloud-hypervisor asset 
-install_clh() {
-	install_cached_component \
-		"cloud-hypervisor" \
-		"${jenkins_url}/job/kata-containers-main-clh-$(uname -m)/${cached_artifacts_path}" \
+install_clh_helper() {
+	libc="${1}"
+	features="${2}"
+	suffix="${3:-""}"
+
+	install_cached_tarball_component \
+		"cloud-hypervisor${suffix}" \
+		"${jenkins_url}/job/kata-containers-main-clh-$(uname -m)${suffix}/${cached_artifacts_path}" \
 		"$(get_from_kata_deps "assets.hypervisor.cloud_hypervisor.version")" \
 		"" \
 		"${final_tarball_name}" \
 		"${final_tarball_path}" \
 		&& return 0
 
-	if [[ "${ARCH}" == "x86_64" ]]; then
-		export features="tdx"
-	fi
-
 	info "build static cloud-hypervisor"
-	"${clh_builder}"
+	libc="${libc}" features="${features}" "${clh_builder}"
 
 	info "Install static cloud-hypervisor"
 	mkdir -p "${destdir}/opt/kata/bin/"
-	sudo install -D --owner root --group root --mode 0744 cloud-hypervisor/cloud-hypervisor "${destdir}/opt/kata/bin/cloud-hypervisor"
+	sudo install -D --owner root --group root --mode 0744 cloud-hypervisor/cloud-hypervisor "${destdir}/opt/kata/bin/cloud-hypervisor${suffix}"
+}
+
+# Install static cloud-hypervisor asset
+install_clh() {
+	if [[ "${ARCH}" == "x86_64" ]]; then
+		features="mshv,tdx"
+	else
+		features=""
+	fi
+
+	install_clh_helper "musl" "${features}"
+}
+
+# Install static cloud-hypervisor-glibc asset
+install_clh_glibc() {
+	if [[ "${ARCH}" == "x86_64" ]]; then
+		features="mshv"
+	else
+		features=""
+	fi
+
+	install_clh_helper "gnu" "${features}" "-glibc"
 }
 
 # Install static virtiofsd asset
 install_virtiofsd() {
-	install_cached_component \
+	install_cached_tarball_component \
 		"virtiofsd" \
 		"${jenkins_url}/job/kata-containers-main-virtiofsd-$(uname -m)/${cached_artifacts_path}" \
 		"$(get_from_kata_deps "externals.virtiofsd.version")-$(get_from_kata_deps "externals.virtiofsd.toolchain")" \
@@ -925,7 +952,7 @@ install_shimv2() {
 	local RUST_VERSION="$(get_from_kata_deps "languages.rust.meta.newest-version")"
 	local shim_v2_version="${shim_v2_last_commit}-${protocols_last_commit}-${runtime_rs_last_commit}-${GO_VERSION}-${RUST_VERSION}"
 
-	install_cached_component \
+	install_cached_tarball_component \
 		"shim-v2" \
 		"${jenkins_url}/job/kata-containers-main-shim-v2-$(uname -m)/${cached_artifacts_path}" \
 		"${shim_v2_version}" \
@@ -1065,7 +1092,7 @@ handle_build() {
 		cloud-hypervisor)
 			install_clh
 			;;
-		cloud-hypervisor-glibc) ;;
+		cloud-hypervisor-glibc) install_clh_glibc ;;
 
 		firecracker)
 			install_firecracker
 			;;
diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh
index 6bb660198..09d27cc65 100644
--- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh
+++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh
@@ -64,6 +64,17 @@ function install_artifacts() {
 	chmod +x /opt/kata/bin/*
 	[ -d /opt/kata/runtime-rs/bin ] && \
 		chmod +x /opt/kata/runtime-rs/bin/*
+
+	# Allow Mariner to use custom configuration.
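+	# Point the CLH config at the glibc build shipped for Mariner and
+	# restrict the annotations and hypervisor paths it will accept.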
+ if [ "${HOST_OS:-}" == "cbl-mariner" ]; then + config_path="/opt/kata/share/defaults/kata-containers/configuration-clh.toml" + clh_path="/opt/kata/bin/cloud-hypervisor-glibc" + sed -i -E 's|(enable_annotations) = .+|\1 = ["enable_iommu", "initrd", "kernel"]|' "${config_path}" + sed -i -E "s|(valid_hypervisor_paths) = .+|\1 = [\"${clh_path}\"]|" "${config_path}" + sed -i -E "s|(path) = \".+/cloud-hypervisor\"|\1 = \"${clh_path}\"|" "${config_path}" + fi } function wait_till_node_is_ready() { diff --git a/tools/packaging/kernel/build-kernel.sh b/tools/packaging/kernel/build-kernel.sh index fef85503a..35c54741f 100755 --- a/tools/packaging/kernel/build-kernel.sh +++ b/tools/packaging/kernel/build-kernel.sh @@ -243,25 +243,21 @@ get_kernel_frag_path() { if [[ "${gpu_vendor}" != "" ]];then info "Add kernel config for GPU due to '-g ${gpu_vendor}'" - local gpu_configs="$(ls ${gpu_path}/${gpu_vendor}.conf)" - all_configs="${all_configs} ${gpu_configs}" # If conf_guest is set we need to update the CONFIG_LOCALVERSION # to match the suffix created in install_kata # -nvidia-gpu-{snp|tdx}, the linux headers will be named the very # same if build with make deb-pkg for TDX or SNP. + local gpu_configs=$(mktemp).conf + local gpu_subst_configs="${gpu_path}/${gpu_vendor}.${arch_target}.conf.in" if [[ "${conf_guest}" != "" ]];then - local gpu_cc_configs=$(mktemp).conf - local gpu_subst_configs="$(ls ${gpu_path}/${gpu_vendor}.conf.in)" - export CONF_GUEST_SUFFIX="-${conf_guest}" - envsubst <${gpu_subst_configs} >${gpu_cc_configs} - unset CONF_GUEST_SUFFIX - - all_configs="${all_configs} ${gpu_cc_configs}" else - local gpu_configs="$(ls ${gpu_path}/${gpu_vendor}.conf)" - all_configs="${all_configs} ${gpu_configs}" + export CONF_GUEST_SUFFIX="" fi + envsubst <${gpu_subst_configs} >${gpu_configs} + unset CONF_GUEST_SUFFIX + + all_configs="${all_configs} ${gpu_configs}" fi if [ "${MEASURED_ROOTFS}" == "yes" ]; then diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in new file mode 100644 index 000000000..8cb9cf511 --- /dev/null +++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.arm64.conf.in @@ -0,0 +1,29 @@ +# Support for loading modules. +# It is used to support loading GPU drivers. +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y + +# CRYPTO_FIPS requires this config when loading modules is enabled. +CONFIG_MODULE_SIG=y + +# Linux kernel version suffix +CONFIG_LOCALVERSION="-nvidia-gpu${CONF_GUEST_SUFFIX}" + +# Newer NVIDIA drivers need additional symbols +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y + + +# VFIO/IOMMU setttings +CONFIG_MMU_NOTIFIER=y +CONFIG_IOASID=y +CONFIG_IOMMU_IO_PGTABLE=y +CONFIG_IOMMU_IO_PGTABLE_LPAE=y +CONFIG_IOMMU_SVA=y +CONFIG_ARM_SMMU_V3=y +CONFIG_ARM_SMMU_V3_SVA=y + +# CC related configs +CONFIG_CRYPTO_ECC=y +CONFIG_CRYPTO_ECDH=y +CONFIG_CRYPTO_ECDSA=y diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf b/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf deleted file mode 100644 index 883c0f3af..000000000 --- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf +++ /dev/null @@ -1,14 +0,0 @@ -# Support mmconfig PCI config space access. -# It's used to enable the MMIO access method for PCIe devices. -CONFIG_PCI_MMCONFIG=y - -# Support for loading modules. -# It is used to support loading GPU drivers. -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y - -# CRYPTO_FIPS requires this config when loading modules is enabled. 
-CONFIG_MODULE_SIG=y
-
-# Linux kernel version suffix
-CONFIG_LOCALVERSION="-nvidia-gpu"
diff --git a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in
similarity index 62%
rename from tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in
rename to tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in
index 73cce6173..6ef830aab 100644
--- a/tools/packaging/kernel/configs/fragments/gpu/nvidia.conf.in
+++ b/tools/packaging/kernel/configs/fragments/gpu/nvidia.x86_64.conf.in
@@ -12,3 +12,14 @@ CONFIG_MODULE_SIG=y
 
 # Linux kernel version suffix
 CONFIG_LOCALVERSION="-nvidia-gpu${CONF_GUEST_SUFFIX}"
+
+# Newer NVIDIA drivers need additional symbols
+CONFIG_X86_MCE=y
+CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y
+CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y
+CONFIG_MEMORY_FAILURE=y
+
+# CC related configs
+CONFIG_CRYPTO_ECC=y
+CONFIG_CRYPTO_ECDH=y
+CONFIG_CRYPTO_ECDSA=y
diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version
index d5e55fec1..8cccbbf4a 100644
--- a/tools/packaging/kernel/kata_config_version
+++ b/tools/packaging/kernel/kata_config_version
@@ -1 +1 @@
-109cc
+109cc+
diff --git a/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh b/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
index 975a517a1..f381897bc 100755
--- a/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
+++ b/tools/packaging/static-build/cloud-hypervisor/build-static-clh.sh
@@ -76,12 +76,14 @@ build_clh_from_source() {
 	if [ -n "${features}" ]; then
 		info "Build cloud-hypervisor enabling the following features: ${features}"
-		./scripts/dev_cli.sh build --release --libc musl --features "${features}"
+		./scripts/dev_cli.sh build --release --libc "${libc}" --features "${features}"
 	else
-		./scripts/dev_cli.sh build --release --libc musl
+		./scripts/dev_cli.sh build --release --libc "${libc}"
 	fi
 	rm -f cloud-hypervisor
-	cp build/cargo_target/$(uname -m)-unknown-linux-musl/release/cloud-hypervisor .
+	cp build/cargo_target/$(uname -m)-unknown-linux-${libc}/release/cloud-hypervisor .
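+	# (the cargo target triple tracks the requested libc, e.g.
+	# x86_64-unknown-linux-musl or x86_64-unknown-linux-gnu)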
 	popd
 }
diff --git a/tools/packaging/static-build/ovmf/build.sh b/tools/packaging/static-build/ovmf/build.sh
index 16e4292b8..d9eef8781 100755
--- a/tools/packaging/static-build/ovmf/build.sh
+++ b/tools/packaging/static-build/ovmf/build.sh
@@ -37,17 +37,17 @@ fi
 [ -n "$ovmf_repo" ] || die "failed to get ovmf repo"
 
 if [ "${ovmf_build}" == "x86_64" ]; then
-    [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.x86_64.version")
-    [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.x86_64.package")
-    [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.x86_64.package_output_dir")
+	[ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.x86_64.version")
+	[ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.x86_64.package")
+	[ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.x86_64.package_output_dir")
 elif [ "${ovmf_build}" == "sev" ]; then
-    [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.sev.version")
-    [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.sev.package")
-    [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.sev.package_output_dir")
+	[ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.sev.version")
+	[ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.sev.package")
+	[ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.sev.package_output_dir")
 elif [ "${ovmf_build}" == "tdx" ]; then
-    [ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.tdx.version")
-    [ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.tdx.package")
-    [ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.tdx.package_output_dir")
+	[ -n "$ovmf_version" ] || ovmf_version=$(get_from_kata_deps "externals.ovmf.tdx.version")
+	[ -n "$ovmf_package" ] || ovmf_package=$(get_from_kata_deps "externals.ovmf.tdx.package")
+	[ -n "$package_output_dir" ] || package_output_dir=$(get_from_kata_deps "externals.ovmf.tdx.package_output_dir")
 fi
 
 [ -n "$ovmf_version" ] || die "failed to get ovmf version or commit"
@@ -56,8 +56,8 @@ fi
 
 sudo docker pull ${container_image} || \
 	(sudo docker build -t "${container_image}" "${script_dir}" && \
-	# No-op unless PUSH_TO_REGISTRY is exported as "yes"
-	push_to_registry "${container_image}")
+	 # No-op unless PUSH_TO_REGISTRY is exported as "yes"
+	 push_to_registry "${container_image}")
 
 sudo docker run --rm -i -v "${repo_root_dir}:${repo_root_dir}" \
 	-w "${PWD}" \