mirror of
https://github.com/aljazceru/kata-containers.git
synced 2026-01-23 16:24:19 +01:00
Merge pull request #7300 from stevenhorsman/CCv0-merge-10th-july
CCv0: Merge main into CCv0 branch
This commit is contained in:
@@ -13,14 +13,15 @@ on:
|
||||
required: false
|
||||
type: string
|
||||
default: no
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
build-asset:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
stage:
|
||||
- ${{ inputs.stage }}
|
||||
asset:
|
||||
- cloud-hypervisor
|
||||
- cloud-hypervisor-glibc
|
||||
@@ -46,9 +47,11 @@ jobs:
|
||||
- shim-v2
|
||||
- tdvf
|
||||
- virtiofsd
|
||||
stage:
|
||||
- ${{ inputs.stage }}
|
||||
exclude:
|
||||
- stage: release
|
||||
asset: cloud-hypervisor-glibc
|
||||
- asset: cloud-hypervisor-glibc
|
||||
stage: release
|
||||
steps:
|
||||
- name: Login to Kata Containers quay.io
|
||||
if: ${{ inputs.push-to-registry == 'yes' }}
|
||||
@@ -60,7 +63,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
fetch-depth: 0 # This is needed in order to keep the commit ids history
|
||||
|
||||
- name: Build ${{ matrix.asset }}
|
||||
@@ -88,7 +91,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
- name: get-artifacts
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
|
||||
@@ -9,6 +9,9 @@ on:
|
||||
required: false
|
||||
type: string
|
||||
default: no
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
build-asset:
|
||||
@@ -41,7 +44,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
fetch-depth: 0 # This is needed in order to keep the commit ids history
|
||||
- name: Build ${{ matrix.asset }}
|
||||
run: |
|
||||
@@ -72,7 +75,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
- name: get-artifacts
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
|
||||
@@ -9,6 +9,9 @@ on:
|
||||
required: false
|
||||
type: string
|
||||
default: no
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
build-asset:
|
||||
@@ -37,7 +40,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
fetch-depth: 0 # This is needed in order to keep the commit ids history
|
||||
- name: Build ${{ matrix.asset }}
|
||||
run: |
|
||||
@@ -69,7 +72,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
- name: get-artifacts
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
|
||||
14
.github/workflows/ci-nightly.yaml
vendored
Normal file
14
.github/workflows/ci-nightly.yaml
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
name: Kata Containers Nightly CI
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 0 * * *'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
kata-containers-ci-on-push:
|
||||
uses: ./.github/workflows/ci.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
pr-number: "nightly"
|
||||
tag: ${{ github.sha }}-nightly
|
||||
secrets: inherit
|
||||
65
.github/workflows/ci-on-push.yaml
vendored
65
.github/workflows/ci-on-push.yaml
vendored
@@ -12,65 +12,14 @@ on:
|
||||
- synchronize
|
||||
- reopened
|
||||
- labeled
|
||||
|
||||
paths-ignore:
|
||||
- 'docs/**'
|
||||
jobs:
|
||||
build-kata-static-tarball-amd64:
|
||||
kata-containers-ci-on-push:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
|
||||
uses: ./.github/workflows/ci.yaml
|
||||
with:
|
||||
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
|
||||
|
||||
publish-kata-deploy-payload-amd64:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
needs: build-kata-static-tarball-amd64
|
||||
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
|
||||
with:
|
||||
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
|
||||
commit-hash: ${{ github.event.pull_request.head.sha }}
|
||||
pr-number: ${{ github.event.pull_request.number }}
|
||||
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}
|
||||
secrets: inherit
|
||||
|
||||
run-k8s-tests-on-aks:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
|
||||
secrets: inherit
|
||||
|
||||
run-k8s-tests-on-sev:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
|
||||
|
||||
run-k8s-tests-on-snp:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
|
||||
|
||||
run-k8s-tests-on-tdx:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
|
||||
|
||||
run-metrics-tests:
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
|
||||
needs: build-kata-static-tarball-amd64
|
||||
uses: ./.github/workflows/run-metrics.yaml
|
||||
with:
|
||||
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
|
||||
|
||||
76
.github/workflows/ci.yaml
vendored
Normal file
76
.github/workflows/ci.yaml
vendored
Normal file
@@ -0,0 +1,76 @@
|
||||
name: Run the Kata Containers CI
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
commit-hash:
|
||||
required: true
|
||||
type: string
|
||||
pr-number:
|
||||
required: true
|
||||
type: string
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
build-kata-static-tarball-amd64:
|
||||
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
|
||||
with:
|
||||
tarball-suffix: -${{ inputs.tag }}
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
|
||||
publish-kata-deploy-payload-amd64:
|
||||
needs: build-kata-static-tarball-amd64
|
||||
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
|
||||
with:
|
||||
tarball-suffix: -${{ inputs.tag }}
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ inputs.tag }}-amd64
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
secrets: inherit
|
||||
|
||||
run-k8s-tests-on-aks:
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ inputs.tag }}-amd64
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
pr-number: ${{ inputs.pr-number }}
|
||||
secrets: inherit
|
||||
|
||||
run-k8s-tests-on-sev:
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ inputs.tag }}-amd64
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
|
||||
run-k8s-tests-on-snp:
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ inputs.tag }}-amd64
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
|
||||
run-k8s-tests-on-tdx:
|
||||
needs: publish-kata-deploy-payload-amd64
|
||||
uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
|
||||
with:
|
||||
registry: ghcr.io
|
||||
repo: ${{ github.repository_owner }}/kata-deploy-ci
|
||||
tag: ${{ inputs.tag }}-amd64
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
|
||||
run-metrics-tests:
|
||||
needs: build-kata-static-tarball-amd64
|
||||
uses: ./.github/workflows/run-metrics.yaml
|
||||
with:
|
||||
tarball-suffix: -${{ inputs.tag }}
|
||||
commit-hash: ${{ inputs.commit-hash }}
|
||||
6
.github/workflows/payload-after-push.yaml
vendored
6
.github/workflows/payload-after-push.yaml
vendored
@@ -9,18 +9,21 @@ jobs:
|
||||
build-assets-amd64:
|
||||
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
push-to-registry: yes
|
||||
secrets: inherit
|
||||
|
||||
build-assets-arm64:
|
||||
uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
push-to-registry: yes
|
||||
secrets: inherit
|
||||
|
||||
build-assets-s390x:
|
||||
uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
push-to-registry: yes
|
||||
secrets: inherit
|
||||
|
||||
@@ -28,6 +31,7 @@ jobs:
|
||||
needs: build-assets-amd64
|
||||
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
registry: quay.io
|
||||
repo: kata-containers/kata-deploy-ci
|
||||
tag: kata-containers-amd64
|
||||
@@ -37,6 +41,7 @@ jobs:
|
||||
needs: build-assets-arm64
|
||||
uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
registry: quay.io
|
||||
repo: kata-containers/kata-deploy-ci
|
||||
tag: kata-containers-arm64
|
||||
@@ -46,6 +51,7 @@ jobs:
|
||||
needs: build-assets-s390x
|
||||
uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml
|
||||
with:
|
||||
commit-hash: ${{ github.sha }}
|
||||
registry: quay.io
|
||||
repo: kata-containers/kata-deploy-ci
|
||||
tag: kata-containers-s390x
|
||||
|
||||
@@ -14,6 +14,9 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
kata-payload:
|
||||
@@ -21,7 +24,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: get-kata-tarball
|
||||
uses: actions/download-artifact@v3
|
||||
|
||||
@@ -14,6 +14,9 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
kata-payload:
|
||||
@@ -25,7 +28,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: get-kata-tarball
|
||||
uses: actions/download-artifact@v3
|
||||
|
||||
@@ -14,6 +14,9 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
kata-payload:
|
||||
@@ -25,7 +28,7 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: get-kata-tarball
|
||||
uses: actions/download-artifact@v3
|
||||
|
||||
3
.github/workflows/release.yaml
vendored
3
.github/workflows/release.yaml
vendored
@@ -72,8 +72,7 @@ jobs:
|
||||
- uses: actions/checkout@v3
|
||||
- name: install hub
|
||||
run: |
|
||||
HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//')
|
||||
wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \
|
||||
wget -q -O- https://github.com/mislav/hub/releases/download/v2.14.2/hub-linux-amd64-2.14.2.tgz | \
|
||||
tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub
|
||||
|
||||
- name: download-artifacts-amd64
|
||||
|
||||
10
.github/workflows/run-k8s-tests-on-aks.yaml
vendored
10
.github/workflows/run-k8s-tests-on-aks.yaml
vendored
@@ -11,6 +11,12 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
pr-number:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
run-k8s-tests:
|
||||
@@ -31,13 +37,13 @@ jobs:
|
||||
DOCKER_REGISTRY: ${{ inputs.registry }}
|
||||
DOCKER_REPO: ${{ inputs.repo }}
|
||||
DOCKER_TAG: ${{ inputs.tag }}
|
||||
GH_PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
GH_PR_NUMBER: ${{ inputs.pr-number }}
|
||||
KATA_HOST_OS: ${{ matrix.host_os }}
|
||||
KATA_HYPERVISOR: ${{ matrix.vmm }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: Download Azure CLI
|
||||
run: bash tests/integration/gha-run.sh install-azure-cli
|
||||
|
||||
5
.github/workflows/run-k8s-tests-on-sev.yaml
vendored
5
.github/workflows/run-k8s-tests-on-sev.yaml
vendored
@@ -11,6 +11,9 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
run-k8s-tests:
|
||||
@@ -29,7 +32,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 30
|
||||
|
||||
5
.github/workflows/run-k8s-tests-on-snp.yaml
vendored
5
.github/workflows/run-k8s-tests-on-snp.yaml
vendored
@@ -11,6 +11,9 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
run-k8s-tests:
|
||||
@@ -29,7 +32,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 30
|
||||
|
||||
5
.github/workflows/run-k8s-tests-on-tdx.yaml
vendored
5
.github/workflows/run-k8s-tests-on-tdx.yaml
vendored
@@ -11,6 +11,9 @@ on:
|
||||
tag:
|
||||
required: true
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
run-k8s-tests:
|
||||
@@ -29,7 +32,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 30
|
||||
|
||||
36
.github/workflows/run-metrics.yaml
vendored
36
.github/workflows/run-metrics.yaml
vendored
@@ -5,16 +5,25 @@ on:
|
||||
tarball-suffix:
|
||||
required: false
|
||||
type: string
|
||||
commit-hash:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
run-metrics:
|
||||
strategy:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
vmm: ['clh', 'qemu']
|
||||
max-parallel: 1
|
||||
runs-on: metrics
|
||||
env:
|
||||
GOPATH: ${{ github.workspace }}
|
||||
KATA_HYPERVISOR: ${{ matrix.vmm }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: ${{ inputs.commit-hash }}
|
||||
|
||||
- name: get-kata-tarball
|
||||
uses: actions/download-artifact@v3
|
||||
@@ -25,8 +34,25 @@ jobs:
|
||||
- name: Install kata
|
||||
run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
|
||||
|
||||
- name: run launch times on qemu
|
||||
run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu
|
||||
- name: run launch times test
|
||||
run: bash tests/metrics/gha-run.sh run-test-launchtimes
|
||||
|
||||
- name: run launch times on clh
|
||||
run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh
|
||||
- name: run memory foot print test
|
||||
run: bash tests/metrics/gha-run.sh run-test-memory-usage
|
||||
|
||||
- name: run memory usage inside container test
|
||||
run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container
|
||||
|
||||
- name: run blogbench test
|
||||
run: bash tests/metrics/gha-run.sh run-test-blogbench
|
||||
|
||||
- name: make metrics tarball ${{ matrix.vmm }}
|
||||
run: bash tests/metrics/gha-run.sh make-tarball-results
|
||||
|
||||
- name: archive metrics results ${{ matrix.vmm }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: metrics-artifacts-${{ matrix.vmm }}
|
||||
path: results-${{ matrix.vmm }}.tar.gz
|
||||
retention-days: 1
|
||||
if-no-files-found: error
|
||||
|
||||
@@ -23,7 +23,7 @@ jobs:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
./ci/install_rust.sh
|
||||
PATH=$PATH:"$HOME/.cargo/bin"
|
||||
echo PATH="$HOME/.cargo/bin:$PATH" >> $GITHUB_ENV
|
||||
- name: Run Unit Test
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
|
||||
run: |
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
<img src="https://object-storage-ca-ymq-1.vexxhost.net/swift/v1/6e4619c416ff4bd19e1c087f27a43eea/www-images-prod/openstack-logo/kata/SVG/kata-1.svg" width="900">
|
||||
|
||||
[](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml)
|
||||
[](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) [](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml)
|
||||
|
||||
# Kata Containers
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ Kata Containers design documents:
|
||||
- [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
|
||||
- [Design for direct-assigned volume](direct-blk-device-assignment.md)
|
||||
- [Design for core-scheduling](core-scheduling.md)
|
||||
- [Virtualization Reference Architecture](kata-vra.md)
|
||||
---
|
||||
|
||||
- [Design proposals](proposals)
|
||||
|
||||
434
docs/design/kata-vra.md
Normal file
434
docs/design/kata-vra.md
Normal file
@@ -0,0 +1,434 @@
|
||||
# Virtualization Reference Architecture
|
||||
|
||||
## Subject to Change | © 2022 by NVIDIA Corporation. All rights reserved. | For test and development only_
|
||||
|
||||
Before digging deeper into the virtualization reference architecture, let's
|
||||
first look at the various GPUDirect use cases in the following table. We’re
|
||||
distinguishing between two top-tier use cases where the devices are (1)
|
||||
passthrough and (2) virtualized, where a VM gets assigned a virtual function
|
||||
(VF) and not the physical function (PF). A combination of PF and VF would also
|
||||
be possible.
|
||||
|
||||
| Device #1 (passthrough) | Device #2 (passthrough) | P2P Compatibility and Mode |
|
||||
| ------------------------- | ----------------------- | -------------------------------------------- |
|
||||
| GPU PF | GPU PF | GPUDirect P2P |
|
||||
| GPU PF | NIC PF | GPUDirect RDMA |
|
||||
| MIG-slice | MIG-slice | _No GPUDirect P2P_ |
|
||||
| MIG-slice | NIC PF | GPUDirect RDMA |
|
||||
| **PDevice #1 (virtualized)** | **Device #2 (virtualized)** | **P2P Compatibility and Mode** |
|
||||
| Time-slice vGPU VF | Time-slice vGPU VF | _No GPUDirect P2P but NVLINK P2P available_ |
|
||||
| Time-slice vGPU VF | NIC VF | GPUDirect RDMA |
|
||||
| MIG-slice vGPU | MIG-slice vGPU | _No GPUDirect P2P_ |
|
||||
| MIG-slice vGPU | NIC VF | GPUDirect RDMA |
|
||||
|
||||
In a virtualized environment we have several distinct features that may prevent
|
||||
Peer-to-peer (P2P) communication of two endpoints in a PCI Express topology. The
|
||||
IOMMU translates IO virtual addresses (IOVA) to physical addresses (PA). Each
|
||||
device behind an IOMMU has its own IOVA memory space, usually, no two devices
|
||||
share the same IOVA memory space but it’s up to the hypervisor or OS how it
|
||||
chooses to map devices to IOVA spaces. Any PCI Express DMA transactions will
|
||||
use IOVAs, which the IOMMU must translate. By default, all the traffic is routed
|
||||
to the root complex and not issued directly to the peer device.
|
||||
|
||||
An IOMMU can be used to isolate and protect devices even if virtualization is
|
||||
not used; since devices can only access memory regions that are mapped for it, a
|
||||
DMA from one device to another is not possible. DPDK uses the IOMMU to have
|
||||
better isolation between devices, another benefit is that IOVA space can be
|
||||
represented as a contiguous memory even if the PA space is heavily scattered.
|
||||
|
||||
In the case of virtualization, the IOMMU is responsible for isolating the device
|
||||
and memory between VMs for safe device assignment without compromising the host
|
||||
and other guest OSes. Without an IOMMU, any device can access the entire system
|
||||
and perform DMA transactions _anywhere_.
|
||||
|
||||
The second feature is ACS (Access Control Services), which controls which
|
||||
devices are allowed to communicate with one another and thus avoids improper
|
||||
routing of packets irrespectively of whether IOMMU is enabled or not.
|
||||
|
||||
When IOMMU is enabled, ACS is normally configured to force all PCI Express DMA
|
||||
to go through the root complex so IOMMU can translate it, impacting performance
|
||||
between peers with higher latency and reduced bandwidth.
|
||||
|
||||
A way to avoid the performance hit is to enable Address Translation Services
|
||||
(ATS). ATS-capable endpoints can prefetch IOVA -> PA translations from the IOMMU
|
||||
and then perform DMA transactions directly to another endpoint. Hypervisors
|
||||
enable this by enabling ATS in such endpoints, configuring ACS to enable Direct
|
||||
Translated P2P, and configuring the IOMMU to allow Address Translation requests.
|
||||
|
||||
Another important factor is that the NVIDIA driver stack will use the PCI
|
||||
Express topology of the system it is running on to determine whether the
|
||||
hardware is capable of supporting P2P. The driver stack qualifies specific
|
||||
chipsets, and PCI Express switches for use with GPUDirect P2P. In virtual
|
||||
environments, the PCI Express topology is flattened and obfuscated to present a
|
||||
uniform environment to the software inside the VM, which breaks the GPUDirect
|
||||
P2P use case.
|
||||
|
||||
On a bare metal machine, the driver stack groups GPUs into cliques that can
|
||||
perform GPUDirect P2P communication, excluding peer mappings where P2P
|
||||
communication is not possible, prominently if GPUs are attached to multiple CPU
|
||||
sockets.
|
||||
|
||||
CPUs and local memory banks are referred to as NUMA nodes. In a two-socket
|
||||
server, each of the CPUs has a local memory bank for a total of two NUMA nodes.
|
||||
Some servers provide the ability to configure additional NUMA nodes per CPU,
|
||||
which means a CPU socket can have two NUMA nodes (some servers support four
|
||||
NUMA nodes per socket) with local memory banks and L3 NUMA domains for improved
|
||||
performance.
|
||||
|
||||
One of the current solutions is that the hypervisor provides additional topology
|
||||
information that the driver stack can pick up and enable GPUDirect P2P between
|
||||
GPUs, even if the virtualized environment does not directly expose it. The PCI
|
||||
Express virtual P2P approval capability structure in the PCI configuration space
|
||||
is entirely emulated by the hypervisor of passthrough GPU devices.
|
||||
|
||||
A clique ID is provided where GPUs with the same clique ID belong to a group of
|
||||
GPUs capable of P2P communication
|
||||
|
||||
On vSphere, Azure, and other CPSs, the hypervisor lays down a `topologies.xml`
|
||||
which NCCL can pick up and deduce the right P2P level[^1]. NCCL is leveraging
|
||||
Infiniband (IB) and/or Unified Communication X (UCX) for communication, and
|
||||
GPUDirect P2P and GPUDirect RDMA should just work in this case. The only culprit
|
||||
is that software or applications that do not use the XML file to deduce the
|
||||
topology will fail and not enable GPUDirect ( [`nccl-p2p-level`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-p2p-level) )
|
||||
|
||||
## Hypervisor PCI Express Topology
|
||||
|
||||
To enable every part of the accelerator stack, we propose a virtualized
|
||||
reference architecture to enable GPUDirect P2P and GPUDirect RDMA for any
|
||||
hypervisor. The idea is split into two parts to enable the right PCI Express
|
||||
topology. The first part builds upon extending the PCI Express virtual P2P
|
||||
approval capability structure to every device that wants to do P2P in some way
|
||||
and groups devices by clique ID. The other part involves replicating a subset of
|
||||
the host topology so that applications running in the VM do not need to read
|
||||
additional information and enable the P2P capability like in the bare-metal use
|
||||
case described above. The driver stack can then deduce automatically if the
|
||||
topology presented in the VM is capable of P2P communication.
|
||||
|
||||
We will work with the following host topology for the following sections. It is
|
||||
a system with two converged DPUs, each having an `A100X` GPU and two `ConnectX-6`
|
||||
network ports connected to the downstream ports of a PCI Express switch.
|
||||
|
||||
```sh
|
||||
+-00.0-[d8-df]----00.0-[d9-df]--+-00.0-[da-db]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
|
||||
| +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
|
||||
| \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
|
||||
\-01.0-[dc-df]----00.0-[dd-df]----08.0-[de-df]----00.0 NVIDIA Corporation GA100 [A100X]
|
||||
|
||||
+-00.0-[3b-42]----00.0-[3c-42]--+-00.0-[3d-3e]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
|
||||
| +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
|
||||
| \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
|
||||
\-01.0-[3f-42]----00.0-[40-42]----08.0-[41-42]----00.0 NVIDIA Corporation GA100 [A100X]
|
||||
```
|
||||
|
||||
The green path highlighted above is the optimal and preferred path for
|
||||
efficient P2P communication.
|
||||
|
||||
## PCI Express Virtual P2P Approval Capability
|
||||
|
||||
Most of the time, the PCI Express topology is flattened and obfuscated to ensure
|
||||
easy migration of the VM image between different physical hardware topologies.
|
||||
In Kata, we can configure the hypervisor to use PCI Express root ports to
|
||||
hotplug the VFIO devices one is passing through. A user can select how many PCI
|
||||
Express root ports to allocate depending on how many devices are passed through.
|
||||
A recent addition to Kata will detect the right amount of PCI Express devices
|
||||
that need hotplugging and bail out if the number of root ports is insufficient.
|
||||
In Kata, we do not automatically increase the number of root ports, we want the
|
||||
user to be in full control of the topology.
|
||||
|
||||
```toml
|
||||
# /etc/kata-containers/configuration.toml
|
||||
|
||||
# VFIO devices are hotplugged on a bridge by default.
|
||||
# Enable hot-plugging on the root bus. This may be required for devices with
|
||||
# a large PCI bar, as this is a current limitation with hot-plugging on
|
||||
# a bridge.
|
||||
# Default “bridge-port”
|
||||
hotplug_vfio = "root-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
|
||||
# The value means the number of pcie_root_port
|
||||
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_root_port = 8
|
||||
```
|
||||
|
||||
VFIO devices are hotplugged on a PCIe-PCI bridge by default. Hotplug of PCI
|
||||
Express devices is only supported on PCI Express root or downstream ports. With
|
||||
this configuration set, if we start up a Kata container, we can inspect our
|
||||
topology and see the allocated PCI Express root ports and the hotplugged
|
||||
devices.
|
||||
|
||||
```sh
|
||||
$ lspci -tv
|
||||
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
|
||||
+-01.0 Red Hat, Inc. Virtio console
|
||||
+-02.0 Red Hat, Inc. Virtio SCSI
|
||||
+-03.0 Red Hat, Inc. Virtio RNG
|
||||
+-04.0-[01]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
|
||||
+-05.0-[02]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
|
||||
+-06.0-[03]----00.0 NVIDIA Corporation Device 20b8
|
||||
+-07.0-[04]----00.0 NVIDIA Corporation Device 20b8
|
||||
+-08.0-[05]--
|
||||
+-09.0-[06]--
|
||||
+-0a.0-[07]--
|
||||
+-0b.0-[08]--
|
||||
+-0c.0 Red Hat, Inc. Virtio socket
|
||||
+-0d.0 Red Hat, Inc. Virtio file system
|
||||
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
|
||||
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
|
||||
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
|
||||
```
|
||||
|
||||
For devices with huge BARs (Base Address Registers) like the GPU (we need to
|
||||
configure the PCI Express root port properly and allocate enough memory for
|
||||
mapping), we have added a heuristic to Kata to deduce the right settings. Hence,
|
||||
the BARs can be mapped correctly. This functionality is added to
|
||||
[`nvidia/go-nvlib1](https://gitlab.com/nvidia/cloud-native/go-nvlib) which is part
|
||||
of Kata now.
|
||||
|
||||
```sh
|
||||
$ sudo dmesg | grep BAR
|
||||
[ 0.179960] pci 0000:00:04.0: BAR 7: assigned [io 0x1000-0x1fff]
|
||||
[ 0.179962] pci 0000:00:05.0: BAR 7: assigned [io 0x2000-0x2fff]
|
||||
[ 0.179963] pci 0000:00:06.0: BAR 7: assigned [io 0x3000-0x3fff]
|
||||
[ 0.179964] pci 0000:00:07.0: BAR 7: assigned [io 0x4000-0x4fff]
|
||||
[ 0.179966] pci 0000:00:08.0: BAR 7: assigned [io 0x5000-0x5fff]
|
||||
[ 0.179967] pci 0000:00:09.0: BAR 7: assigned [io 0x6000-0x6fff]
|
||||
[ 0.179968] pci 0000:00:0a.0: BAR 7: assigned [io 0x7000-0x7fff]
|
||||
[ 0.179969] pci 0000:00:0b.0: BAR 7: assigned [io 0x8000-0x8fff]
|
||||
[ 2.115912] pci 0000:01:00.0: BAR 0: assigned [mem 0x13000000000-0x13001ffffff 64bit pref]
|
||||
[ 2.116203] pci 0000:01:00.0: BAR 2: assigned [mem 0x13002000000-0x130027fffff 64bit pref]
|
||||
[ 2.683132] pci 0000:02:00.0: BAR 0: assigned [mem 0x12000000000-0x12001ffffff 64bit pref]
|
||||
[ 2.683419] pci 0000:02:00.0: BAR 2: assigned [mem 0x12002000000-0x120027fffff 64bit pref]
|
||||
[ 2.959155] pci 0000:03:00.0: BAR 1: assigned [mem 0x11000000000-0x117ffffffff 64bit pref]
|
||||
[ 2.959345] pci 0000:03:00.0: BAR 3: assigned [mem 0x11800000000-0x11801ffffff 64bit pref]
|
||||
[ 2.959523] pci 0000:03:00.0: BAR 0: assigned [mem 0xf9000000-0xf9ffffff]
|
||||
[ 2.966119] pci 0000:04:00.0: BAR 1: assigned [mem 0x10000000000-0x107ffffffff 64bit pref]
|
||||
[ 2.966295] pci 0000:04:00.0: BAR 3: assigned [mem 0x10800000000-0x10801ffffff 64bit pref]
|
||||
[ 2.966472] pci 0000:04:00.0: BAR 0: assigned [mem 0xf7000000-0xf7ffffff]
|
||||
```
|
||||
|
||||
The NVIDIA driver stack in this case would refuse to do P2P communication since
|
||||
(1) the topology is not what it expects, (2) we do not have a qualified
|
||||
chipset. Since our P2P devices are not connected to a PCI Express switch port,
|
||||
we need to provide additional information to support the P2P functionality. One
|
||||
way of providing such meta information would be to annotate the container; most
|
||||
of the settings in Kata's configuration file can be overridden via annotations,
|
||||
but this limits the flexibility, and a user would need to update all the
|
||||
containers that they want to run with Kata. The goal is to make such things as
|
||||
transparent as possible, so we also introduced
|
||||
[CDI](https://github.com/container-orchestrated-devices/container-device-interface)
|
||||
(Container Device Interface) to Kata. CDI is a
[specification](https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md)
|
||||
for container runtimes to support third-party devices.
|
||||
|
||||
As written before, we can provide a clique ID for the devices that belong
|
||||
together and are capable of doing P2P. This information is provided to the
|
||||
hypervisor, which will set up things in the VM accordingly. Let's suppose the
|
||||
user wanted to do GPUDirect RDMA with the first GPU and the NIC that reside on
|
||||
the same DPU, one could provide the specification telling the hypervisor that
|
||||
they belong to the same clique.
|
||||
|
||||
```yaml
|
||||
# /etc/cdi/nvidia.yaml
|
||||
cdiVersion: 0.4.0
|
||||
kind: nvidia.com/gpu
|
||||
devices:
|
||||
- name: gpu0
|
||||
annotations:
|
||||
bdf: "41:00.0"
|
||||
clique-id: "0"
|
||||
containerEdits:
|
||||
deviceNodes:
|
||||
- path: "/dev/vfio/71"
|
||||
|
||||
# /etc/cdi/mellanox.yaml
|
||||
cdiVersion: 0.4.0
|
||||
kind: mellanox.com/nic
|
||||
devices:
|
||||
- name: nic0
|
||||
annotations:
|
||||
bdf: "3d:00.0"
|
||||
clique-id: "0"
|
||||
attach-pci: "true"
|
||||
containerEdits:
|
||||
deviceNodes:
|
||||
- path: "/dev/vfio/66"
|
||||
```
|
||||
|
||||
Since this setting is bound to the device and not the container we do not need
|
||||
to alter the container; just allocate the right resource and GPUDirect RDMA would
|
||||
be set up correctly. Rather than exposing them separately, an idea would be to
|
||||
expose a GPUDirect RDMA device via NFD (Node Feature Discovery) that combines
|
||||
both of them; this way, we could make sure that the right pair is allocated and
|
||||
used more on Kubernetes deployment in the next section.
|
||||
|
||||
The GPU driver stack is leveraging the PCI Express virtual P2P approval
|
||||
capability, but the NIC stack does not use this now. One of the action items is
|
||||
to enable MOFED to read the P2P approval capability and enable ATS and ACS
|
||||
settings as described above.
|
||||
|
||||
This way, we could enable GPUDirect P2P and GPUDirect RDMA on any topology
|
||||
presented to the VM application. It is the responsibility of the administrator
|
||||
or infrastructure engineer to provide the right information either via
|
||||
annotations or a CDI specification.
|
||||
|
||||
## Host Topology Replication
|
||||
|
||||
The other way to represent the PCI Express topology in the VM is to replicate a
|
||||
subset of the topology needed to support the P2P use case inside the VM. Similar
|
||||
to the configuration for the root ports, we can easily configure the usage of
|
||||
PCI Express switch ports to hotplug the devices.
|
||||
|
||||
```toml
|
||||
# /etc/kata-containers/configuration.toml
|
||||
|
||||
# VFIO devices are hotplugged on a bridge by default.
|
||||
# Enable hot plugging on the root bus. This may be required for devices with
|
||||
# a large PCI bar, as this is a current limitation with hot plugging on
|
||||
# a bridge.
|
||||
# Default "bridge-port"
|
||||
hotplug_vfio = "switch-port"
|
||||
|
||||
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
|
||||
# Use this parameter when using some large PCI bar devices, such as Nvidia GPU
|
||||
# The value means the number of pcie_switch_port
|
||||
# This value is valid when hotplug_vfio is "switch-port" and machine_type is "q35"
|
||||
# Default 0
|
||||
pcie_switch_port = 8
|
||||
```
|
||||
|
||||
Each device that is passed through is attached to a PCI Express downstream port
|
||||
as illustrated below. We can even replicate the host’s two DPUs topologies with
|
||||
added metadata through the CDI. Most of the time, a container only needs one
|
||||
pair of GPU and NIC for GPUDirect RDMA. This is more of a showcase of what we
|
||||
can do with the power of Kata and CDI. One could even think of adding groups of
|
||||
devices that support P2P, even from different CPU sockets or NUMA nodes, into
|
||||
one container; indeed, the first group is NUMA node 0 (red), and the second
|
||||
group is NUMA node 1 (green). Since they are grouped correctly, P2P would be
|
||||
enabled naturally inside a group, aka clique ID.
|
||||
|
||||
```sh
|
||||
$ lspci -tv
|
||||
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
|
||||
+-01.0 Red Hat, Inc. Virtio console
|
||||
+-02.0 Red Hat, Inc. Virtio SCSI
|
||||
+-03.0 Red Hat, Inc. Virtio RNG
|
||||
+-04.0-[01-04]----00.0-[02-04]--+-00.0-[03]----00.0 NVIDIA Corporation Device 20b8
|
||||
| \-01.0-[04]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
|
||||
+-05.0-[05-08]----00.0-[06-08]--+-00.0-[07]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
|
||||
| \-01.0-[08]----00.0 NVIDIA Corporation Device 20b8
|
||||
+-06.0 Red Hat, Inc. Virtio socket
|
||||
+-07.0 Red Hat, Inc. Virtio file system
|
||||
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
|
||||
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
|
||||
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
|
||||
```
|
||||
|
||||
The configuration of using either the root port or switch port can be applied on
|
||||
a per Container or Pod basis, meaning we can switch PCI Express topologies on
|
||||
each run of an application.
|
||||
|
||||
## Hypervisor Resource Limits
|
||||
|
||||
Every hypervisor will have resource limits in terms of how many PCI Express root
|
||||
ports, switch ports, or bridge ports can be created, especially with devices
|
||||
that need to reserve a 4K IO range per PCI specification. Each instance of root
|
||||
or switch port will consume 4K IO of very limited capacity, 64k is the maximum.
|
||||
|
||||
Simple math brings us to the conclusion that we can have a maximum of 16 PCI
|
||||
Express root ports or 16 PCI Express switch ports in QEMU if devices with IO
|
||||
BARs are used in the PCI Express hierarchy.
|
||||
|
||||
Additionally, one can have 32 slots on the PCI root bus and a maximum of 256
|
||||
slots for the complete PCI(e) topology.
|
||||
|
||||
Per default, QEMU will attach a multi-function device in the last slot on the
|
||||
PCI root bus,
|
||||
|
||||
```sh
|
||||
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
|
||||
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
|
||||
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
|
||||
```
|
||||
|
||||
Kata will additionally add `virtio-xxx-pci` devices consuming (5 slots) plus a
|
||||
PCIe-PCI-bridge (1 slot) and a DRAM controller (1 slot), meaning per default, we
|
||||
have already eight slots used. This leaves us 24 slots for adding other devices
|
||||
to the root bus.
|
||||
|
||||
The problem that arises here is one use-case from a customer that uses recent
|
||||
RTX GPUs with Kata. The user wanted to pass through eight of these GPUs into one
|
||||
container and ran into issues. The problem is that those cards often consist of
|
||||
four individual device nodes: GPU, Audio, and two USB controller devices (some
|
||||
cards have a USB-C output).
|
||||
|
||||
These devices are grouped into one IOMMU group. Since one needs to pass through
|
||||
the complete IOMMU group into the VM, we need to allocate 32 PCI Express root
|
||||
ports or 32 PCI Express switch ports, which is technically impossible due to the
|
||||
resource limits outlined above. Since all the devices appear as PCI Express
|
||||
devices, we need to hotplug those into a root or switch port.
|
||||
|
||||
The solution to this problem is leveraging CDI. For each device, add the
|
||||
information if it is going to be hotplugged as a PCI Express or PCI device,
|
||||
which results in either using a PCI Express root/switch port or an ordinary PCI
|
||||
bridge. PCI bridges are not affected by the limited IO range. This way, the GPU
|
||||
is attached as a PCI Express device to a root/switch port and the other three
|
||||
PCI devices to a PCI bridge, leaving enough resources to create the needed PCI
|
||||
Express root/switch ports. For example, we’re going to attach the GPUs to a PCI
|
||||
Express root port and the NICs to a PCI bridge.
|
||||
|
||||
```yaml
# /etc/cdi/mellanox.yaml
|
||||
cdiVersion: 0.4.0
|
||||
kind: mellanox.com/nic
|
||||
devices:
|
||||
- name: nic0
|
||||
annotations:
|
||||
bdf: "3d:00.0"
|
||||
clique-id: "0"
|
||||
attach-pci: "true"
|
||||
containerEdits:
|
||||
deviceNodes:
|
||||
- path: "/dev/vfio/66"
|
||||
- name: nic1
|
||||
annotations:
|
||||
bdf: "3d:00.1"
|
||||
clique-id: "1"
|
||||
attach-pci: "true"
|
||||
containerEdits:
|
||||
deviceNodes:
|
||||
- path: "/dev/vfio/67"
|
||||
```
|
||||
|
||||
The configuration is set to use eight root ports for the GPUs and attach the
|
||||
NICs to a PCI bridge which is connected to a PCI Express-PCI bridge which is the
|
||||
preferred way of introducing a PCI topology in a PCI Express machine.
|
||||
|
||||
```sh
|
||||
$ lspci -tv
|
||||
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
|
||||
+-01.0 Red Hat, Inc. Virtio console
|
||||
+-02.0 Red Hat, Inc. Virtio SCSI
|
||||
+-03.0 Red Hat, Inc. Virtio RNG
|
||||
+-04.0-[01]----00.0 NVIDIA Corporation Device 20b8
|
||||
+-05.0-[02]----00.0 NVIDIA Corporation Device 20b8
|
||||
+-06.0-[03]--
|
||||
+-07.0-[04]--
|
||||
+-08.0-[05]--
|
||||
+-09.0-[06]--
|
||||
+-0a.0-[07]--
|
||||
+-0b.0-[08]--
|
||||
+-0c.0-[09-0a]----00.0-[0a]--+-00.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
|
||||
| \-01.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
|
||||
+-0d.0 Red Hat, Inc. Virtio socket
|
||||
+-0e.0 Red Hat, Inc. Virtio file system
|
||||
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
|
||||
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
|
||||
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
|
||||
```
|
||||
|
||||
The PCI devices will consume a slot of which we have 256 in the PCI(e) topology
|
||||
and leave scarce resources for the needed PCI Express devices.
|
||||
@@ -39,11 +39,9 @@ use std::path::Path;
|
||||
|
||||
const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online";
|
||||
|
||||
// Convenience macro to obtain the scope logger
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger().new(o!("subsystem" => "cgroups"))
|
||||
};
|
||||
// Convenience function to obtain the scope logger.
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger().new(o!("subsystem" => "cgroups"))
|
||||
}
|
||||
|
||||
macro_rules! get_controller_or_return_singular_none {
|
||||
@@ -82,7 +80,7 @@ impl CgroupManager for Manager {
|
||||
|
||||
fn set(&self, r: &LinuxResources, update: bool) -> Result<()> {
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"cgroup manager set resources for container. Resources input {:?}", r
|
||||
);
|
||||
|
||||
@@ -120,7 +118,7 @@ impl CgroupManager for Manager {
|
||||
|
||||
// set devices resources
|
||||
set_devices_resources(&self.cgroup, &r.devices, res);
|
||||
info!(sl!(), "resources after processed {:?}", res);
|
||||
info!(sl(), "resources after processed {:?}", res);
|
||||
|
||||
// apply resources
|
||||
self.cgroup.apply(res)?;
|
||||
@@ -197,7 +195,7 @@ impl CgroupManager for Manager {
|
||||
if guest_cpuset.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
info!(sl!(), "update_cpuset_path to: {}", guest_cpuset);
|
||||
info!(sl(), "update_cpuset_path to: {}", guest_cpuset);
|
||||
|
||||
let h = cgroups::hierarchies::auto();
|
||||
let root_cg = h.root_control_group();
|
||||
@@ -205,12 +203,12 @@ impl CgroupManager for Manager {
|
||||
let root_cpuset_controller: &CpuSetController = root_cg.controller_of().unwrap();
|
||||
let path = root_cpuset_controller.path();
|
||||
let root_path = Path::new(path);
|
||||
info!(sl!(), "root cpuset path: {:?}", &path);
|
||||
info!(sl(), "root cpuset path: {:?}", &path);
|
||||
|
||||
let container_cpuset_controller: &CpuSetController = self.cgroup.controller_of().unwrap();
|
||||
let path = container_cpuset_controller.path();
|
||||
let container_path = Path::new(path);
|
||||
info!(sl!(), "container cpuset path: {:?}", &path);
|
||||
info!(sl(), "container cpuset path: {:?}", &path);
|
||||
|
||||
let mut paths = vec![];
|
||||
for ancestor in container_path.ancestors() {
|
||||
@@ -219,7 +217,7 @@ impl CgroupManager for Manager {
|
||||
}
|
||||
paths.push(ancestor);
|
||||
}
|
||||
info!(sl!(), "parent paths to update cpuset: {:?}", &paths);
|
||||
info!(sl(), "parent paths to update cpuset: {:?}", &paths);
|
||||
|
||||
let mut i = paths.len();
|
||||
loop {
|
||||
@@ -233,7 +231,7 @@ impl CgroupManager for Manager {
|
||||
.to_str()
|
||||
.unwrap()
|
||||
.trim_start_matches(root_path.to_str().unwrap());
|
||||
info!(sl!(), "updating cpuset for parent path {:?}", &r_path);
|
||||
info!(sl(), "updating cpuset for parent path {:?}", &r_path);
|
||||
let cg = new_cgroup(cgroups::hierarchies::auto(), r_path)?;
|
||||
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
|
||||
cpuset_controller.set_cpus(guest_cpuset)?;
|
||||
@@ -241,7 +239,7 @@ impl CgroupManager for Manager {
|
||||
|
||||
if !container_cpuset.is_empty() {
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"updating cpuset for container path: {:?} cpuset: {}",
|
||||
&container_path,
|
||||
container_cpuset
|
||||
@@ -276,7 +274,7 @@ fn set_network_resources(
|
||||
network: &LinuxNetwork,
|
||||
res: &mut cgroups::Resources,
|
||||
) {
|
||||
info!(sl!(), "cgroup manager set network");
|
||||
info!(sl(), "cgroup manager set network");
|
||||
|
||||
// set classid
|
||||
// description can be found at https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/net_cls.html
|
||||
@@ -303,7 +301,7 @@ fn set_devices_resources(
|
||||
device_resources: &[LinuxDeviceCgroup],
|
||||
res: &mut cgroups::Resources,
|
||||
) {
|
||||
info!(sl!(), "cgroup manager set devices");
|
||||
info!(sl(), "cgroup manager set devices");
|
||||
let mut devices = vec![];
|
||||
|
||||
for d in device_resources.iter() {
|
||||
@@ -332,7 +330,7 @@ fn set_hugepages_resources(
|
||||
hugepage_limits: &[LinuxHugepageLimit],
|
||||
res: &mut cgroups::Resources,
|
||||
) {
|
||||
info!(sl!(), "cgroup manager set hugepage");
|
||||
info!(sl(), "cgroup manager set hugepage");
|
||||
let mut limits = vec![];
|
||||
let hugetlb_controller = cg.controller_of::<HugeTlbController>();
|
||||
|
||||
@@ -346,7 +344,7 @@ fn set_hugepages_resources(
|
||||
limits.push(hr);
|
||||
} else {
|
||||
warn!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"{} page size support cannot be verified, dropping requested limit", l.page_size
|
||||
);
|
||||
}
|
||||
@@ -359,7 +357,7 @@ fn set_block_io_resources(
|
||||
blkio: &LinuxBlockIo,
|
||||
res: &mut cgroups::Resources,
|
||||
) {
|
||||
info!(sl!(), "cgroup manager set block io");
|
||||
info!(sl(), "cgroup manager set block io");
|
||||
|
||||
res.blkio.weight = blkio.weight;
|
||||
res.blkio.leaf_weight = blkio.leaf_weight;
|
||||
@@ -387,13 +385,13 @@ fn set_block_io_resources(
|
||||
}
|
||||
|
||||
fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
|
||||
info!(sl!(), "cgroup manager set cpu");
|
||||
info!(sl(), "cgroup manager set cpu");
|
||||
|
||||
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
|
||||
|
||||
if !cpu.cpus.is_empty() {
|
||||
if let Err(e) = cpuset_controller.set_cpus(&cpu.cpus) {
|
||||
warn!(sl!(), "write cpuset failed: {:?}", e);
|
||||
warn!(sl(), "write cpuset failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -424,7 +422,7 @@ fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
|
||||
}
|
||||
|
||||
fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool) -> Result<()> {
|
||||
info!(sl!(), "cgroup manager set memory");
|
||||
info!(sl(), "cgroup manager set memory");
|
||||
let mem_controller: &MemController = cg.controller_of().unwrap();
|
||||
|
||||
if !update {
|
||||
@@ -493,7 +491,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool
|
||||
}
|
||||
|
||||
fn set_pids_resources(cg: &cgroups::Cgroup, pids: &LinuxPids) -> Result<()> {
|
||||
info!(sl!(), "cgroup manager set pids");
|
||||
info!(sl(), "cgroup manager set pids");
|
||||
let pid_controller: &PidController = cg.controller_of().unwrap();
|
||||
let v = if pids.limit > 0 {
|
||||
MaxValue::Value(pids.limit)
|
||||
@@ -965,7 +963,7 @@ pub fn get_paths() -> Result<HashMap<String, String>> {
|
||||
for l in fs::read_to_string(PATHS)?.lines() {
|
||||
let fl: Vec<&str> = l.split(':').collect();
|
||||
if fl.len() != 3 {
|
||||
info!(sl!(), "Corrupted cgroup data!");
|
||||
info!(sl(), "Corrupted cgroup data!");
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -986,7 +984,7 @@ pub fn get_mounts(paths: &HashMap<String, String>) -> Result<HashMap<String, Str
|
||||
let post: Vec<&str> = p[1].split(' ').collect();
|
||||
|
||||
if post.len() != 3 {
|
||||
warn!(sl!(), "can't parse {} line {:?}", MOUNTS, l);
|
||||
warn!(sl(), "can't parse {} line {:?}", MOUNTS, l);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@@ -16,11 +16,9 @@ use inotify::{Inotify, WatchMask};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::sync::mpsc::{channel, Receiver};
|
||||
|
||||
// Convenience macro to obtain the scope logger
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
|
||||
};
|
||||
// Convenience function to obtain the scope logger.
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
|
||||
}
|
||||
|
||||
pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
|
||||
@@ -38,7 +36,7 @@ pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
|
||||
fn get_value_from_cgroup(path: &Path, key: &str) -> Result<i64> {
|
||||
let content = fs::read_to_string(path)?;
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"get_value_from_cgroup file: {:?}, content: {}", &path, &content
|
||||
);
|
||||
|
||||
@@ -67,11 +65,11 @@ async fn register_memory_event_v2(
|
||||
let event_control_path = Path::new(&cg_dir).join(memory_event_name);
|
||||
let cgroup_event_control_path = Path::new(&cg_dir).join(cgroup_event_name);
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"register_memory_event_v2 event_control_path: {:?}", &event_control_path
|
||||
);
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"register_memory_event_v2 cgroup_event_control_path: {:?}", &cgroup_event_control_path
|
||||
);
|
||||
|
||||
@@ -82,8 +80,8 @@ async fn register_memory_event_v2(
|
||||
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
|
||||
let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
|
||||
|
||||
info!(sl!(), "ev_wd: {:?}", ev_wd);
|
||||
info!(sl!(), "cg_wd: {:?}", cg_wd);
|
||||
info!(sl(), "ev_wd: {:?}", ev_wd);
|
||||
info!(sl(), "cg_wd: {:?}", cg_wd);
|
||||
|
||||
let (sender, receiver) = channel(100);
|
||||
let containere_id = containere_id.to_string();
|
||||
@@ -97,17 +95,17 @@ async fn register_memory_event_v2(
|
||||
while let Some(event_or_error) = stream.next().await {
|
||||
let event = event_or_error.unwrap();
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"container[{}] get event for container: {:?}", &containere_id, &event
|
||||
);
|
||||
// info!("is1: {}", event.wd == wd1);
|
||||
info!(sl!(), "event.wd: {:?}", event.wd);
|
||||
info!(sl(), "event.wd: {:?}", event.wd);
|
||||
|
||||
if event.wd == ev_wd {
|
||||
let oom = get_value_from_cgroup(&event_control_path, "oom_kill");
|
||||
if oom.unwrap_or(0) > 0 {
|
||||
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
|
||||
error!(sl!(), "send containere_id failed, error: {:?}", e);
|
||||
error!(sl(), "send containere_id failed, error: {:?}", e);
|
||||
});
|
||||
return;
|
||||
}
|
||||
@@ -171,13 +169,13 @@ async fn register_memory_event(
|
||||
let mut buf = [0u8; 8];
|
||||
match eventfd_stream.read(&mut buf).await {
|
||||
Err(err) => {
|
||||
warn!(sl!(), "failed to read from eventfd: {:?}", err);
|
||||
warn!(sl(), "failed to read from eventfd: {:?}", err);
|
||||
return;
|
||||
}
|
||||
Ok(_) => {
|
||||
let content = fs::read_to_string(path.clone());
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"cgroup event for container: {}, path: {:?}, content: {:?}",
|
||||
&containere_id,
|
||||
&path,
|
||||
@@ -193,7 +191,7 @@ async fn register_memory_event(
|
||||
}
|
||||
|
||||
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
|
||||
error!(sl!(), "send containere_id failed, error: {:?}", e);
|
||||
error!(sl(), "send containere_id failed, error: {:?}", e);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1596,10 +1596,8 @@ mod tests {
|
||||
use tempfile::tempdir;
|
||||
use test_utils::skip_if_not_root;
|
||||
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger()
|
||||
};
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger()
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1854,7 +1852,7 @@ mod tests {
|
||||
let _ = new_linux_container_and_then(|mut c: LinuxContainer| {
|
||||
c.processes.insert(
|
||||
1,
|
||||
Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap(),
|
||||
Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap(),
|
||||
);
|
||||
let p = c.get_process("123");
|
||||
assert!(p.is_ok(), "Expecting Ok, Got {:?}", p);
|
||||
@@ -1881,7 +1879,7 @@ mod tests {
|
||||
let (c, _dir) = new_linux_container();
|
||||
let ret = c
|
||||
.unwrap()
|
||||
.start(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
|
||||
.start(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
|
||||
.await;
|
||||
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
|
||||
}
|
||||
@@ -1891,7 +1889,7 @@ mod tests {
|
||||
let (c, _dir) = new_linux_container();
|
||||
let ret = c
|
||||
.unwrap()
|
||||
.run(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
|
||||
.run(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
|
||||
.await;
|
||||
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
|
||||
}
|
||||
|
||||
@@ -161,7 +161,7 @@ impl Process {
|
||||
|
||||
pub fn notify_term_close(&mut self) {
|
||||
let notify = self.term_exit_notifier.clone();
|
||||
notify.notify_waiters();
|
||||
notify.notify_one();
|
||||
}
|
||||
|
||||
pub fn close_stdin(&mut self) {
|
||||
|
||||
@@ -26,11 +26,9 @@ use oci::{LinuxDeviceCgroup, LinuxResources, Spec};
|
||||
use protocols::agent::Device;
|
||||
use tracing::instrument;
|
||||
|
||||
// Convenience macro to obtain the scope logger
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger().new(o!("subsystem" => "device"))
|
||||
};
|
||||
// Convenience function to obtain the scope logger.
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger().new(o!("subsystem" => "device"))
|
||||
}
|
||||
|
||||
const VM_ROOTFS: &str = "/";
|
||||
@@ -78,7 +76,7 @@ where
|
||||
{
|
||||
let syspci = Path::new(&syspci);
|
||||
let drv = drv.as_ref();
|
||||
info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv);
|
||||
info!(sl(), "rebind_pci_driver: {} => {:?}", dev, drv);
|
||||
|
||||
let devpath = syspci.join("devices").join(dev.to_string());
|
||||
let overridepath = &devpath.join("driver_override");
|
||||
@@ -606,7 +604,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -
|
||||
let host_minor = specdev.minor;
|
||||
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"update_spec_devices() updating device";
|
||||
"container_path" => &specdev.path,
|
||||
"type" => &specdev.r#type,
|
||||
@@ -659,7 +657,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -
|
||||
if let Some(update) = res_updates.get(&(host_type.as_str(), host_major, host_minor))
|
||||
{
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"update_spec_devices() updating resource";
|
||||
"type" => &host_type,
|
||||
"host_major" => host_major,
|
||||
@@ -923,7 +921,7 @@ pub async fn add_devices(
|
||||
#[instrument]
|
||||
async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
|
||||
// log before validation to help with debugging gRPC protocol version differences.
|
||||
info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
|
||||
info!(sl(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
|
||||
device.id, device.type_, device.vm_path, device.container_path, device.options);
|
||||
|
||||
if device.type_.is_empty() {
|
||||
|
||||
@@ -38,11 +38,9 @@ const KATA_CC_IMAGE_WORK_DIR: &str = "/run/image/";
|
||||
const KATA_CC_PAUSE_BUNDLE: &str = "/pause_bundle";
|
||||
const CONFIG_JSON: &str = "config.json";
|
||||
|
||||
// Convenience macro to obtain the scope logger
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger()
|
||||
};
|
||||
// Convenience function to obtain the scope logger.
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger().new(o!("subsystem" => "cgroups"))
|
||||
}
|
||||
|
||||
pub struct ImageService {
|
||||
@@ -57,18 +55,17 @@ impl ImageService {
|
||||
env::set_var("CC_IMAGE_WORK_DIR", KATA_CC_IMAGE_WORK_DIR);
|
||||
let mut image_client = ImageClient::default();
|
||||
|
||||
let image_policy_file = &AGENT_CONFIG.read().await.image_policy_file;
|
||||
let image_policy_file = &AGENT_CONFIG.image_policy_file;
|
||||
if !image_policy_file.is_empty() {
|
||||
image_client.config.file_paths.sigstore_config = image_policy_file.clone();
|
||||
}
|
||||
|
||||
let simple_signing_sigstore_config =
|
||||
&AGENT_CONFIG.read().await.simple_signing_sigstore_config;
|
||||
let simple_signing_sigstore_config = &AGENT_CONFIG.simple_signing_sigstore_config;
|
||||
if !simple_signing_sigstore_config.is_empty() {
|
||||
image_client.config.file_paths.sigstore_config = simple_signing_sigstore_config.clone();
|
||||
}
|
||||
|
||||
let image_registry_auth_file = &AGENT_CONFIG.read().await.image_registry_auth_file;
|
||||
let image_registry_auth_file = &AGENT_CONFIG.image_registry_auth_file;
|
||||
if !image_registry_auth_file.is_empty() {
|
||||
image_client.config.file_paths.auth_file = image_registry_auth_file.clone();
|
||||
}
|
||||
@@ -88,7 +85,7 @@ impl ImageService {
|
||||
return Err(anyhow!("Pause image not present in rootfs"));
|
||||
}
|
||||
|
||||
info!(sl!(), "use guest pause image cid {:?}", cid);
|
||||
info!(sl(), "use guest pause image cid {:?}", cid);
|
||||
let pause_bundle = Path::new(CONTAINER_BASE).join(cid);
|
||||
let pause_rootfs = pause_bundle.join("rootfs");
|
||||
let pause_config = pause_bundle.join(CONFIG_JSON);
|
||||
@@ -159,12 +156,12 @@ impl ImageService {
|
||||
async fn pull_image(&self, req: &image::PullImageRequest) -> Result<String> {
|
||||
env::set_var("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH);
|
||||
|
||||
let https_proxy = &AGENT_CONFIG.read().await.https_proxy;
|
||||
let https_proxy = &AGENT_CONFIG.https_proxy;
|
||||
if !https_proxy.is_empty() {
|
||||
env::set_var("HTTPS_PROXY", https_proxy);
|
||||
}
|
||||
|
||||
let no_proxy = &AGENT_CONFIG.read().await.no_proxy;
|
||||
let no_proxy = &AGENT_CONFIG.no_proxy;
|
||||
if !no_proxy.is_empty() {
|
||||
env::set_var("NO_PROXY", no_proxy);
|
||||
}
|
||||
@@ -179,7 +176,7 @@ impl ImageService {
|
||||
return Ok(image.to_owned());
|
||||
}
|
||||
|
||||
let aa_kbc_params = &AGENT_CONFIG.read().await.aa_kbc_params;
|
||||
let aa_kbc_params = &AGENT_CONFIG.aa_kbc_params;
|
||||
if !aa_kbc_params.is_empty() {
|
||||
match self.attestation_agent_started.compare_exchange_weak(
|
||||
false,
|
||||
@@ -188,22 +185,21 @@ impl ImageService {
|
||||
Ordering::SeqCst,
|
||||
) {
|
||||
Ok(_) => Self::init_attestation_agent()?,
|
||||
Err(_) => info!(sl!(), "Attestation Agent already running"),
|
||||
Err(_) => info!(sl(), "Attestation Agent already running"),
|
||||
}
|
||||
}
|
||||
// If the attestation-agent is being used, then enable the authenticated credentials support
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"image_client.config.auth set to: {}",
|
||||
!aa_kbc_params.is_empty()
|
||||
);
|
||||
self.image_client.lock().await.config.auth = !aa_kbc_params.is_empty();
|
||||
|
||||
// Read enable signature verification from the agent config and set it in the image_client
|
||||
let enable_signature_verification =
|
||||
&AGENT_CONFIG.read().await.enable_signature_verification;
|
||||
let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification;
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"enable_signature_verification set to: {}", enable_signature_verification
|
||||
);
|
||||
self.image_client.lock().await.config.security_validate = *enable_signature_verification;
|
||||
@@ -215,7 +211,7 @@ impl ImageService {
|
||||
|
||||
let decrypt_config = format!("provider:attestation-agent:{}", aa_kbc_params);
|
||||
|
||||
info!(sl!(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
|
||||
info!(sl(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
|
||||
// Image layers will store at KATA_CC_IMAGE_WORK_DIR, generated bundles
|
||||
// with rootfs and config.json will store under CONTAINER_BASE/cid.
|
||||
let res = self
|
||||
@@ -228,13 +224,13 @@ impl ImageService {
|
||||
match res {
|
||||
Ok(image) => {
|
||||
info!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"pull and unpack image {:?}, cid: {:?}, with image-rs succeed. ", image, cid
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
sl!(),
|
||||
sl(),
|
||||
"pull and unpack image {:?}, cid: {:?}, with image-rs failed with {:?}. ",
|
||||
image,
|
||||
cid,
|
||||
|
||||
@@ -65,7 +65,7 @@ use tokio::{
|
||||
io::AsyncWrite,
|
||||
sync::{
|
||||
watch::{channel, Receiver},
|
||||
Mutex, RwLock,
|
||||
Mutex,
|
||||
},
|
||||
task::JoinHandle,
|
||||
};
|
||||
@@ -84,12 +84,11 @@ cfg_if! {
|
||||
const NAME: &str = "kata-agent";
|
||||
|
||||
lazy_static! {
|
||||
static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new(
|
||||
static ref AGENT_CONFIG: AgentConfig =
|
||||
// Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig
|
||||
// clap::Parser::parse() greedily process all command line input including cargo test parameters,
|
||||
// so should only be used inside main.
|
||||
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap()
|
||||
));
|
||||
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap();
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -182,13 +181,13 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
|
||||
lazy_static::initialize(&AGENT_CONFIG);
|
||||
|
||||
init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?;
|
||||
init_agent_as_init(&logger, AGENT_CONFIG.unified_cgroup_hierarchy)?;
|
||||
drop(logger_async_guard);
|
||||
} else {
|
||||
lazy_static::initialize(&AGENT_CONFIG);
|
||||
}
|
||||
|
||||
let config = AGENT_CONFIG.read().await;
|
||||
let config = &AGENT_CONFIG;
|
||||
let log_vport = config.log_vport as u32;
|
||||
|
||||
let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone()));
|
||||
@@ -201,7 +200,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
let (logger, logger_async_guard) =
|
||||
logging::create_logger(NAME, "agent", config.log_level, writer);
|
||||
|
||||
announce(&logger, &config);
|
||||
announce(&logger, config);
|
||||
|
||||
// This variable is required as it enables the global (and crucially static) logger,
|
||||
// which is required to satisfy the the lifetime constraints of the auto-generated gRPC code.
|
||||
@@ -229,7 +228,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
let span_guard = root_span.enter();
|
||||
|
||||
// Start the sandbox and wait for its ttRPC server to end
|
||||
start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
|
||||
start_sandbox(&logger, config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
|
||||
|
||||
// Install a NOP logger for the remainder of the shutdown sequence
|
||||
// to ensure any log calls made by local crates using the scope logger
|
||||
|
||||
@@ -15,11 +15,9 @@ use tracing::instrument;
|
||||
const NAMESPACE_KATA_AGENT: &str = "kata_agent";
|
||||
const NAMESPACE_KATA_GUEST: &str = "kata_guest";
|
||||
|
||||
// Convenience macro to obtain the scope logger
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger().new(o!("subsystem" => "metrics"))
|
||||
};
|
||||
// Convenience function to obtain the scope logger.
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger().new(o!("subsystem" => "metrics"))
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -139,7 +137,7 @@ fn update_agent_metrics() -> Result<()> {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
// FIXME: return Ok for all errors?
|
||||
warn!(sl!(), "failed to create process instance: {:?}", e);
|
||||
warn!(sl(), "failed to create process instance: {:?}", e);
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
@@ -160,7 +158,7 @@ fn update_agent_metrics() -> Result<()> {
|
||||
// io
|
||||
match me.io() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get process io stat: {:?}", err);
|
||||
info!(sl(), "failed to get process io stat: {:?}", err);
|
||||
}
|
||||
Ok(io) => {
|
||||
set_gauge_vec_proc_io(&AGENT_IO_STAT, &io);
|
||||
@@ -169,7 +167,7 @@ fn update_agent_metrics() -> Result<()> {
|
||||
|
||||
match me.stat() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get process stat: {:?}", err);
|
||||
info!(sl(), "failed to get process stat: {:?}", err);
|
||||
}
|
||||
Ok(stat) => {
|
||||
set_gauge_vec_proc_stat(&AGENT_PROC_STAT, &stat);
|
||||
@@ -177,7 +175,7 @@ fn update_agent_metrics() -> Result<()> {
|
||||
}
|
||||
|
||||
match me.status() {
|
||||
Err(err) => error!(sl!(), "failed to get process status: {:?}", err),
|
||||
Err(err) => error!(sl(), "failed to get process status: {:?}", err),
|
||||
Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status),
|
||||
}
|
||||
|
||||
@@ -189,7 +187,7 @@ fn update_guest_metrics() {
|
||||
// try get load and task info
|
||||
match procfs::LoadAverage::new() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get guest LoadAverage: {:?}", err);
|
||||
info!(sl(), "failed to get guest LoadAverage: {:?}", err);
|
||||
}
|
||||
Ok(load) => {
|
||||
GUEST_LOAD
|
||||
@@ -209,7 +207,7 @@ fn update_guest_metrics() {
|
||||
// try to get disk stats
|
||||
match procfs::diskstats() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get guest diskstats: {:?}", err);
|
||||
info!(sl(), "failed to get guest diskstats: {:?}", err);
|
||||
}
|
||||
Ok(diskstats) => {
|
||||
for diskstat in diskstats {
|
||||
@@ -221,7 +219,7 @@ fn update_guest_metrics() {
|
||||
// try to get vm stats
|
||||
match procfs::vmstat() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get guest vmstat: {:?}", err);
|
||||
info!(sl(), "failed to get guest vmstat: {:?}", err);
|
||||
}
|
||||
Ok(vmstat) => {
|
||||
for (k, v) in vmstat {
|
||||
@@ -233,7 +231,7 @@ fn update_guest_metrics() {
|
||||
// cpu stat
|
||||
match procfs::KernelStats::new() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get guest KernelStats: {:?}", err);
|
||||
info!(sl(), "failed to get guest KernelStats: {:?}", err);
|
||||
}
|
||||
Ok(kernel_stats) => {
|
||||
set_gauge_vec_cpu_time(&GUEST_CPU_TIME, "total", &kernel_stats.total);
|
||||
@@ -246,7 +244,7 @@ fn update_guest_metrics() {
|
||||
// try to get net device stats
|
||||
match procfs::net::dev_status() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get guest net::dev_status: {:?}", err);
|
||||
info!(sl(), "failed to get guest net::dev_status: {:?}", err);
|
||||
}
|
||||
Ok(devs) => {
|
||||
// netdev: map[string]procfs::net::DeviceStatus
|
||||
@@ -259,7 +257,7 @@ fn update_guest_metrics() {
|
||||
// get statistics about memory from /proc/meminfo
|
||||
match procfs::Meminfo::new() {
|
||||
Err(err) => {
|
||||
info!(sl!(), "failed to get guest Meminfo: {:?}", err);
|
||||
info!(sl(), "failed to get guest Meminfo: {:?}", err);
|
||||
}
|
||||
Ok(meminfo) => {
|
||||
set_gauge_vec_meminfo(&GUEST_MEMINFO, &meminfo);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -69,7 +69,7 @@ macro_rules! trace_rpc_call {
|
||||
propagator.extract(&extract_carrier_from_ttrpc($ctx))
|
||||
});
|
||||
|
||||
info!(sl!(), "rpc call from shim to agent: {:?}", $name);
|
||||
info!(sl(), "rpc call from shim to agent: {:?}", $name);
|
||||
|
||||
// generate tracing span
|
||||
let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req);
|
||||
|
||||
@@ -19,11 +19,9 @@ use tokio::sync::watch::Receiver;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::instrument;
|
||||
|
||||
// Convenience macro to obtain the scope logger
|
||||
macro_rules! sl {
|
||||
() => {
|
||||
slog_scope::logger().new(o!("subsystem" => "uevent"))
|
||||
};
|
||||
// Convenience function to obtain the scope logger.
|
||||
fn sl() -> slog::Logger {
|
||||
slog_scope::logger().new(o!("subsystem" => "uevent"))
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, PartialEq, Eq)]
|
||||
@@ -120,11 +118,11 @@ pub async fn wait_for_uevent(
|
||||
) -> Result<Uevent> {
|
||||
let logprefix = format!("Waiting for {:?}", &matcher);
|
||||
|
||||
info!(sl!(), "{}", logprefix);
|
||||
info!(sl(), "{}", logprefix);
|
||||
let mut sb = sandbox.lock().await;
|
||||
for uev in sb.uevent_map.values() {
|
||||
if matcher.is_match(uev) {
|
||||
info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev);
|
||||
info!(sl(), "{}: found {:?} in uevent map", logprefix, &uev);
|
||||
return Ok(uev.clone());
|
||||
}
|
||||
}
|
||||
@@ -139,9 +137,9 @@ pub async fn wait_for_uevent(
|
||||
sb.uevent_watchers.push(Some((Box::new(matcher), tx)));
|
||||
drop(sb); // unlock
|
||||
|
||||
info!(sl!(), "{}: waiting on channel", logprefix);
|
||||
info!(sl(), "{}: waiting on channel", logprefix);
|
||||
|
||||
let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout;
|
||||
let hotplug_timeout = AGENT_CONFIG.hotplug_timeout;
|
||||
|
||||
let uev = match tokio::time::timeout(hotplug_timeout, rx).await {
|
||||
Ok(v) => v?,
|
||||
@@ -157,7 +155,7 @@ pub async fn wait_for_uevent(
|
||||
}
|
||||
};
|
||||
|
||||
info!(sl!(), "{}: found {:?} on channel", logprefix, &uev);
|
||||
info!(sl(), "{}: found {:?} on channel", logprefix, &uev);
|
||||
Ok(uev)
|
||||
}
|
||||
|
||||
|
||||
@@ -341,7 +341,10 @@ impl DragonballInner {
|
||||
|
||||
// cannot exceed maximum value
|
||||
if new_vcpus > self.config.cpu_info.default_maxvcpus {
|
||||
return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus"));
|
||||
warn!(
|
||||
sl!(),
|
||||
"Cannot allocate more vcpus than the max allowed number of vcpus. The maximum allowed amount of vcpus will be used instead.");
|
||||
return Ok((current_vcpus, self.config.cpu_info.default_maxvcpus));
|
||||
}
|
||||
|
||||
Ok((current_vcpus, new_vcpus))
|
||||
|
||||
@@ -105,7 +105,12 @@ impl InitialSizeManager {
|
||||
hv.cpu_info.default_vcpus = self.resource.vcpu as i32
|
||||
}
|
||||
if self.resource.mem_mb > 0 {
|
||||
hv.memory_info.default_memory = self.resource.mem_mb;
|
||||
// since the memory overhead introduced by kata-agent and system components
|
||||
// will really affect the amount of memory the user can use, so we choose to
|
||||
// plus the default_memory here, instead of overriding it.
|
||||
// (if we override the default_memory here, and user apllications still
|
||||
// use memory as they orignally expected, it would be easy to OOM.)
|
||||
hv.memory_info.default_memory += self.resource.mem_mb;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ import (
|
||||
"github.com/prometheus/procfs"
|
||||
"github.com/urfave/cli"
|
||||
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
|
||||
@@ -113,8 +113,8 @@ type HypervisorInfo struct {
|
||||
SocketPath string
|
||||
Msize9p uint32
|
||||
MemorySlots uint32
|
||||
PCIeRootPort uint32
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
HotplugVFIOOnRootBus bool
|
||||
Debug bool
|
||||
}
|
||||
@@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
|
||||
EntropySource: config.HypervisorConfig.EntropySource,
|
||||
SharedFS: config.HypervisorConfig.SharedFS,
|
||||
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
|
||||
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
|
||||
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
|
||||
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
|
||||
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
|
||||
SocketPath: socketPath,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -19,12 +19,12 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/BurntSushi/toml"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/urfave/cli"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
@@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) {
|
||||
var hotPlugVFIO config.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
const logPath = "/log/path"
|
||||
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
|
||||
kernelPath := filepath.Join(prefixDir, "kernel")
|
||||
@@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
|
||||
blockStorageDriver := "virtio-scsi"
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
coldPlugVFIO = hv.NoPort
|
||||
hotPlugVFIO = config.BridgePort
|
||||
coldPlugVFIO = config.NoPort
|
||||
disableNewNetNs := false
|
||||
sharedFS := "virtio-9p"
|
||||
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
|
||||
@@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
|
||||
BlockDeviceDriver: blockStorageDriver,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
DisableNewNetNs: disableNewNetNs,
|
||||
DefaultVCPUCount: hypConfig.NumVCPUs,
|
||||
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
|
||||
@@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
|
||||
return "", oci.RuntimeConfig{}, err
|
||||
}
|
||||
|
||||
_, config, err = katautils.LoadConfiguration(configFile, true)
|
||||
_, ociConfig, err = katautils.LoadConfiguration(configFile, true)
|
||||
if err != nil {
|
||||
return "", oci.RuntimeConfig{}, err
|
||||
}
|
||||
|
||||
return configFile, config, nil
|
||||
return configFile, ociConfig, nil
|
||||
}
|
||||
|
||||
func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) {
|
||||
@@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
|
||||
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
|
||||
|
||||
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
|
||||
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
|
||||
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
|
||||
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
|
||||
}
|
||||
|
||||
|
||||
@@ -131,6 +131,11 @@ default_maxmemory = @DEFMAXMEMSZ@
|
||||
# Shared file system type:
|
||||
# - virtio-fs (default)
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
|
||||
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
|
||||
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
|
||||
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
|
||||
shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
|
||||
@@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
|
||||
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
|
||||
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
|
||||
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
|
||||
@@ -186,6 +186,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
|
||||
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
|
||||
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
|
||||
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_SEV_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
@@ -669,4 +674,4 @@ service_offload = @DEFSERVICEOFFLOAD@
|
||||
#
|
||||
# Keys can be remotely provisioned. The Kata agent fetches them from e.g.
|
||||
# a HTTPS URL:
|
||||
#provision=https://my-key-broker.foo/tenant/<tenant-id>
|
||||
#provision=https://my-key-broker.foo/tenant/<tenant-id>
|
||||
|
||||
@@ -184,6 +184,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
|
||||
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
|
||||
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
|
||||
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_SNP_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
|
||||
@@ -172,6 +172,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
|
||||
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
|
||||
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
|
||||
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
|
||||
@@ -206,6 +206,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
|
||||
# - virtio-fs (default)
|
||||
# - virtio-9p
|
||||
# - virtio-fs-nydus
|
||||
# - none
|
||||
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
|
||||
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
|
||||
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
|
||||
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
|
||||
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
|
||||
|
||||
# Path to vhost-user-fs daemon.
|
||||
@@ -380,8 +385,15 @@ pflashes = []
|
||||
# Default false
|
||||
#hotplug_vfio_on_root_bus = true
|
||||
|
||||
# Enable hot-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port"
|
||||
#hot_plug_vfio = "root-port"
|
||||
|
||||
# In a confidential compute environment hot-plugging can compromise
|
||||
# security. Enable cold-plugging of VFIO devices to a root-port.
|
||||
# security.
|
||||
# Enable cold-plugging of VFIO devices to a bridge-port,
|
||||
# root-port or switch-port.
|
||||
# The default setting is "no-port", which means disabled.
|
||||
#cold_plug_vfio = "root-port"
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import (
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
|
||||
@@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) {
|
||||
assert.Error(err)
|
||||
}
|
||||
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) {
|
||||
var hotPlugVFIO config.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
if dir == "" {
|
||||
return "", fmt.Errorf("BUG: need directory")
|
||||
}
|
||||
@@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
|
||||
blockDeviceDriver := "virtio-scsi"
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
disableNewNetNs := false
|
||||
sharedFS := "virtio-9p"
|
||||
virtioFSdaemon := path.Join(dir, "virtiofsd")
|
||||
coldPlugVFIO = hv.RootPort
|
||||
hotPlugVFIO = config.BridgePort
|
||||
coldPlugVFIO = config.RootPort
|
||||
|
||||
configFileOptions := ktu.RuntimeConfigOptions{
|
||||
Hypervisor: "qemu",
|
||||
@@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
|
||||
BlockDeviceDriver: blockDeviceDriver,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
DisableNewNetNs: disableNewNetNs,
|
||||
SharedFS: sharedFS,
|
||||
VirtioFSDaemon: virtioFSdaemon,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,17 @@ const (
|
||||
|
||||
// VirtioFSNydus means use nydus for the shared file system
|
||||
VirtioFSNydus = "virtio-fs-nydus"
|
||||
|
||||
// NoSharedFS means *no* shared file system solution will be used
|
||||
// and files will be copied into the guest system.
|
||||
//
|
||||
// WARNING: This should be carefully used, and only used in very few
|
||||
// specific cases, as any update to the mount will *NOT* be reflected
|
||||
// during the lifecycle of the pod, causing issues with rotation of
|
||||
// secrets, certs, or configurations via kubernetes objects like
|
||||
// configMaps or secrets, as those will be copied into the guest at
|
||||
// *pod* *creation* *time*.
|
||||
NoSharedFS = "none"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -114,14 +125,117 @@ const (
|
||||
// SysDevPrefix is static string of /sys/dev
|
||||
var SysDevPrefix = "/sys/dev"
|
||||
|
||||
// SysIOMMUPath is static string of /sys/kernel/iommu_groups
|
||||
var SysIOMMUPath = "/sys/kernel/iommu_groups"
|
||||
// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
|
||||
var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
|
||||
|
||||
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
|
||||
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
|
||||
|
||||
var getSysDevPath = getSysDevPathImpl
|
||||
|
||||
// PCIePortBusPrefix gives us the correct bus nameing dependeing on the port
|
||||
// used to hot(cold)-plug the device
|
||||
type PCIePortBusPrefix string
|
||||
|
||||
const (
|
||||
PCIeRootPortPrefix PCIePortBusPrefix = "rp"
|
||||
PCIeSwitchPortPrefix PCIePortBusPrefix = "sw"
|
||||
PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup"
|
||||
PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
|
||||
PCIBridgePortPrefix PCIePortBusPrefix = "bp"
|
||||
)
|
||||
|
||||
func (p PCIePortBusPrefix) String() string {
|
||||
switch p {
|
||||
case PCIeRootPortPrefix:
|
||||
fallthrough
|
||||
case PCIeSwitchPortPrefix:
|
||||
fallthrough
|
||||
case PCIeSwitchUpstreamPortPrefix:
|
||||
fallthrough
|
||||
case PCIeSwitchhDownstreamPortPrefix:
|
||||
fallthrough
|
||||
case PCIBridgePortPrefix:
|
||||
return string(p)
|
||||
}
|
||||
return fmt.Sprintf("<unknown PCIePortBusPrefix: %s>", string(p))
|
||||
}
|
||||
|
||||
// PCIePort distinguish only between root and switch port
|
||||
type PCIePort string
|
||||
|
||||
const (
|
||||
// RootPort attach VFIO devices to a root-port
|
||||
RootPort PCIePort = "root-port"
|
||||
// SwitchPort attach VFIO devices to a switch-port
|
||||
SwitchPort = "switch-port"
|
||||
// BridgePort is the default
|
||||
BridgePort = "bridge-port"
|
||||
// NoPort is for disabling VFIO hotplug/coldplug
|
||||
NoPort = "no-port"
|
||||
// InvalidPort is for invalid port
|
||||
InvalidPort = "invalid-port"
|
||||
)
|
||||
|
||||
func (p PCIePort) String() string {
|
||||
switch p {
|
||||
case RootPort:
|
||||
fallthrough
|
||||
case SwitchPort:
|
||||
fallthrough
|
||||
case BridgePort:
|
||||
fallthrough
|
||||
case NoPort:
|
||||
fallthrough
|
||||
case InvalidPort:
|
||||
return string(p)
|
||||
}
|
||||
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
|
||||
}
|
||||
|
||||
var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
|
||||
RootPort: PCIeRootPortPrefix,
|
||||
SwitchPort: PCIeSwitchhDownstreamPortPrefix,
|
||||
BridgePort: PCIBridgePortPrefix,
|
||||
}
|
||||
|
||||
func (p PCIePort) Invalid() bool {
|
||||
switch p {
|
||||
case RootPort:
|
||||
fallthrough
|
||||
case SwitchPort:
|
||||
fallthrough
|
||||
case BridgePort:
|
||||
fallthrough
|
||||
case NoPort:
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (p PCIePort) Valid() bool {
|
||||
switch p {
|
||||
case RootPort:
|
||||
fallthrough
|
||||
case SwitchPort:
|
||||
fallthrough
|
||||
case BridgePort:
|
||||
fallthrough
|
||||
case NoPort:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type PCIePortMapping map[string]bool
|
||||
|
||||
var (
|
||||
// Each of this structures keeps track of the devices attached to the
|
||||
// different types of PCI ports. We can deduces the Bus number from it
|
||||
// and eliminate duplicates being assigned.
|
||||
PCIeDevices = map[PCIePort]PCIePortMapping{}
|
||||
)
|
||||
|
||||
// DeviceInfo is an embedded type that contains device data common to all types of devices.
|
||||
type DeviceInfo struct {
|
||||
// DriverOptions is specific options for each device driver
|
||||
@@ -167,6 +281,9 @@ type DeviceInfo struct {
|
||||
// ColdPlug specifies whether the device must be cold plugged (true)
|
||||
// or hot plugged (false).
|
||||
ColdPlug bool
|
||||
|
||||
// Specifies the PCIe port type to which the device is attached
|
||||
Port PCIePort
|
||||
}
|
||||
|
||||
// BlockDrive represents a block storage drive which may be used in case the storage
|
||||
@@ -268,14 +385,8 @@ const (
|
||||
VFIOAPDeviceMediatedType
|
||||
)
|
||||
|
||||
type VFIODev interface {
|
||||
GetID() *string
|
||||
GetType() VFIODeviceType
|
||||
GetSysfsDev() *string
|
||||
}
|
||||
|
||||
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
|
||||
type VFIOPCIDev struct {
|
||||
// VFIODev represents a VFIO PCI device used for hotplugging
|
||||
type VFIODev struct {
|
||||
// ID is used to identify this drive in the hypervisor options.
|
||||
ID string
|
||||
|
||||
@@ -305,44 +416,15 @@ type VFIOPCIDev struct {
|
||||
|
||||
// IsPCIe specifies device is PCIe or PCI
|
||||
IsPCIe bool
|
||||
}
|
||||
|
||||
func (d VFIOPCIDev) GetID() *string {
|
||||
return &d.ID
|
||||
}
|
||||
|
||||
func (d VFIOPCIDev) GetType() VFIODeviceType {
|
||||
return d.Type
|
||||
}
|
||||
|
||||
func (d VFIOPCIDev) GetSysfsDev() *string {
|
||||
return &d.SysfsDev
|
||||
}
|
||||
|
||||
type VFIOAPDev struct {
|
||||
// ID is used to identify this drive in the hypervisor options.
|
||||
ID string
|
||||
|
||||
// sysfsdev of VFIO mediated device
|
||||
SysfsDev string
|
||||
|
||||
// APDevices are the Adjunct Processor devices assigned to the mdev
|
||||
APDevices []string
|
||||
|
||||
// Type of VFIO device
|
||||
Type VFIODeviceType
|
||||
}
|
||||
// Rank identifies a device in a IOMMU group
|
||||
Rank int
|
||||
|
||||
func (d VFIOAPDev) GetID() *string {
|
||||
return &d.ID
|
||||
}
|
||||
|
||||
func (d VFIOAPDev) GetType() VFIODeviceType {
|
||||
return d.Type
|
||||
}
|
||||
|
||||
func (d VFIOAPDev) GetSysfsDev() *string {
|
||||
return &d.SysfsDev
|
||||
// Port is the PCIe port type to which the device is attached
|
||||
Port PCIePort
|
||||
}
|
||||
|
||||
// RNGDev represents a random number generator device
|
||||
|
||||
@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
|
||||
return api.DeviceLogger()
|
||||
}
|
||||
|
||||
// Identify PCIe device by reading the size of the PCI config space
|
||||
// IsPCIeDevice identifies PCIe device by reading the size of the PCI config space
|
||||
// Plain PCI device have 256 bytes of config space where PCIe devices have 4K
|
||||
func isPCIeDevice(bdf string) bool {
|
||||
func IsPCIeDevice(bdf string) bool {
|
||||
if len(strings.Split(bdf, ":")) == 2 {
|
||||
bdf = PCIDomain + ":" + bdf
|
||||
}
|
||||
@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
|
||||
|
||||
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
|
||||
// We can reuse this function at various levels, sandbox, container.
|
||||
// Only the VFIO module is allowed to do bus assignments, all other modules need to
|
||||
// ignore it if used as helper function to get VFIO information.
|
||||
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
|
||||
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
|
||||
|
||||
vfioDevs := []*config.VFIODev{}
|
||||
|
||||
vfioGroup := filepath.Base(device.HostPath)
|
||||
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
|
||||
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
|
||||
|
||||
deviceFiles, err := os.ReadDir(iommuDevicesPath)
|
||||
if err != nil {
|
||||
@@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
|
||||
// Pass all devices in iommu group
|
||||
for i, deviceFile := range deviceFiles {
|
||||
//Get bdf of device eg 0000:00:1c.0
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
|
||||
|
||||
switch vfioDeviceType {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
isPCIe := isPCIeDevice(deviceBDF)
|
||||
// Do not directly assign to `vfio` -- need to access field still
|
||||
vfioPCI := config.VFIOPCIDev{
|
||||
vfio = config.VFIODev{
|
||||
ID: id,
|
||||
Type: vfioDeviceType,
|
||||
BDF: deviceBDF,
|
||||
SysfsDev: deviceSysfsDev,
|
||||
IsPCIe: isPCIe,
|
||||
IsPCIe: IsPCIeDevice(deviceBDF),
|
||||
Class: pciClass,
|
||||
Rank: -1,
|
||||
Port: device.Port,
|
||||
}
|
||||
if isPCIe && !ignoreBusAssignment {
|
||||
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
|
||||
AllPCIeDevs[deviceBDF] = true
|
||||
}
|
||||
vfio = vfioPCI
|
||||
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
devices, err := GetAPVFIODevices(deviceSysfsDev)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
vfio = config.VFIOAPDev{
|
||||
vfio = config.VFIODev{
|
||||
ID: id,
|
||||
SysfsDev: deviceSysfsDev,
|
||||
Type: config.VFIOAPDeviceMediatedType,
|
||||
|
||||
@@ -28,14 +28,9 @@ const (
|
||||
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
|
||||
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
|
||||
vfioDevPath = "/dev/vfio/%s"
|
||||
pcieRootPortPrefix = "rp"
|
||||
vfioAPSysfsDir = "/sys/devices/vfio_ap"
|
||||
)
|
||||
|
||||
var (
|
||||
AllPCIeDevs = map[string]bool{}
|
||||
)
|
||||
|
||||
// VFIODevice is a vfio device meant to be passed to the hypervisor
|
||||
// to be used by the Virtual Machine.
|
||||
type VFIODevice struct {
|
||||
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
|
||||
}
|
||||
}()
|
||||
|
||||
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
|
||||
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, vfio := range device.VfioDevs {
|
||||
if vfio.IsPCIe {
|
||||
busIndex := len(config.PCIeDevices[vfio.Port])
|
||||
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
|
||||
config.PCIeDevices[vfio.Port][vfio.BDF] = true
|
||||
}
|
||||
}
|
||||
|
||||
coldPlug := device.DeviceInfo.ColdPlug
|
||||
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
|
||||
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
|
||||
for _, dev := range ds.VFIODevs {
|
||||
var vfio config.VFIODev
|
||||
|
||||
vfioDeviceType := (*device.VfioDevs[0]).GetType()
|
||||
switch vfioDeviceType {
|
||||
switch dev.Type {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
bdf := ""
|
||||
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
|
||||
bdf = pciDev.BDF
|
||||
}
|
||||
vfio = config.VFIOPCIDev{
|
||||
ID: *(*dev).GetID(),
|
||||
Type: config.VFIODeviceType((*dev).GetType()),
|
||||
BDF: bdf,
|
||||
SysfsDev: *(*dev).GetSysfsDev(),
|
||||
vfio = config.VFIODev{
|
||||
ID: dev.ID,
|
||||
Type: config.VFIODeviceType(dev.Type),
|
||||
BDF: dev.BDF,
|
||||
SysfsDev: dev.SysfsDev,
|
||||
}
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
vfio = config.VFIOAPDev{
|
||||
ID: *(*dev).GetID(),
|
||||
SysfsDev: *(*dev).GetSysfsDev(),
|
||||
vfio = config.VFIODev{
|
||||
ID: dev.ID,
|
||||
SysfsDev: dev.SysfsDev,
|
||||
}
|
||||
default:
|
||||
deviceLogger().WithError(
|
||||
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
|
||||
|
||||
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
|
||||
// here it shares function from *GenericDevice so we don't need duplicate codes
|
||||
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
|
||||
func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
|
||||
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
|
||||
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
|
||||
if err != nil {
|
||||
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
|
||||
switch vfioDeviceType {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
// Get bdf of device eg. 0000:00:1c.0
|
||||
deviceBDF = getBDF(deviceFileName)
|
||||
// OLD IMPL: deviceBDF = getBDF(deviceFileName)
|
||||
// The old implementation did not consider the case where
|
||||
// vfio devices are located on different root busses. The
|
||||
// kata-agent will handle the case now, here, use the full PCI addr
|
||||
deviceBDF = deviceFileName
|
||||
// Get sysfs path used by cloud-hypervisor
|
||||
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
|
||||
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
|
||||
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
|
||||
deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
|
||||
deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
|
||||
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
|
||||
@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {
|
||||
|
||||
// getBDF returns the BDF of pci device
|
||||
// Expected input string format is [<domain>]:[<bus>][<slot>].[<func>] eg. 0000:02:10.0
|
||||
func getBDF(deviceSysStr string) string {
|
||||
func GetBDF(deviceSysStr string) string {
|
||||
tokens := strings.SplitN(deviceSysStr, ":", 2)
|
||||
if len(tokens) == 1 {
|
||||
return ""
|
||||
|
||||
@@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) {
|
||||
}
|
||||
|
||||
data := []testData{
|
||||
{"0000:02:10.0", "02:10.0"},
|
||||
{"0000:02:10.0", "0000:02:10.0"},
|
||||
{"0000:0210.0", ""},
|
||||
{"f79944e4-5a3d-11e8-99ce-", ""},
|
||||
{"f79944e4-5a3d-11e8-99ce", ""},
|
||||
@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, d := range data {
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
|
||||
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
|
||||
|
||||
switch vfioDeviceType {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
|
||||
@@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
|
||||
dm.blockDriver = config.VirtioSCSI
|
||||
}
|
||||
|
||||
drivers.AllPCIeDevs = make(map[string]bool)
|
||||
config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
|
||||
|
||||
config.PCIeDevices[config.RootPort] = make(map[string]bool)
|
||||
config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
|
||||
config.PCIeDevices[config.BridgePort] = make(map[string]bool)
|
||||
|
||||
for _, dev := range devices {
|
||||
dm.devices[dev.DeviceID()] = dev
|
||||
@@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
|
||||
}
|
||||
if IsVFIO(devInfo.HostPath) {
|
||||
return drivers.NewVFIODevice(&devInfo), nil
|
||||
} else if isVhostUserBlk(devInfo) {
|
||||
} else if IsVhostUserBlk(devInfo) {
|
||||
if devInfo.DriverOptions == nil {
|
||||
devInfo.DriverOptions = make(map[string]string)
|
||||
}
|
||||
|
||||
@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
|
||||
_, err = os.Create(deviceConfigFile)
|
||||
assert.Nil(t, err)
|
||||
|
||||
savedIOMMUPath := config.SysIOMMUPath
|
||||
config.SysIOMMUPath = tmpDir
|
||||
savedIOMMUPath := config.SysIOMMUGroupPath
|
||||
config.SysIOMMUGroupPath = tmpDir
|
||||
|
||||
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
|
||||
config.SysBusPciDevicesPath = devicesDir
|
||||
|
||||
defer func() {
|
||||
config.SysIOMMUPath = savedIOMMUPath
|
||||
config.SysIOMMUGroupPath = savedIOMMUPath
|
||||
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
|
||||
}()
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
|
||||
}
|
||||
|
||||
// isVhostUserBlk checks if the device is a VhostUserBlk device.
|
||||
func isVhostUserBlk(devInfo config.DeviceInfo) bool {
|
||||
func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
|
||||
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
|
||||
}
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, d := range data {
|
||||
isVhostUserBlk := isVhostUserBlk(
|
||||
isVhostUserBlk := IsVhostUserBlk(
|
||||
config.DeviceInfo{
|
||||
DevType: d.devType,
|
||||
Major: d.major,
|
||||
|
||||
@@ -123,6 +123,14 @@ const (
|
||||
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
|
||||
PCIeRootPort DeviceDriver = "pcie-root-port"
|
||||
|
||||
// PCIeSwitchUpstreamPort is a PCIe switch upstream port
|
||||
// A upstream port connects to a PCIe Root Port
|
||||
PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
|
||||
|
||||
// PCIeSwitchDownstreamPort is a PCIe switch downstream port
|
||||
// PCIe devices can be hot-plugged to the downstream port.
|
||||
PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
|
||||
|
||||
// Loader is the Loader device driver.
|
||||
Loader DeviceDriver = "loader"
|
||||
|
||||
@@ -236,6 +244,7 @@ const (
|
||||
|
||||
// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
|
||||
SecExecGuest ObjectType = "s390-pv-guest"
|
||||
|
||||
// PEFGuest represent ppc64le PEF(Protected Execution Facility) object.
|
||||
PEFGuest ObjectType = "pef-guest"
|
||||
)
|
||||
@@ -410,7 +419,6 @@ func (object Object) QemuParams(config *Config) []string {
|
||||
deviceParams = append(deviceParams, string(object.Driver))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
|
||||
|
||||
}
|
||||
|
||||
if len(deviceParams) > 0 {
|
||||
@@ -1722,6 +1730,106 @@ func (b PCIeRootPortDevice) Valid() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
|
||||
type PCIeSwitchUpstreamPortDevice struct {
|
||||
ID string // format: sup{n}, n>=0
|
||||
Bus string // default is rp0
|
||||
}
|
||||
|
||||
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
|
||||
func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
|
||||
var qemuParams []string
|
||||
var deviceParams []string
|
||||
|
||||
driver := PCIeSwitchUpstreamPort
|
||||
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
|
||||
return qemuParams
|
||||
}
|
||||
|
||||
// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
|
||||
func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
|
||||
if b.ID == "" {
|
||||
return false
|
||||
}
|
||||
if b.Bus == "" {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// PCIeSwitchDownstreamPortDevice is the port connecting to the root port
|
||||
type PCIeSwitchDownstreamPortDevice struct {
|
||||
ID string // format: sup{n}, n>=0
|
||||
Bus string // default is rp0
|
||||
Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
|
||||
Slot string // >=0, default is 0x00
|
||||
// This to work needs patches to QEMU
|
||||
BusReserve string
|
||||
// Pref64 and Pref32 are not allowed to be set simultaneously
|
||||
Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
|
||||
Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
|
||||
MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only*
|
||||
IOReserve string // IO reservation
|
||||
|
||||
}
|
||||
|
||||
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
|
||||
func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
|
||||
var qemuParams []string
|
||||
var deviceParams []string
|
||||
driver := PCIeSwitchDownstreamPort
|
||||
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
|
||||
if b.BusReserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
|
||||
}
|
||||
|
||||
if b.Pref64Reserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
|
||||
}
|
||||
|
||||
if b.Pref32Reserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
|
||||
}
|
||||
|
||||
if b.MemReserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
|
||||
}
|
||||
|
||||
if b.IOReserve != "" {
|
||||
deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
|
||||
}
|
||||
|
||||
qemuParams = append(qemuParams, "-device")
|
||||
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
|
||||
return qemuParams
|
||||
}
|
||||
|
||||
// Valid returns true if the PCIeSwitchUpstremPortDevice structure is valid and complete.
|
||||
func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
|
||||
if b.ID == "" {
|
||||
return false
|
||||
}
|
||||
if b.Bus == "" {
|
||||
return false
|
||||
}
|
||||
if b.Chassis == "" {
|
||||
return false
|
||||
}
|
||||
if b.Slot == "" {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
|
||||
type VFIODevice struct {
|
||||
// Bus-Device-Function of device
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
package hypervisors
|
||||
|
||||
import "fmt"
|
||||
import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
|
||||
// Bridge is a bridge where devices can be hot plugged
|
||||
type Bridge struct {
|
||||
@@ -28,37 +28,8 @@ type CPUDevice struct {
|
||||
ID string
|
||||
}
|
||||
|
||||
// PCIePort distinguish only between root and switch port
|
||||
type PCIePort string
|
||||
|
||||
const (
|
||||
// RootPort attach VFIO devices to a root-port
|
||||
RootPort PCIePort = "root-port"
|
||||
// SwitchPort attach VFIO devices to a switch-port
|
||||
SwitchPort = "switch-port"
|
||||
// BridgePort is the default
|
||||
BridgePort = "bridge-port"
|
||||
// NoPort is for disabling VFIO hotplug/coldplug
|
||||
NoPort = "no-port"
|
||||
)
|
||||
|
||||
func (p PCIePort) String() string {
|
||||
switch p {
|
||||
case RootPort:
|
||||
return "root-port"
|
||||
case SwitchPort:
|
||||
return "switch-port"
|
||||
case BridgePort:
|
||||
return "bridge-port"
|
||||
case NoPort:
|
||||
return "no-port"
|
||||
}
|
||||
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
|
||||
}
|
||||
|
||||
type HypervisorState struct {
|
||||
BlockIndexMap map[int]struct{}
|
||||
|
||||
// Type of hypervisor, E.g. qemu/firecracker/acrn.
|
||||
Type string
|
||||
UUID string
|
||||
@@ -74,7 +45,7 @@ type HypervisorState struct {
|
||||
HotpluggedMemory int
|
||||
VirtiofsDaemonPid int
|
||||
Pid int
|
||||
PCIeRootPort int
|
||||
ColdPlugVFIO PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
HotplugVFIOOnRootBus bool
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ import (
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
|
||||
JaegerUser string
|
||||
JaegerPassword string
|
||||
PFlash []string
|
||||
PCIeRootPort uint32
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
DefaultVCPUCount uint32
|
||||
DefaultMaxVCPUCount uint32
|
||||
DefaultMemSize uint32
|
||||
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
|
||||
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
|
||||
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
|
||||
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
|
||||
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
|
||||
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
|
||||
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
|
||||
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
package katautils
|
||||
|
||||
import (
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
)
|
||||
|
||||
// name is the name of the runtime
|
||||
@@ -82,7 +82,6 @@ const defaultEnableDebug bool = false
|
||||
const defaultDisableNestingChecks bool = false
|
||||
const defaultMsize9p uint32 = 8192
|
||||
const defaultHotplugVFIOOnRootBus bool = false
|
||||
const defaultPCIeRootPort = 0
|
||||
const defaultEntropySource = "/dev/urandom"
|
||||
const defaultGuestHookPath string = ""
|
||||
const defaultVirtioFSCacheMode = "never"
|
||||
@@ -115,4 +114,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
|
||||
// Default config file used by stateless systems.
|
||||
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
|
||||
|
||||
const defaultColdPlugVFIO = hv.NoPort
|
||||
const defaultHotPlugVFIO = config.NoPort
|
||||
const defaultColdPlugVFIO = config.NoPort
|
||||
|
||||
@@ -20,7 +20,6 @@ import (
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
|
||||
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
@@ -79,98 +78,98 @@ type factory struct {
|
||||
}
|
||||
|
||||
type hypervisor struct {
|
||||
Path string `toml:"path"`
|
||||
JailerPath string `toml:"jailer_path"`
|
||||
Kernel string `toml:"kernel"`
|
||||
CtlPath string `toml:"ctlpath"`
|
||||
Initrd string `toml:"initrd"`
|
||||
Image string `toml:"image"`
|
||||
RootfsType string `toml:"rootfs_type"`
|
||||
Firmware string `toml:"firmware"`
|
||||
FirmwareVolume string `toml:"firmware_volume"`
|
||||
MachineAccelerators string `toml:"machine_accelerators"`
|
||||
CPUFeatures string `toml:"cpu_features"`
|
||||
KernelParams string `toml:"kernel_params"`
|
||||
MachineType string `toml:"machine_type"`
|
||||
BlockDeviceDriver string `toml:"block_device_driver"`
|
||||
EntropySource string `toml:"entropy_source"`
|
||||
SharedFS string `toml:"shared_fs"`
|
||||
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
|
||||
VirtioFSCache string `toml:"virtio_fs_cache"`
|
||||
VhostUserStorePath string `toml:"vhost_user_store_path"`
|
||||
FileBackedMemRootDir string `toml:"file_mem_backend"`
|
||||
GuestHookPath string `toml:"guest_hook_path"`
|
||||
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
|
||||
SeccompSandbox string `toml:"seccompsandbox"`
|
||||
GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
|
||||
GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
|
||||
GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
|
||||
SEVCertChainPath string `toml:"sev_cert_chain"`
|
||||
BlockDeviceAIO string `toml:"block_device_aio"`
|
||||
RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
|
||||
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
|
||||
JailerPathList []string `toml:"valid_jailer_paths"`
|
||||
CtlPathList []string `toml:"valid_ctlpaths"`
|
||||
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
|
||||
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
|
||||
PFlashList []string `toml:"pflashes"`
|
||||
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
|
||||
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
|
||||
EntropySourceList []string `toml:"valid_entropy_sources"`
|
||||
EnableAnnotations []string `toml:"enable_annotations"`
|
||||
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
|
||||
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
|
||||
MemOffset uint64 `toml:"memory_offset"`
|
||||
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
|
||||
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
|
||||
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
|
||||
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
|
||||
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
|
||||
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
|
||||
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
|
||||
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
|
||||
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
|
||||
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
|
||||
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
|
||||
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
|
||||
MemorySize uint32 `toml:"default_memory"`
|
||||
MemSlots uint32 `toml:"memory_slots"`
|
||||
DefaultBridges uint32 `toml:"default_bridges"`
|
||||
Msize9p uint32 `toml:"msize_9p"`
|
||||
PCIeRootPort uint32 `toml:"pcie_root_port"`
|
||||
GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
|
||||
SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
|
||||
SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
|
||||
RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
|
||||
NumVCPUs int32 `toml:"default_vcpus"`
|
||||
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
|
||||
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
|
||||
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
|
||||
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
|
||||
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
|
||||
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
|
||||
MemPrealloc bool `toml:"enable_mem_prealloc"`
|
||||
HugePages bool `toml:"enable_hugepages"`
|
||||
VirtioMem bool `toml:"enable_virtio_mem"`
|
||||
IOMMU bool `toml:"enable_iommu"`
|
||||
IOMMUPlatform bool `toml:"enable_iommu_platform"`
|
||||
Debug bool `toml:"enable_debug"`
|
||||
DisableNestingChecks bool `toml:"disable_nesting_checks"`
|
||||
EnableIOThreads bool `toml:"enable_iothreads"`
|
||||
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
|
||||
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
|
||||
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
|
||||
DisableVhostNet bool `toml:"disable_vhost_net"`
|
||||
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
|
||||
ConfidentialGuest bool `toml:"confidential_guest"`
|
||||
SevSnpGuest bool `toml:"sev_snp_guest"`
|
||||
GuestSwap bool `toml:"enable_guest_swap"`
|
||||
Rootless bool `toml:"rootless"`
|
||||
DisableSeccomp bool `toml:"disable_seccomp"`
|
||||
DisableSeLinux bool `toml:"disable_selinux"`
|
||||
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
|
||||
LegacySerial bool `toml:"use_legacy_serial"`
|
||||
GuestPreAttestation bool `toml:"guest_pre_attestation"`
|
||||
Path string `toml:"path"`
|
||||
JailerPath string `toml:"jailer_path"`
|
||||
Kernel string `toml:"kernel"`
|
||||
CtlPath string `toml:"ctlpath"`
|
||||
Initrd string `toml:"initrd"`
|
||||
Image string `toml:"image"`
|
||||
RootfsType string `toml:"rootfs_type"`
|
||||
Firmware string `toml:"firmware"`
|
||||
FirmwareVolume string `toml:"firmware_volume"`
|
||||
MachineAccelerators string `toml:"machine_accelerators"`
|
||||
CPUFeatures string `toml:"cpu_features"`
|
||||
KernelParams string `toml:"kernel_params"`
|
||||
MachineType string `toml:"machine_type"`
|
||||
BlockDeviceDriver string `toml:"block_device_driver"`
|
||||
EntropySource string `toml:"entropy_source"`
|
||||
SharedFS string `toml:"shared_fs"`
|
||||
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
|
||||
VirtioFSCache string `toml:"virtio_fs_cache"`
|
||||
VhostUserStorePath string `toml:"vhost_user_store_path"`
|
||||
FileBackedMemRootDir string `toml:"file_mem_backend"`
|
||||
GuestHookPath string `toml:"guest_hook_path"`
|
||||
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
|
||||
SeccompSandbox string `toml:"seccompsandbox"`
|
||||
GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
|
||||
GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
|
||||
GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
|
||||
SEVCertChainPath string `toml:"sev_cert_chain"`
|
||||
BlockDeviceAIO string `toml:"block_device_aio"`
|
||||
RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
|
||||
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
|
||||
JailerPathList []string `toml:"valid_jailer_paths"`
|
||||
CtlPathList []string `toml:"valid_ctlpaths"`
|
||||
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
|
||||
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
|
||||
PFlashList []string `toml:"pflashes"`
|
||||
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
|
||||
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
|
||||
EntropySourceList []string `toml:"valid_entropy_sources"`
|
||||
EnableAnnotations []string `toml:"enable_annotations"`
|
||||
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
|
||||
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
|
||||
MemOffset uint64 `toml:"memory_offset"`
|
||||
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
|
||||
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
|
||||
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
|
||||
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
|
||||
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
|
||||
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
|
||||
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
|
||||
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
|
||||
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
|
||||
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
|
||||
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
|
||||
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
|
||||
MemorySize uint32 `toml:"default_memory"`
|
||||
MemSlots uint32 `toml:"memory_slots"`
|
||||
DefaultBridges uint32 `toml:"default_bridges"`
|
||||
Msize9p uint32 `toml:"msize_9p"`
|
||||
GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
|
||||
SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
|
||||
SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
|
||||
RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
|
||||
NumVCPUs int32 `toml:"default_vcpus"`
|
||||
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
|
||||
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
|
||||
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
|
||||
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
|
||||
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
|
||||
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
|
||||
MemPrealloc bool `toml:"enable_mem_prealloc"`
|
||||
HugePages bool `toml:"enable_hugepages"`
|
||||
VirtioMem bool `toml:"enable_virtio_mem"`
|
||||
IOMMU bool `toml:"enable_iommu"`
|
||||
IOMMUPlatform bool `toml:"enable_iommu_platform"`
|
||||
Debug bool `toml:"enable_debug"`
|
||||
DisableNestingChecks bool `toml:"disable_nesting_checks"`
|
||||
EnableIOThreads bool `toml:"enable_iothreads"`
|
||||
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
|
||||
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
|
||||
HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
|
||||
ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
|
||||
DisableVhostNet bool `toml:"disable_vhost_net"`
|
||||
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
|
||||
ConfidentialGuest bool `toml:"confidential_guest"`
|
||||
SevSnpGuest bool `toml:"sev_snp_guest"`
|
||||
GuestSwap bool `toml:"enable_guest_swap"`
|
||||
Rootless bool `toml:"rootless"`
|
||||
DisableSeccomp bool `toml:"disable_seccomp"`
|
||||
DisableSeLinux bool `toml:"disable_selinux"`
|
||||
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
|
||||
LegacySerial bool `toml:"use_legacy_serial"`
|
||||
GuestPreAttestation bool `toml:"guest_pre_attestation"`
|
||||
}
|
||||
|
||||
type runtime struct {
|
||||
@@ -298,12 +297,18 @@ func (h hypervisor) firmware() (string, error) {
|
||||
return ResolvePath(p)
|
||||
}
|
||||
|
||||
func (h hypervisor) coldPlugVFIO() hv.PCIePort {
|
||||
func (h hypervisor) coldPlugVFIO() config.PCIePort {
|
||||
if h.ColdPlugVFIO == "" {
|
||||
return defaultColdPlugVFIO
|
||||
}
|
||||
return h.ColdPlugVFIO
|
||||
}
|
||||
func (h hypervisor) hotPlugVFIO() config.PCIePort {
|
||||
if h.HotPlugVFIO == "" {
|
||||
return defaultHotPlugVFIO
|
||||
}
|
||||
return h.HotPlugVFIO
|
||||
}
|
||||
|
||||
func (h hypervisor) firmwareVolume() (string, error) {
|
||||
p := h.FirmwareVolume
|
||||
@@ -523,7 +528,7 @@ func (h hypervisor) blockDeviceAIO() (string, error) {
|
||||
}
|
||||
|
||||
func (h hypervisor) sharedFS() (string, error) {
|
||||
supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus}
|
||||
supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS}
|
||||
|
||||
if h.SharedFS == "" {
|
||||
return config.VirtioFS, nil
|
||||
@@ -838,6 +843,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
KernelPath: kernel,
|
||||
InitrdPath: initrd,
|
||||
ImagePath: image,
|
||||
RootfsType: rootfsType,
|
||||
FirmwarePath: firmware,
|
||||
FirmwareVolumePath: firmwareVolume,
|
||||
PFlash: pflashes,
|
||||
@@ -880,8 +886,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
Msize9p: h.msize9p(),
|
||||
DisableImageNvdimm: h.DisableImageNvdimm,
|
||||
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
|
||||
HotPlugVFIO: h.hotPlugVFIO(),
|
||||
ColdPlugVFIO: h.coldPlugVFIO(),
|
||||
PCIeRootPort: h.PCIeRootPort,
|
||||
DisableVhostNet: h.DisableVhostNet,
|
||||
EnableVhostUserStore: h.EnableVhostUserStore,
|
||||
VhostUserStorePath: h.vhostUserStorePath(),
|
||||
@@ -907,7 +913,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
SNPGuestPolicy: h.getSnpGuestPolicy(),
|
||||
SEVCertChainPath: h.SEVCertChainPath,
|
||||
DisableGuestSeLinux: h.DisableGuestSeLinux,
|
||||
RootfsType: rootfsType,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -1034,11 +1039,12 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
return vc.HypervisorConfig{}, err
|
||||
}
|
||||
|
||||
if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus {
|
||||
return vc.HypervisorConfig{}, errors.New("clh only support virtio-fs or virtio-fs-nydus")
|
||||
if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS {
|
||||
return vc.HypervisorConfig{},
|
||||
fmt.Errorf("Cloud Hypervisor does not support %s shared filesystem option", sharedFS)
|
||||
}
|
||||
|
||||
if h.VirtioFSDaemon == "" {
|
||||
if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" {
|
||||
return vc.HypervisorConfig{},
|
||||
fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
|
||||
}
|
||||
@@ -1084,7 +1090,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
|
||||
Msize9p: h.msize9p(),
|
||||
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
|
||||
ColdPlugVFIO: h.coldPlugVFIO(),
|
||||
PCIeRootPort: h.PCIeRootPort,
|
||||
HotPlugVFIO: h.hotPlugVFIO(),
|
||||
DisableVhostNet: true,
|
||||
GuestHookPath: h.guestHookPath(),
|
||||
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
|
||||
@@ -1302,6 +1308,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
|
||||
KernelPath: defaultKernelPath,
|
||||
ImagePath: defaultImagePath,
|
||||
InitrdPath: defaultInitrdPath,
|
||||
RootfsType: defaultRootfsType,
|
||||
FirmwarePath: defaultFirmwarePath,
|
||||
FirmwareVolumePath: defaultFirmwareVolumePath,
|
||||
MachineAccelerators: defaultMachineAccelerators,
|
||||
@@ -1330,9 +1337,10 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
|
||||
Msize9p: defaultMsize9p,
|
||||
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
|
||||
ColdPlugVFIO: defaultColdPlugVFIO,
|
||||
PCIeRootPort: defaultPCIeRootPort,
|
||||
HotPlugVFIO: defaultHotPlugVFIO,
|
||||
GuestHookPath: defaultGuestHookPath,
|
||||
VhostUserStorePath: defaultVhostUserStorePath,
|
||||
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
|
||||
VirtioFSCache: defaultVirtioFSCacheMode,
|
||||
DisableImageNvdimm: defaultDisableImageNvdimm,
|
||||
RxRateLimiterMaxRate: defaultRxRateLimiterMaxRate,
|
||||
@@ -1352,8 +1360,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
|
||||
SEVGuestPolicy: defaultSEVGuestPolicy,
|
||||
SNPGuestPolicy: defaultSNPGuestPolicy,
|
||||
SEVCertChainPath: defaultSEVCertChainPath,
|
||||
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
|
||||
RootfsType: defaultRootfsType,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1711,9 +1717,10 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
return err
|
||||
}
|
||||
|
||||
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
|
||||
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
|
||||
machineType := config.HypervisorConfig.HypervisorMachineType
|
||||
if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
|
||||
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1723,18 +1730,32 @@ func checkConfig(config oci.RuntimeConfig) error {
|
||||
// checkPCIeConfig ensures the PCIe configuration is valid.
|
||||
// Only allow one of the following settings for cold-plug:
|
||||
// no-port, root-port, switch-port
|
||||
func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
|
||||
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
|
||||
// Currently only QEMU q35 supports advanced PCIe topologies
|
||||
// firecracker, dragonball do not have right now any PCIe support
|
||||
if machineType != "q35" {
|
||||
return nil
|
||||
}
|
||||
if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
|
||||
|
||||
if coldPlug != config.NoPort && hotPlug != config.NoPort {
|
||||
return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug)
|
||||
}
|
||||
if coldPlug == config.NoPort && hotPlug == config.NoPort {
|
||||
return nil
|
||||
}
|
||||
var port config.PCIePort
|
||||
if coldPlug != config.NoPort {
|
||||
port = coldPlug
|
||||
}
|
||||
if hotPlug != config.NoPort {
|
||||
port = hotPlug
|
||||
}
|
||||
if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
|
||||
vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
|
||||
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
|
||||
coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
|
||||
}
|
||||
|
||||
// checkNetNsConfig performs sanity checks on disable_new_netns config.
|
||||
|
||||
@@ -18,8 +18,8 @@ import (
|
||||
"syscall"
|
||||
"testing"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
@@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error {
|
||||
|
||||
// createAllRuntimeConfigFiles creates all files necessary to call
|
||||
// loadConfiguration().
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) {
|
||||
func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) {
|
||||
if dir == "" {
|
||||
return config, fmt.Errorf("BUG: need directory")
|
||||
return testConfig, fmt.Errorf("BUG: need directory")
|
||||
}
|
||||
|
||||
if hypervisor == "" {
|
||||
return config, fmt.Errorf("BUG: need hypervisor")
|
||||
return testConfig, fmt.Errorf("BUG: need hypervisor")
|
||||
}
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
var hotPlugVFIO config.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
hypervisorPath := path.Join(dir, "hypervisor")
|
||||
kernelPath := path.Join(dir, "kernel")
|
||||
kernelParams := "foo=bar xyz"
|
||||
@@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
blockDeviceAIO := "io_uring"
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
coldPlugVFIO = hv.RootPort
|
||||
hotPlugVFIO = config.NoPort
|
||||
coldPlugVFIO = config.BridgePort
|
||||
disableNewNetNs := false
|
||||
sharedFS := "virtio-9p"
|
||||
virtioFSdaemon := path.Join(dir, "virtiofsd")
|
||||
@@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
BlockDeviceAIO: blockDeviceAIO,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
DisableNewNetNs: disableNewNetNs,
|
||||
DefaultVCPUCount: defaultVCPUCount,
|
||||
@@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
configPath := path.Join(dir, "runtime.toml")
|
||||
err = createConfig(configPath, runtimeConfigFileData)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
|
||||
configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml")
|
||||
@@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
// create a link to the config file
|
||||
err = syscall.Symlink(configPath, configPathLink)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
|
||||
files := []string{hypervisorPath, kernelPath, imagePath}
|
||||
@@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
// create the resource (which must be >0 bytes)
|
||||
err := WriteFile(file, "foo", testFileMode)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
DefaultBridges: defaultBridgesCount,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
HotPlugVFIO: hotPlugVFIO,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
Msize9p: defaultMsize9p,
|
||||
MemSlots: defaultMemSlots,
|
||||
@@ -217,10 +218,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
|
||||
err = SetKernelParams(&runtimeConfig)
|
||||
if err != nil {
|
||||
return config, err
|
||||
return testConfig, err
|
||||
}
|
||||
|
||||
config = testRuntimeConfig{
|
||||
rtimeConfig := testRuntimeConfig{
|
||||
RuntimeConfig: runtimeConfig,
|
||||
RuntimeConfigFile: configPath,
|
||||
ConfigPath: configPath,
|
||||
@@ -229,7 +230,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
|
||||
LogPath: logPath,
|
||||
}
|
||||
|
||||
return config, nil
|
||||
return rtimeConfig, nil
|
||||
}
|
||||
|
||||
// testLoadConfiguration accepts an optional function that can be used
|
||||
@@ -570,6 +571,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
|
||||
BlockDeviceAIO: defaultBlockDeviceAIO,
|
||||
DisableGuestSeLinux: defaultDisableGuestSeLinux,
|
||||
SNPGuestPolicy: defaultSNPGuestPolicy,
|
||||
HotPlugVFIO: defaultHotPlugVFIO,
|
||||
ColdPlugVFIO: defaultColdPlugVFIO,
|
||||
}
|
||||
|
||||
@@ -604,7 +606,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
|
||||
|
||||
func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
var coldPlugVFIO hv.PCIePort
|
||||
var coldPlugVFIO config.PCIePort
|
||||
hypervisorPath := path.Join(dir, "hypervisor")
|
||||
kernelPath := path.Join(dir, "kernel")
|
||||
imagePath := path.Join(dir, "image")
|
||||
@@ -612,8 +614,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
disableBlock := true
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
coldPlugVFIO = hv.RootPort
|
||||
coldPlugVFIO = config.BridgePort
|
||||
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
|
||||
blockDeviceAIO := "io_uring"
|
||||
defer func() {
|
||||
@@ -632,7 +633,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
DisableBlockDeviceUse: disableBlock,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
ColdPlugVFIO: coldPlugVFIO,
|
||||
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
|
||||
TxRateLimiterMaxRate: txRateLimiterMaxRate,
|
||||
@@ -688,10 +688,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
|
||||
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
|
||||
}
|
||||
|
||||
if config.PCIeRootPort != pcieRootPort {
|
||||
t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
|
||||
}
|
||||
|
||||
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
|
||||
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
|
||||
}
|
||||
@@ -814,7 +810,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
|
||||
disableBlock := true
|
||||
enableIOThreads := true
|
||||
hotplugVFIOOnRootBus := true
|
||||
pcieRootPort := uint32(2)
|
||||
|
||||
hypervisor := hypervisor{
|
||||
Path: hypervisorPath,
|
||||
@@ -825,7 +820,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
|
||||
DisableBlockDeviceUse: disableBlock,
|
||||
EnableIOThreads: enableIOThreads,
|
||||
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
|
||||
PCIeRootPort: pcieRootPort,
|
||||
}
|
||||
|
||||
_, err := newQemuHypervisorConfig(hypervisor)
|
||||
|
||||
@@ -460,6 +460,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
|
||||
return err
|
||||
}
|
||||
|
||||
if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok {
|
||||
if value != "" {
|
||||
config.HypervisorConfig.HypervisorMachineType = value
|
||||
@@ -515,12 +519,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
|
||||
return err
|
||||
}
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) {
|
||||
config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
|
||||
if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) {
|
||||
return fmt.Errorf("entropy source %v required from annotation is not valid", value)
|
||||
@@ -583,6 +581,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
|
||||
return nil
|
||||
}
|
||||
|
||||
func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) {
|
||||
if value == "" {
|
||||
return config.NoPort, nil
|
||||
}
|
||||
port := config.PCIePort(value)
|
||||
if port.Invalid() {
|
||||
return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value)
|
||||
}
|
||||
return port, nil
|
||||
}
|
||||
|
||||
func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
|
||||
|
||||
var err error
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok {
|
||||
if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
|
||||
return err
|
||||
}
|
||||
// If hot-plug is specified disable cold-plug and vice versa
|
||||
sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort
|
||||
}
|
||||
if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok {
|
||||
if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
|
||||
return err
|
||||
}
|
||||
// If cold-plug is specified disable hot-plug and vice versa
|
||||
sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
|
||||
|
||||
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error {
|
||||
|
||||
@@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) {
|
||||
func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
config := vc.SandboxConfig{
|
||||
sbConfig := vc.SandboxConfig{
|
||||
Annotations: make(map[string]string),
|
||||
}
|
||||
|
||||
@@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"}
|
||||
|
||||
ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on"
|
||||
addHypervisorConfigOverrides(ocispec, &config, runtimeConfig)
|
||||
assert.Exactly(expectedHyperConfig, config.HypervisorConfig)
|
||||
addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
|
||||
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
|
||||
@@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
|
||||
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
|
||||
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
|
||||
ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
|
||||
ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
|
||||
ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
|
||||
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
|
||||
ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi"
|
||||
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true"
|
||||
@@ -668,55 +669,58 @@ func TestAddHypervisorAnnotations(t *testing.T) {
|
||||
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
|
||||
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
|
||||
|
||||
addAnnotations(ocispec, &config, runtimeConfig)
|
||||
assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1))
|
||||
assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
|
||||
assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024))
|
||||
assert.Equal(config.HypervisorConfig.MemSlots, uint32(20))
|
||||
assert.Equal(config.HypervisorConfig.MemOffset, uint64(512))
|
||||
assert.Equal(config.HypervisorConfig.VirtioMem, true)
|
||||
assert.Equal(config.HypervisorConfig.MemPrealloc, true)
|
||||
assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
|
||||
assert.Equal(config.HypervisorConfig.HugePages, true)
|
||||
assert.Equal(config.HypervisorConfig.IOMMU, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring")
|
||||
assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true)
|
||||
assert.Equal(config.HypervisorConfig.EnableIOThreads, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true)
|
||||
assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true)
|
||||
assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs")
|
||||
assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false")
|
||||
assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto")
|
||||
assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
|
||||
assert.Equal(config.HypervisorConfig.Msize9p, uint32(512))
|
||||
assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35")
|
||||
assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw")
|
||||
assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off")
|
||||
assert.Equal(config.HypervisorConfig.DisableVhostNet, true)
|
||||
assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/")
|
||||
assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
|
||||
assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
|
||||
assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
|
||||
assert.Equal(config.HypervisorConfig.IOMMUPlatform, true)
|
||||
assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864))
|
||||
assert.Equal(config.HypervisorConfig.LegacySerial, true)
|
||||
assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
|
||||
assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
|
||||
err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.NoError(err)
|
||||
|
||||
assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1))
|
||||
assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024))
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20))
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512))
|
||||
assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
|
||||
assert.Equal(sbConfig.HypervisorConfig.HugePages, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.IOMMU, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring")
|
||||
assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs")
|
||||
assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false")
|
||||
assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto")
|
||||
assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
|
||||
assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512))
|
||||
assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35")
|
||||
assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw")
|
||||
assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off")
|
||||
assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
|
||||
assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
|
||||
assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
|
||||
assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
|
||||
assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864))
|
||||
assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
|
||||
assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
|
||||
assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
|
||||
|
||||
// In case an absurd large value is provided, the config value if not over-ridden
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
|
||||
err := addAnnotations(ocispec, &config, runtimeConfig)
|
||||
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Error(err)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1"
|
||||
err = addAnnotations(ocispec, &config, runtimeConfig)
|
||||
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Error(err)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
|
||||
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1"
|
||||
err = addAnnotations(ocispec, &config, runtimeConfig)
|
||||
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
|
||||
assert.Error(err)
|
||||
|
||||
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
|
||||
|
||||
@@ -90,7 +90,7 @@ func (a *Acrn) Capabilities(ctx context.Context) types.Capabilities {
|
||||
span, _ := katatrace.Trace(ctx, a.Logger(), "Capabilities", acrnTracingTags, map[string]string{"sandbox_id": a.id})
|
||||
defer span.End()
|
||||
|
||||
return a.arch.capabilities()
|
||||
return a.arch.capabilities(a.config)
|
||||
}
|
||||
|
||||
func (a *Acrn) HypervisorConfig() HypervisorConfig {
|
||||
|
||||
@@ -33,7 +33,7 @@ type acrnArch interface {
|
||||
kernelParameters(debug bool) []Param
|
||||
|
||||
//capabilities returns the capabilities supported by acrn
|
||||
capabilities() types.Capabilities
|
||||
capabilities(config HypervisorConfig) types.Capabilities
|
||||
|
||||
// memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb
|
||||
memoryTopology(memMb uint64) Memory
|
||||
@@ -361,7 +361,7 @@ func (a *acrnArchBase) memoryTopology(memoryMb uint64) Memory {
|
||||
return memory
|
||||
}
|
||||
|
||||
func (a *acrnArchBase) capabilities() types.Capabilities {
|
||||
func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities {
|
||||
var caps types.Capabilities
|
||||
|
||||
caps.SetBlockDeviceSupport()
|
||||
|
||||
@@ -83,8 +83,9 @@ func TestAcrnArchBaseKernelParameters(t *testing.T) {
|
||||
func TestAcrnArchBaseCapabilities(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
acrnArchBase := newAcrnArchBase()
|
||||
config := HypervisorConfig{}
|
||||
|
||||
c := acrnArchBase.capabilities()
|
||||
c := acrnArchBase.capabilities(config)
|
||||
assert.True(c.IsBlockDeviceSupported())
|
||||
assert.True(c.IsBlockDeviceHotplugSupported())
|
||||
assert.False(c.IsFsSharingSupported())
|
||||
|
||||
@@ -349,6 +349,10 @@ func (clh *cloudHypervisor) createVirtiofsDaemon(sharedPath string) (VirtiofsDae
|
||||
}
|
||||
|
||||
func (clh *cloudHypervisor) setupVirtiofsDaemon(ctx context.Context) error {
|
||||
if clh.config.SharedFS == config.NoSharedFS {
|
||||
return nil
|
||||
}
|
||||
|
||||
if clh.config.SharedFS == config.Virtio9P {
|
||||
return errors.New("cloud-hypervisor only supports virtio based file sharing")
|
||||
}
|
||||
@@ -860,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
|
||||
defer cancel()
|
||||
|
||||
// Create the clh device config via the constructor to ensure default values are properly assigned
|
||||
clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
|
||||
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
|
||||
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
|
||||
}
|
||||
clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
|
||||
clh.devicesIds[device.ID] = pciInfo.GetId()
|
||||
|
||||
// clh doesn't use bridges, so the PCI path is simply the slot
|
||||
// number of the device. This will break if clh starts using
|
||||
@@ -882,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
|
||||
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
|
||||
}
|
||||
|
||||
guestPciPath, err := types.PciPathFromString(tokens[0])
|
||||
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
if device.Type == config.VFIOAPDeviceMediatedType {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
|
||||
}
|
||||
pciDevice.GuestPciPath = guestPciPath
|
||||
*device = pciDevice
|
||||
|
||||
device.GuestPciPath, err = types.PciPathFromString(tokens[0])
|
||||
|
||||
return err
|
||||
}
|
||||
@@ -933,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
|
||||
case BlockDev:
|
||||
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
|
||||
case VfioDev:
|
||||
deviceID = *devInfo.(config.VFIODev).GetID()
|
||||
deviceID = devInfo.(*config.VFIODev).ID
|
||||
default:
|
||||
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
|
||||
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")
|
||||
@@ -1210,7 +1211,9 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities
|
||||
|
||||
clh.Logger().WithField("function", "Capabilities").Info("get Capabilities")
|
||||
var caps types.Capabilities
|
||||
caps.SetFsSharingSupport()
|
||||
if clh.config.SharedFS != config.NoSharedFS {
|
||||
caps.SetFsSharingSupport()
|
||||
}
|
||||
caps.SetBlockDeviceHotplugSupport()
|
||||
return caps
|
||||
}
|
||||
|
||||
@@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
|
||||
assert.NoError(err, "Hotplug remove block device expected no error")
|
||||
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
|
||||
assert.NoError(err, "Hotplug remove vfio block device expected no error")
|
||||
|
||||
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)
|
||||
@@ -726,3 +726,30 @@ func TestClhSetConfig(t *testing.T) {
|
||||
|
||||
assert.Equal(clh.config, config)
|
||||
}
|
||||
|
||||
func TestClhCapabilities(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
hConfig, err := newClhConfig()
|
||||
assert.NoError(err)
|
||||
|
||||
clh := &cloudHypervisor{}
|
||||
assert.Equal(clh.config, HypervisorConfig{})
|
||||
|
||||
hConfig.SharedFS = config.VirtioFS
|
||||
|
||||
err = clh.setConfig(&hConfig)
|
||||
assert.NoError(err)
|
||||
|
||||
var ctx context.Context
|
||||
c := clh.Capabilities(ctx)
|
||||
assert.True(c.IsFsSharingSupported())
|
||||
|
||||
hConfig.SharedFS = config.NoSharedFS
|
||||
|
||||
err = clh.setConfig(&hConfig)
|
||||
assert.NoError(err)
|
||||
|
||||
c = clh.Capabilities(ctx)
|
||||
assert.False(c.IsFsSharingSupported())
|
||||
}
|
||||
|
||||
@@ -288,12 +288,12 @@ type HypervisorConfig struct {
|
||||
// root bus instead of a bridge.
|
||||
HotplugVFIOOnRootBus bool
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort uint32
|
||||
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
|
||||
// root port, switch, bridge or no port
|
||||
HotPlugVFIO hv.PCIePort
|
||||
|
||||
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
|
||||
// root port, switch or no port
|
||||
// root port, switch, bridge or no port
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
|
||||
// BootToBeTemplate used to indicate if the VM is created to be a template VM
|
||||
|
||||
@@ -389,7 +389,6 @@ type HypervisorConfig struct {
|
||||
Gid uint32
|
||||
SEVGuestPolicy uint32
|
||||
SNPGuestPolicy uint64
|
||||
PCIeRootPort uint32
|
||||
NumVCPUs uint32
|
||||
RemoteHypervisorTimeout uint32
|
||||
IOMMUPlatform bool
|
||||
@@ -420,7 +419,10 @@ type HypervisorConfig struct {
|
||||
DisableSeLinux bool
|
||||
DisableGuestSeLinux bool
|
||||
LegacySerial bool
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotPlugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
VFIODevices []config.DeviceInfo
|
||||
VhostUserBlkDevices []config.DeviceInfo
|
||||
}
|
||||
|
||||
// vcpu mapping from vcpu number to thread number
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
"github.com/docker/go-units"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
|
||||
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
|
||||
@@ -1148,7 +1149,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
|
||||
ContainerPath: dev.ContainerPath,
|
||||
Type: kataVfioPciDevType,
|
||||
Id: groupNum,
|
||||
Options: nil,
|
||||
Options: make([]string, len(devList)),
|
||||
}
|
||||
|
||||
// We always pass the device information to the agent, since
|
||||
@@ -1158,16 +1159,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
|
||||
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
|
||||
kataDevice.Type = kataVfioPciGuestKernelDevType
|
||||
}
|
||||
for i, dev := range devList {
|
||||
if dev.Type == config.VFIOAPDeviceMediatedType {
|
||||
kataDevice.Type = kataVfioApDevType
|
||||
kataDevice.Options = dev.APDevices
|
||||
} else {
|
||||
|
||||
if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
|
||||
kataDevice.Type = kataVfioApDevType
|
||||
kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
|
||||
} else {
|
||||
kataDevice.Options = make([]string, len(devList))
|
||||
for i, device := range devList {
|
||||
pciDevice := (*device).(config.VFIOPCIDev)
|
||||
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
|
||||
devBDF := drivers.GetBDF(dev.BDF)
|
||||
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return kataDevice
|
||||
@@ -1354,7 +1355,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
|
||||
if _, err = k.sendReq(ctx, req); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buildProcessFromExecID(req.ExecId)
|
||||
}
|
||||
|
||||
|
||||
@@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
|
||||
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
|
||||
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
|
||||
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
|
||||
PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
|
||||
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
|
||||
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
|
||||
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
|
||||
@@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
|
||||
DisableNestingChecks: hconf.DisableNestingChecks,
|
||||
DisableImageNvdimm: hconf.DisableImageNvdimm,
|
||||
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
|
||||
HotPlugVFIO: hconf.HotPlugVFIO,
|
||||
ColdPlugVFIO: hconf.ColdPlugVFIO,
|
||||
PCIeRootPort: hconf.PCIeRootPort,
|
||||
BootToBeTemplate: hconf.BootToBeTemplate,
|
||||
BootFromTemplate: hconf.BootFromTemplate,
|
||||
DisableVhostNet: hconf.DisableVhostNet,
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
package persistapi
|
||||
|
||||
import (
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
)
|
||||
@@ -131,10 +131,6 @@ type HypervisorConfig struct {
|
||||
// Enable SGX. Hardware-based isolation and memory encryption.
|
||||
SGXEPCSize int64
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort uint32
|
||||
|
||||
// NumVCPUs specifies default number of vCPUs for the VM.
|
||||
NumVCPUs uint32
|
||||
|
||||
@@ -199,9 +195,13 @@ type HypervisorConfig struct {
|
||||
// root bus instead of a bridge.
|
||||
HotplugVFIOOnRootBus bool
|
||||
|
||||
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
|
||||
// root, switch, bridge or no-port
|
||||
HotPlugVFIO config.PCIePort
|
||||
|
||||
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
|
||||
// root port or a switch or no-port
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
// root, bridge, switch or no-port
|
||||
ColdPlugVFIO config.PCIePort
|
||||
|
||||
// BootToBeTemplate used to indicate if the VM is created to be a template VM
|
||||
BootToBeTemplate bool
|
||||
|
||||
@@ -143,9 +143,11 @@ const (
|
||||
// root bus instead of a bridge.
|
||||
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
|
||||
|
||||
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
|
||||
// The PCIe Root Port device is used to hot-plug the PCIe device
|
||||
PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
|
||||
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
|
||||
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"
|
||||
|
||||
// HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged.
|
||||
HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio"
|
||||
|
||||
// EntropySource is a sandbox annotation to specify the path to a host source of
|
||||
// entropy (/dev/random, /dev/urandom or real hardware RNG device)
|
||||
|
||||
@@ -66,6 +66,11 @@ const romFile = ""
|
||||
// Default value is false.
|
||||
const defaultDisableModern = false
|
||||
|
||||
// A deeper PCIe topology than 5 is already not advisable just for the sake
|
||||
// of having enough buffer we limit ourselves to 10 and exit if we reach
|
||||
// the root bus
|
||||
const maxPCIeTopoDepth = 10
|
||||
|
||||
type qmpChannel struct {
|
||||
qmp *govmmQemu.QMP
|
||||
ctx context.Context
|
||||
@@ -76,15 +81,15 @@ type qmpChannel struct {
|
||||
|
||||
// QemuState keeps Qemu's state
|
||||
type QemuState struct {
|
||||
UUID string
|
||||
Bridges []types.Bridge
|
||||
// HotpluggedCPUs is the list of CPUs that were hot-added
|
||||
UUID string
|
||||
HotPlugVFIO config.PCIePort
|
||||
Bridges []types.Bridge
|
||||
HotpluggedVCPUs []hv.CPUDevice
|
||||
HotpluggedMemory int
|
||||
VirtiofsDaemonPid int
|
||||
PCIeRootPort int
|
||||
HotplugVFIOOnRootBus bool
|
||||
ColdPlugVFIO hv.PCIePort
|
||||
HotplugVFIO config.PCIePort
|
||||
ColdPlugVFIO config.PCIePort
|
||||
}
|
||||
|
||||
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
|
||||
@@ -207,7 +212,7 @@ func (q *qemu) Capabilities(ctx context.Context) types.Capabilities {
|
||||
span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id})
|
||||
defer span.End()
|
||||
|
||||
return q.arch.capabilities()
|
||||
return q.arch.capabilities(q.config)
|
||||
}
|
||||
|
||||
func (q *qemu) HypervisorConfig() HypervisorConfig {
|
||||
@@ -278,10 +283,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
|
||||
|
||||
q.Logger().Debug("Creating UUID")
|
||||
q.state.UUID = uuid.Generate().String()
|
||||
|
||||
q.state.HotPlugVFIO = q.config.HotPlugVFIO
|
||||
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
|
||||
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
|
||||
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
|
||||
q.state.HotPlugVFIO = q.config.HotPlugVFIO
|
||||
|
||||
// The path might already exist, but in case of VM templating,
|
||||
// we have to create it since the sandbox has not created it yet.
|
||||
@@ -727,27 +732,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
|
||||
}
|
||||
}
|
||||
|
||||
// Add PCIe Root Port devices to hypervisor
|
||||
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
|
||||
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
|
||||
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
|
||||
|
||||
if hypervisorConfig.PCIeRootPort > 0 {
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
|
||||
}
|
||||
|
||||
// The default OVMF MMIO aperture is too small for some PCIe devices
|
||||
// with huge BARs so we need to increase it.
|
||||
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
|
||||
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
|
||||
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
|
||||
fwCfg := govmmQemu.FwCfg{
|
||||
Name: "opt/ovmf/X-PciMmio64Mb",
|
||||
Str: pciMmio64Mb,
|
||||
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
|
||||
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
|
||||
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
|
||||
return err
|
||||
}
|
||||
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
|
||||
}
|
||||
|
||||
q.qemuConfig = qemuConfig
|
||||
|
||||
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
|
||||
@@ -773,6 +763,101 @@ func (q *qemu) checkBpfEnabled() {
|
||||
}
|
||||
}
|
||||
|
||||
// If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need
|
||||
// to hotplug 32 devices. We do not have enough PCIe root bus slots to
|
||||
// accomplish this task. Kata will use already some slots for vfio-xxxx-pci
|
||||
// devices.
|
||||
// Max PCI slots per root bus is 32
|
||||
// Max PCIe root ports is 16
|
||||
// Max PCIe switch ports is 16
|
||||
// There is only 64kB of IO memory each root,switch port will consume 4k hence
|
||||
// only 16 ports possible.
|
||||
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
|
||||
|
||||
// If no-port set just return no need to add PCIe Root Port or PCIe Switches
|
||||
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Add PCIe Root Port or PCIe Switches to the hypervisor
|
||||
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged
|
||||
// into a PCIe Root Port or PCIe Switch.
|
||||
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
|
||||
|
||||
// Deduce the right values for mem-reserve and pref-64-reserve memory regions
|
||||
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
|
||||
|
||||
// The default OVMF MMIO aperture is too small for some PCIe devices
|
||||
// with huge BARs so we need to increase it.
|
||||
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
|
||||
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
|
||||
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
|
||||
fwCfg := govmmQemu.FwCfg{
|
||||
Name: "opt/ovmf/X-PciMmio64Mb",
|
||||
Str: pciMmio64Mb,
|
||||
}
|
||||
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
|
||||
}
|
||||
|
||||
// Get the number of hot(cold)-pluggable ports needed from the provided
|
||||
// VFIO devices and VhostUserBlockDevices
|
||||
var numOfPluggablePorts uint32 = 0
|
||||
for _, dev := range hypervisorConfig.VFIODevices {
|
||||
var err error
|
||||
dev.HostPath, err = config.GetHostPath(dev, false, "")
|
||||
if err != nil {
|
||||
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
|
||||
}
|
||||
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
|
||||
}
|
||||
for _, vfioDevice := range devicesPerIOMMUGroup {
|
||||
if drivers.IsPCIeDevice(vfioDevice.BDF) {
|
||||
numOfPluggablePorts = numOfPluggablePorts + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
|
||||
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
|
||||
|
||||
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
|
||||
|
||||
// If number of PCIe root ports > 16 then bail out otherwise we may
|
||||
// use up all slots or IO memory on the root bus and vfio-XXX-pci devices
|
||||
// cannot be added which are crucial for Kata max slots on root bus is 32
|
||||
// max slots on the complete pci(e) topology is 256 in QEMU
|
||||
if vfioOnRootPort {
|
||||
// On Arm the vhost-user-block device is a PCIe device we need
|
||||
// to account for it in the number of pluggable ports
|
||||
if machineType == QemuVirt {
|
||||
numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
|
||||
}
|
||||
if numOfPluggablePorts > maxPCIeRootPort {
|
||||
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||
}
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
||||
return nil
|
||||
}
|
||||
if vfioOnSwitchPort {
|
||||
// On Arm the vhost-user-block device is a PCIe device we need
|
||||
// to account for it in the number of pluggable ports
|
||||
if machineType == QemuVirt {
|
||||
numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
|
||||
if numOfPluggableRootPorts > maxPCIeRootPort {
|
||||
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
|
||||
}
|
||||
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
|
||||
}
|
||||
if numOfPluggablePorts > maxPCIeSwitchPort {
|
||||
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
|
||||
}
|
||||
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
|
||||
return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
|
||||
}
|
||||
@@ -1612,6 +1697,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
|
||||
|
||||
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -1629,18 +1715,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
|
||||
|
||||
switch machineType {
|
||||
case QemuVirt:
|
||||
if q.state.PCIeRootPort <= 0 {
|
||||
return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
|
||||
}
|
||||
|
||||
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
|
||||
//Since the dev is the first and only one on this bus(root port), it should be 0.
|
||||
addr := "00"
|
||||
|
||||
bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs))
|
||||
drivers.AllPCIeDevs[devID] = true
|
||||
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
|
||||
config.PCIeDevices[config.RootPort][devID] = true
|
||||
|
||||
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId)
|
||||
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
|
||||
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -1656,7 +1738,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
|
||||
return err
|
||||
}
|
||||
|
||||
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil {
|
||||
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -1770,41 +1852,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
|
||||
|
||||
// Query QMP to find a device's PCI path given its QOM path or ID
|
||||
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
|
||||
// XXX: For now we assume there's exactly one bridge, since
|
||||
// that's always how we configure qemu from Kata for now. It
|
||||
// would be good to generalize this to different PCI
|
||||
// topologies
|
||||
|
||||
var slots []types.PciSlot
|
||||
|
||||
devSlot, err := q.qomGetSlot(qemuID)
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
}
|
||||
slots = append(slots, devSlot)
|
||||
|
||||
busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus")
|
||||
// This only works for Q35 and Virt
|
||||
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
|
||||
|
||||
var parentPath = qemuID
|
||||
// We do not want to use a forever loop here, a deeper PCIe topology
|
||||
// than 5 is already not advisable just for the sake of having enough
|
||||
// buffer we limit ourselves to 10 and leave the loop early if we hit
|
||||
// the root bus.
|
||||
for i := 1; i <= maxPCIeTopoDepth; i++ {
|
||||
parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
}
|
||||
|
||||
busQOM, ok := parenBusQOM.(string)
|
||||
if !ok {
|
||||
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
|
||||
}
|
||||
|
||||
// If we hit /machine/q35/pcie.0 we're done this is the root bus
|
||||
// we climbed the complete hierarchy
|
||||
if r.Match([]byte(busQOM)) {
|
||||
break
|
||||
}
|
||||
|
||||
// `bus` is the QOM path of the QOM bus object, but we need
|
||||
// the PCI parent_bus which manages that bus. There doesn't seem
|
||||
// to be a way to get that other than to simply drop the last
|
||||
// path component.
|
||||
idx := strings.LastIndex(busQOM, "/")
|
||||
if idx == -1 {
|
||||
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
|
||||
}
|
||||
parentBus := busQOM[:idx]
|
||||
|
||||
parentSlot, err := q.qomGetSlot(parentBus)
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
}
|
||||
|
||||
// Prepend the slots, since we're climbing the hierarchy
|
||||
slots = append([]types.PciSlot{parentSlot}, slots...)
|
||||
parentPath = parentBus
|
||||
}
|
||||
return types.PciPathFromSlots(slots...)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
|
||||
return q.executeVFIODeviceAdd(device)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
|
||||
return q.executeVFIODeviceAdd(device)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
|
||||
addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
return err
|
||||
}
|
||||
|
||||
bus, ok := busq.(string)
|
||||
if !ok {
|
||||
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq)
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
q.arch.removeDeviceFromBridge(device.ID)
|
||||
}
|
||||
}()
|
||||
return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
|
||||
}
|
||||
|
||||
// `bus` is the QOM path of the QOM bus object, but we need
|
||||
// the PCI bridge which manages that bus. There doesn't seem
|
||||
// to be a way to get that other than to simply drop the last
|
||||
// path component.
|
||||
idx := strings.LastIndex(bus, "/")
|
||||
if idx == -1 {
|
||||
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus)
|
||||
func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
|
||||
switch device.Type {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
|
||||
default:
|
||||
return fmt.Errorf("Incorrect VFIO device type found")
|
||||
}
|
||||
bridge := bus[:idx]
|
||||
}
|
||||
|
||||
bridgeSlot, err := q.qomGetSlot(bridge)
|
||||
if err != nil {
|
||||
return types.PciPath{}, err
|
||||
func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
|
||||
switch device.Type {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
|
||||
default:
|
||||
return fmt.Errorf("Incorrect VFIO device type found")
|
||||
}
|
||||
|
||||
return types.PciPathFromSlots(bridgeSlot, devSlot)
|
||||
}
|
||||
|
||||
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
|
||||
@@ -1812,109 +1961,53 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
|
||||
return err
|
||||
}
|
||||
|
||||
devID := *(*device).GetID()
|
||||
machineType := q.HypervisorConfig().HypervisorMachineType
|
||||
|
||||
if op == AddDevice {
|
||||
|
||||
buf, _ := json.Marshal(device)
|
||||
q.Logger().WithFields(logrus.Fields{
|
||||
"machine-type": machineType,
|
||||
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
|
||||
"pcie-root-port": q.state.PCIeRootPort,
|
||||
"device-info": string(buf),
|
||||
"machine-type": q.HypervisorConfig().HypervisorMachineType,
|
||||
"hot-plug-vfio": q.state.HotPlugVFIO,
|
||||
"device-info": string(buf),
|
||||
}).Info("Start hot-plug VFIO device")
|
||||
|
||||
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
|
||||
// for pc machine type instead of bridge. This is useful for devices that require
|
||||
// a large PCI BAR which is a currently a limitation with PCI bridges.
|
||||
if q.state.HotplugVFIOOnRootBus {
|
||||
switch (*device).GetType() {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
|
||||
}
|
||||
switch machineType {
|
||||
case QemuQ35:
|
||||
if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
|
||||
q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
|
||||
pciDevice.Bus = ""
|
||||
}
|
||||
default:
|
||||
pciDevice.Bus = ""
|
||||
}
|
||||
*device = pciDevice
|
||||
|
||||
if pciDevice.Type == config.VFIOPCIDeviceNormalType {
|
||||
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
|
||||
} else {
|
||||
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
|
||||
}
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
|
||||
}
|
||||
// In case MachineType is q35, a PCIe device is hotplugged on
|
||||
// a PCIe Root Port or alternatively on a PCIe Switch Port
|
||||
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
|
||||
device.Bus = ""
|
||||
} else {
|
||||
addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
|
||||
var err error
|
||||
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
|
||||
// for pc machine type instead of bridge. This is useful for devices that require
|
||||
// a large PCI BAR which is a currently a limitation with PCI bridges.
|
||||
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
|
||||
err = q.hotplugVFIODeviceRootPort(ctx, device)
|
||||
} else if q.state.HotPlugVFIO == config.SwitchPort {
|
||||
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
|
||||
} else {
|
||||
err = q.hotplugVFIODeviceBridgePort(ctx, device)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
q.arch.removeDeviceFromBridge(devID)
|
||||
}
|
||||
}()
|
||||
|
||||
switch (*device).GetType() {
|
||||
case config.VFIOPCIDeviceNormalType:
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
|
||||
}
|
||||
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
|
||||
case config.VFIOPCIDeviceMediatedType:
|
||||
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
|
||||
case config.VFIOAPDeviceMediatedType:
|
||||
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
|
||||
default:
|
||||
return fmt.Errorf("Incorrect VFIO device type found")
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
switch (*device).GetType() {
|
||||
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
|
||||
pciDevice, ok := (*device).(config.VFIOPCIDev)
|
||||
if !ok {
|
||||
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
|
||||
}
|
||||
// XXX: Depending on whether we're doing root port or
|
||||
// bridge hotplug, and how the bridge is set up in
|
||||
// other parts of the code, we may or may not already
|
||||
// have information about the slot number of the
|
||||
// bridge and or the device. For simplicity, just
|
||||
// query both of them back from qemu
|
||||
guestPciPath, err := q.qomGetPciPath(devID)
|
||||
pciDevice.GuestPciPath = guestPciPath
|
||||
*device = pciDevice
|
||||
return err
|
||||
}
|
||||
// XXX: Depending on whether we're doing root port or
|
||||
// bridge hotplug, and how the bridge is set up in
|
||||
// other parts of the code, we may or may not already
|
||||
// have information about the slot number of the
|
||||
// bridge and or the device. For simplicity, just
|
||||
// query both of them back from qemu
|
||||
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
|
||||
return err
|
||||
} else {
|
||||
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
|
||||
|
||||
if !q.state.HotplugVFIOOnRootBus {
|
||||
if err := q.arch.removeDeviceFromBridge(devID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID)
|
||||
}
|
||||
|
||||
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
|
||||
|
||||
if !q.state.HotplugVFIOOnRootBus {
|
||||
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
|
||||
|
||||
}
|
||||
|
||||
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
|
||||
@@ -2612,7 +2705,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
|
||||
for i := uint32(0); i < number; i++ {
|
||||
devices = append(devices,
|
||||
govmmQemu.PCIeRootPortDevice{
|
||||
ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
|
||||
ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
|
||||
Bus: bus,
|
||||
Chassis: chassis,
|
||||
Slot: strconv.FormatUint(uint64(i), 10),
|
||||
@@ -2626,6 +2719,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
|
||||
return devices
|
||||
}
|
||||
|
||||
// gollangci-lint enforces multi-line comments to be a block comment
|
||||
// not multiple single line comments ...
|
||||
/* pcie.0 bus
|
||||
// -------------------------------------------------
|
||||
// |
|
||||
// -------------
|
||||
// | Root Port |
|
||||
// -------------
|
||||
// -------------------------|------------------------
|
||||
// | ----------------- |
|
||||
// | PCI Express | Upstream Port | |
|
||||
// | Switch ----------------- |
|
||||
// | | | |
|
||||
// | ------------------- ------------------- |
|
||||
// | | Downstream Port | | Downstream Port | |
|
||||
// | ------------------- ------------------- |
|
||||
// -------------|-----------------------|------------
|
||||
// ------------- --------------
|
||||
// | GPU/ACCEL | | IB/ETH NIC |
|
||||
// ------------- --------------
|
||||
*/
|
||||
// genericAppendPCIeSwitch adds a PCIe Swtich
|
||||
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
||||
|
||||
// Q35, Virt have the correct PCIe support,
|
||||
// hence ignore all other machines
|
||||
if machineType != QemuQ35 && machineType != QemuVirt {
|
||||
return devices
|
||||
}
|
||||
|
||||
// Using an own ID for the root port, so we do not clash with already
|
||||
// existing root ports adding "s" for switch prefix
|
||||
pcieRootPort := govmmQemu.PCIeRootPortDevice{
|
||||
ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
|
||||
Bus: defaultBridgeBus,
|
||||
Chassis: "1",
|
||||
Slot: strconv.FormatUint(uint64(0), 10),
|
||||
Multifunction: false,
|
||||
Addr: "0",
|
||||
MemReserve: fmt.Sprintf("%dB", memSize32bit),
|
||||
Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
|
||||
}
|
||||
|
||||
devices = append(devices, pcieRootPort)
|
||||
|
||||
pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
|
||||
ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
|
||||
Bus: pcieRootPort.ID,
|
||||
}
|
||||
devices = append(devices, pcieSwitchUpstreamPort)
|
||||
|
||||
currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
|
||||
if err != nil {
|
||||
return devices
|
||||
}
|
||||
nextChassis := currentChassis + 1
|
||||
|
||||
for i := uint32(0); i < number; i++ {
|
||||
|
||||
pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
|
||||
ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
|
||||
Bus: pcieSwitchUpstreamPort.ID,
|
||||
Chassis: fmt.Sprintf("%d", nextChassis),
|
||||
Slot: strconv.FormatUint(uint64(i), 10),
|
||||
// TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
|
||||
// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
|
||||
}
|
||||
devices = append(devices, pcieSwitchDownstreamPort)
|
||||
}
|
||||
|
||||
return devices
|
||||
}
|
||||
|
||||
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
|
||||
span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
|
||||
defer span.End()
|
||||
@@ -2801,7 +2967,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
|
||||
s.UUID = q.state.UUID
|
||||
s.HotpluggedMemory = q.state.HotpluggedMemory
|
||||
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
|
||||
s.PCIeRootPort = q.state.PCIeRootPort
|
||||
|
||||
for _, bridge := range q.arch.getBridges() {
|
||||
s.Bridges = append(s.Bridges, hv.Bridge{
|
||||
@@ -2825,7 +2990,6 @@ func (q *qemu) Load(s hv.HypervisorState) {
|
||||
q.state.HotpluggedMemory = s.HotpluggedMemory
|
||||
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
|
||||
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
|
||||
q.state.PCIeRootPort = s.PCIeRootPort
|
||||
|
||||
for _, bridge := range s.Bridges {
|
||||
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))
|
||||
|
||||
@@ -26,6 +26,7 @@ import (
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
|
||||
"github.com/intel-go/cpuid"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
|
||||
)
|
||||
|
||||
@@ -182,7 +183,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
|
||||
return q, nil
|
||||
}
|
||||
|
||||
func (q *qemuAmd64) capabilities() types.Capabilities {
|
||||
func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities {
|
||||
var caps types.Capabilities
|
||||
|
||||
if q.qemuMachine.Type == QemuQ35 ||
|
||||
@@ -191,7 +192,9 @@ func (q *qemuAmd64) capabilities() types.Capabilities {
|
||||
}
|
||||
|
||||
caps.SetMultiQueueSupport()
|
||||
caps.SetFsSharingSupport()
|
||||
if hConfig.SharedFS != config.NoSharedFS {
|
||||
caps.SetFsSharingSupport()
|
||||
}
|
||||
|
||||
return caps
|
||||
}
|
||||
@@ -323,6 +326,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
|
||||
ReducedPhysBits: 1,
|
||||
}), "", nil
|
||||
case noneProtection:
|
||||
|
||||
return devices, firmware, nil
|
||||
|
||||
default:
|
||||
|
||||
@@ -42,13 +42,14 @@ func TestQemuAmd64BadMachineType(t *testing.T) {
|
||||
|
||||
func TestQemuAmd64Capabilities(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
config := HypervisorConfig{}
|
||||
|
||||
amd64 := newTestQemu(assert, QemuQ35)
|
||||
caps := amd64.capabilities()
|
||||
caps := amd64.capabilities(config)
|
||||
assert.True(caps.IsBlockDeviceHotplugSupported())
|
||||
|
||||
amd64 = newTestQemu(assert, QemuMicrovm)
|
||||
caps = amd64.capabilities()
|
||||
caps = amd64.capabilities(config)
|
||||
assert.False(caps.IsBlockDeviceHotplugSupported())
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ type qemuArch interface {
|
||||
kernelParameters(debug bool) []Param
|
||||
|
||||
//capabilities returns the capabilities supported by QEMU
|
||||
capabilities() types.Capabilities
|
||||
capabilities(config HypervisorConfig) types.Capabilities
|
||||
|
||||
// bridges sets the number bridges for the machine type
|
||||
bridges(number uint32)
|
||||
@@ -150,6 +150,9 @@ type qemuArch interface {
|
||||
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
|
||||
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
|
||||
|
||||
// appendPCIeSwitch appends a ioh3420 device to a pcie-root-port
|
||||
appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
|
||||
|
||||
// append vIOMMU device
|
||||
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
|
||||
|
||||
@@ -204,7 +207,8 @@ const (
|
||||
defaultBridgeBus = "pcie.0"
|
||||
defaultPCBridgeBus = "pci.0"
|
||||
maxDevIDSize = 31
|
||||
pcieRootPortPrefix = "rp"
|
||||
maxPCIeRootPort = 16 // Limitation from QEMU
|
||||
maxPCIeSwitchPort = 16 // Limitation from QEMU
|
||||
)
|
||||
|
||||
// This is the PCI start address assigned to the first bridge that
|
||||
@@ -313,11 +317,13 @@ func (q *qemuArchBase) kernelParameters(debug bool) []Param {
|
||||
return params
|
||||
}
|
||||
|
||||
func (q *qemuArchBase) capabilities() types.Capabilities {
|
||||
func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities {
|
||||
var caps types.Capabilities
|
||||
caps.SetBlockDeviceHotplugSupport()
|
||||
caps.SetMultiQueueSupport()
|
||||
caps.SetFsSharingSupport()
|
||||
if hConfig.SharedFS != config.NoSharedFS {
|
||||
caps.SetFsSharingSupport()
|
||||
}
|
||||
return caps
|
||||
}
|
||||
|
||||
@@ -708,17 +714,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
|
||||
}
|
||||
|
||||
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
|
||||
pciDevice := vfioDev.(config.VFIOPCIDev)
|
||||
if pciDevice.BDF == "" {
|
||||
|
||||
if vfioDev.BDF == "" {
|
||||
return devices
|
||||
}
|
||||
|
||||
devices = append(devices,
|
||||
govmmQemu.VFIODevice{
|
||||
BDF: pciDevice.BDF,
|
||||
VendorID: pciDevice.VendorID,
|
||||
DeviceID: pciDevice.DeviceID,
|
||||
Bus: pciDevice.Bus,
|
||||
BDF: vfioDev.BDF,
|
||||
VendorID: vfioDev.VendorID,
|
||||
DeviceID: vfioDev.DeviceID,
|
||||
Bus: vfioDev.Bus,
|
||||
},
|
||||
)
|
||||
|
||||
@@ -834,6 +840,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb
|
||||
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
|
||||
}
|
||||
|
||||
// appendPCIeSwitchPortDevice appends a PCIe Switch with <number> ports
|
||||
func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
|
||||
return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
|
||||
}
|
||||
|
||||
// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the
|
||||
// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs.
|
||||
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
|
||||
|
||||
pci := nvpci.New()
|
||||
|
||||
@@ -117,9 +117,16 @@ func TestQemuArchBaseKernelParameters(t *testing.T) {
|
||||
func TestQemuArchBaseCapabilities(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
qemuArchBase := newQemuArchBase()
|
||||
hConfig := HypervisorConfig{}
|
||||
hConfig.SharedFS = config.VirtioFS
|
||||
|
||||
c := qemuArchBase.capabilities()
|
||||
c := qemuArchBase.capabilities(hConfig)
|
||||
assert.True(c.IsBlockDeviceHotplugSupported())
|
||||
assert.True(c.IsFsSharingSupported())
|
||||
|
||||
hConfig.SharedFS = config.NoSharedFS
|
||||
c = qemuArchBase.capabilities(hConfig)
|
||||
assert.False(c.IsFsSharingSupported())
|
||||
}
|
||||
|
||||
func TestQemuArchBaseBridges(t *testing.T) {
|
||||
@@ -463,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
vfDevice := config.VFIOPCIDev{
|
||||
vfDevice := config.VFIODev{
|
||||
BDF: bdf,
|
||||
}
|
||||
|
||||
@@ -483,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
vfDevice := config.VFIOPCIDev{
|
||||
vfDevice := config.VFIODev{
|
||||
BDF: bdf,
|
||||
VendorID: vendorID,
|
||||
DeviceID: deviceID,
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
||||
"github.com/sirupsen/logrus"
|
||||
@@ -97,7 +98,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
|
||||
return q, nil
|
||||
}
|
||||
|
||||
func (q *qemuPPC64le) capabilities() types.Capabilities {
|
||||
func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities {
|
||||
var caps types.Capabilities
|
||||
|
||||
// pseries machine type supports hotplugging drives
|
||||
@@ -106,7 +107,9 @@ func (q *qemuPPC64le) capabilities() types.Capabilities {
|
||||
}
|
||||
|
||||
caps.SetMultiQueueSupport()
|
||||
caps.SetFsSharingSupport()
|
||||
if hConfig.SharedFS != config.NoSharedFS {
|
||||
caps.SetFsSharingSupport()
|
||||
}
|
||||
|
||||
return caps
|
||||
}
|
||||
|
||||
@@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
|
||||
config6 := newQemuConfig()
|
||||
config6.DisableGuestSeLinux = false
|
||||
|
||||
config7 := newQemuConfig()
|
||||
config7.PCIeRootPort = 1
|
||||
|
||||
config8 := newQemuConfig()
|
||||
config8.EnableVhostUserStore = true
|
||||
config8.HugePages = true
|
||||
@@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
|
||||
{config3, false, true},
|
||||
{config5, false, true},
|
||||
{config6, false, false},
|
||||
{config7, false, true},
|
||||
{config8, false, true},
|
||||
{config9, true, false},
|
||||
{config10, false, true},
|
||||
|
||||
@@ -36,7 +36,6 @@ import (
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
|
||||
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
|
||||
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
|
||||
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
|
||||
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
|
||||
@@ -106,15 +105,10 @@ type HypervisorPidKey struct{}
|
||||
|
||||
// SandboxStatus describes a sandbox status.
|
||||
type SandboxStatus struct {
|
||||
ContainersStatus []ContainerStatus
|
||||
|
||||
// Annotations allow clients to store arbitrary values,
|
||||
// for example to add additional status values required
|
||||
// to support particular specifications.
|
||||
Annotations map[string]string
|
||||
|
||||
Annotations map[string]string
|
||||
ID string
|
||||
Hypervisor HypervisorType
|
||||
ContainersStatus []ContainerStatus
|
||||
State types.SandboxState
|
||||
HypervisorConfig HypervisorConfig
|
||||
}
|
||||
@@ -530,6 +524,7 @@ func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Fac
|
||||
return s, nil
|
||||
}
|
||||
|
||||
//nolint:gocyclo
|
||||
func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) {
|
||||
span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
|
||||
defer span.End()
|
||||
@@ -630,22 +625,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
||||
|
||||
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
|
||||
// until we have TDISP/IDE PCIe support.
|
||||
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
|
||||
var devs []config.DeviceInfo
|
||||
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
|
||||
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
||||
// the correct amount of ports to reserve for the hypervisor.
|
||||
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
||||
|
||||
var vfioDevices []config.DeviceInfo
|
||||
// vhost-user-block device is a PCIe device in Virt, keep track of it
|
||||
// for correct number of PCIe root ports.
|
||||
var vhostUserBlkDevices []config.DeviceInfo
|
||||
|
||||
for cnt, containers := range sandboxConfig.Containers {
|
||||
for dev, device := range containers.DeviceInfos {
|
||||
if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
|
||||
|
||||
if deviceManager.IsVhostUserBlk(device) {
|
||||
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
|
||||
continue
|
||||
}
|
||||
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
|
||||
if hotPlugVFIO && isVFIO {
|
||||
vfioDevices = append(vfioDevices, device)
|
||||
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
|
||||
}
|
||||
if coldPlugVFIO && isVFIO {
|
||||
device.ColdPlug = true
|
||||
devs = append(devs, device)
|
||||
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
|
||||
vfioDevices = append(vfioDevices, device)
|
||||
// We need to remove the devices marked for cold-plug
|
||||
// otherwise at the container level the kata-agent
|
||||
// will try to hot-plug them.
|
||||
infos := sandboxConfig.Containers[cnt].DeviceInfos
|
||||
infos = append(infos[:dev], infos[dev+1:]...)
|
||||
sandboxConfig.Containers[cnt].DeviceInfos = infos
|
||||
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
|
||||
}
|
||||
}
|
||||
var filteredDevices []config.DeviceInfo
|
||||
for _, device := range containers.DeviceInfos {
|
||||
if device.ID != "remove-we-are-cold-plugging" {
|
||||
filteredDevices = append(filteredDevices, device)
|
||||
}
|
||||
}
|
||||
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
|
||||
|
||||
}
|
||||
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
|
||||
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
|
||||
|
||||
// store doesn't require hypervisor to be stored immediately
|
||||
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
|
||||
@@ -660,7 +682,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
||||
return s, nil
|
||||
}
|
||||
|
||||
for _, dev := range devs {
|
||||
for _, dev := range vfioDevices {
|
||||
_, err := s.AddDevice(ctx, dev)
|
||||
if err != nil {
|
||||
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
|
||||
@@ -1723,7 +1745,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
|
||||
defer span.End()
|
||||
|
||||
for i := range s.config.Containers {
|
||||
|
||||
c, err := newContainer(ctx, s, &s.config.Containers[i])
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -1742,7 +1763,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
|
||||
if err := s.updateResources(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := s.resourceControllerUpdate(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -1754,7 +1774,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
|
||||
if err := s.storeSandbox(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1918,15 +1937,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
|
||||
// adding a group of VFIO devices
|
||||
for _, dev := range vfioDevices {
|
||||
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
|
||||
bdf := ""
|
||||
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
|
||||
bdf = pciDevice.BDF
|
||||
}
|
||||
s.Logger().
|
||||
WithFields(logrus.Fields{
|
||||
"sandbox": s.id,
|
||||
"vfio-device-ID": (*dev).GetID(),
|
||||
"vfio-device-BDF": bdf,
|
||||
"vfio-device-ID": dev.ID,
|
||||
"vfio-device-BDF": dev.BDF,
|
||||
}).WithError(err).Error("failed to hotplug VFIO device")
|
||||
return err
|
||||
}
|
||||
@@ -1941,6 +1956,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
|
||||
return err
|
||||
case config.VhostUserBlk:
|
||||
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
|
||||
|
||||
if !ok {
|
||||
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
|
||||
}
|
||||
@@ -1975,15 +1991,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
|
||||
// remove a group of VFIO devices
|
||||
for _, dev := range vfioDevices {
|
||||
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
|
||||
bdf := ""
|
||||
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
|
||||
bdf = pciDevice.BDF
|
||||
}
|
||||
s.Logger().WithError(err).
|
||||
WithFields(logrus.Fields{
|
||||
"sandbox": s.id,
|
||||
"vfio-device-ID": (*dev).GetID(),
|
||||
"vfio-device-BDF": bdf,
|
||||
"vfio-device-ID": dev.ID,
|
||||
"vfio-device-BDF": dev.BDF,
|
||||
}).Error("failed to hot unplug VFIO device")
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
|
||||
_, err = os.Create(deviceFile)
|
||||
assert.Nil(t, err)
|
||||
|
||||
savedIOMMUPath := config.SysIOMMUPath
|
||||
config.SysIOMMUPath = tmpDir
|
||||
savedIOMMUPath := config.SysIOMMUGroupPath
|
||||
config.SysIOMMUGroupPath = tmpDir
|
||||
|
||||
defer func() {
|
||||
config.SysIOMMUPath = savedIOMMUPath
|
||||
config.SysIOMMUGroupPath = savedIOMMUPath
|
||||
}()
|
||||
|
||||
dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil)
|
||||
|
||||
@@ -240,3 +240,17 @@ restart_containerd_service() {
|
||||
clean_env_ctr
|
||||
return 0
|
||||
}
|
||||
|
||||
# @path_results: path to the input metric-results folder
|
||||
# @tarball_fname: path and filename to the output tarball
|
||||
function compress_metrics_results_dir()
|
||||
{
|
||||
local path_results="${1:-results}"
|
||||
local tarball_fname="${2:-}"
|
||||
|
||||
[ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect."
|
||||
[ ! -d "${path_results}" ] && die "Missing path to the results folder."
|
||||
|
||||
cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd -
|
||||
info "tarball generated: ${tarball_fname}"
|
||||
}
|
||||
|
||||
@@ -31,13 +31,16 @@ function login_azure() {
|
||||
}
|
||||
|
||||
function create_cluster() {
|
||||
# First, ensure that the cluster didn't fail to get cleaned up from a previous run.
|
||||
delete_cluster || true
|
||||
|
||||
az aks create \
|
||||
-g "kataCI" \
|
||||
-n "$(_print_cluster_name)" \
|
||||
-s "Standard_D4s_v5" \
|
||||
--node-count 1 \
|
||||
--generate-ssh-keys \
|
||||
$([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku mariner --workload-runtime KataMshvVmIsolation")
|
||||
$([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku AzureLinux --workload-runtime KataMshvVmIsolation")
|
||||
}
|
||||
|
||||
function install_bats() {
|
||||
@@ -55,10 +58,28 @@ function get_cluster_credentials() {
|
||||
-n "$(_print_cluster_name)"
|
||||
}
|
||||
|
||||
function ensure_yq() {
|
||||
: "${GOPATH:=${GITHUB_WORKSPACE}}"
|
||||
export GOPATH
|
||||
export PATH="${GOPATH}/bin:${PATH}"
|
||||
INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh"
|
||||
}
|
||||
|
||||
function run_tests() {
|
||||
platform="${1}"
|
||||
ensure_yq
|
||||
|
||||
# Emsure we're in the default namespace
|
||||
kubectl config set-context --current --namespace=default
|
||||
|
||||
# Delete any spurious tests namespace that was left behind
|
||||
kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true
|
||||
|
||||
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
|
||||
if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then
|
||||
yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS"
|
||||
yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}"
|
||||
fi
|
||||
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
|
||||
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image"
|
||||
|
||||
@@ -80,6 +101,10 @@ function run_tests() {
|
||||
sleep 60s
|
||||
fi
|
||||
|
||||
# Create a new namespace for the tests and switch to it
|
||||
kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml
|
||||
kubectl config set-context --current --namespace=kata-containers-k8s-tests
|
||||
|
||||
pushd "${integration_dir}/kubernetes"
|
||||
bash setup.sh
|
||||
bash run_kubernetes_tests.sh
|
||||
@@ -89,6 +114,10 @@ function run_tests() {
|
||||
function cleanup() {
|
||||
platform="${1}"
|
||||
|
||||
# Switch back to the default namespace and delete the tests one
|
||||
kubectl config set-context --current --namespace=default
|
||||
kubectl delete namespace kata-containers-k8s-tests
|
||||
|
||||
if [ "${platform}" = "tdx" ]; then
|
||||
deploy_spec="-k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s""
|
||||
cleanup_spec="-k "${tools_dir}/packaging/kata-deploy/kata-cleanup/overlays/k3s""
|
||||
@@ -115,11 +144,12 @@ function delete_cluster() {
|
||||
az aks delete \
|
||||
-g "kataCI" \
|
||||
-n "$(_print_cluster_name)" \
|
||||
--yes \
|
||||
--no-wait
|
||||
--yes
|
||||
}
|
||||
|
||||
function main() {
|
||||
export KATA_HOST_OS="${KATA_HOST_OS:-}"
|
||||
|
||||
action="${1:-}"
|
||||
|
||||
case "${action}" in
|
||||
|
||||
@@ -14,13 +14,12 @@ setup() {
|
||||
@test "Pod quota" {
|
||||
resource_name="pod-quota"
|
||||
deployment_name="deploymenttest"
|
||||
namespace="test-quota-ns"
|
||||
|
||||
# Create the resourcequota
|
||||
kubectl create -f "${pod_config_dir}/resource-quota.yaml"
|
||||
|
||||
# View information about resourcequota
|
||||
kubectl get -n "$namespace" resourcequota "$resource_name" \
|
||||
kubectl get resourcequota "$resource_name" \
|
||||
--output=yaml | grep 'pods: "2"'
|
||||
|
||||
# Create deployment
|
||||
@@ -28,10 +27,9 @@ setup() {
|
||||
|
||||
# View deployment
|
||||
kubectl wait --for=condition=Available --timeout=$timeout \
|
||||
-n "$namespace" deployment/${deployment_name}
|
||||
deployment/${deployment_name}
|
||||
}
|
||||
|
||||
teardown() {
|
||||
kubectl delete -n "$namespace" deployment "$deployment_name"
|
||||
kubectl delete -f "${pod_config_dir}/resource-quota.yaml"
|
||||
}
|
||||
|
||||
@@ -54,10 +54,6 @@ else
|
||||
)
|
||||
fi
|
||||
|
||||
if [ ${KATA_HOST_OS} == "cbl-mariner" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# we may need to skip a few test cases when running on non-x86_64 arch
|
||||
arch_config_file="${kubernetes_dir}/filter_out_per_arch/${TARGET_ARCH}.yaml"
|
||||
if [ -f "${arch_config_file}" ]; then
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
namespace: default
|
||||
name: custom-dns-test
|
||||
spec:
|
||||
terminationGracePeriodSeconds: 0
|
||||
|
||||
@@ -8,7 +8,6 @@ apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: pod-oom
|
||||
namespace: default
|
||||
spec:
|
||||
runtimeClassName: kata
|
||||
restartPolicy: Never
|
||||
|
||||
@@ -7,7 +7,6 @@ apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: deploymenttest
|
||||
namespace: test-quota-ns
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
|
||||
@@ -14,7 +14,6 @@ items:
|
||||
kind: ResourceQuota
|
||||
metadata:
|
||||
name: pod-quota
|
||||
namespace: test-quota-ns
|
||||
spec:
|
||||
hard:
|
||||
pods: "2"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: kata-containers-k8s-tests
|
||||
@@ -13,8 +13,24 @@ set_runtime_class() {
|
||||
sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml
|
||||
}
|
||||
|
||||
set_kernel_path() {
|
||||
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
|
||||
mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin"
|
||||
find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \;
|
||||
fi
|
||||
}
|
||||
|
||||
set_initrd_path() {
|
||||
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
|
||||
initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img"
|
||||
find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \;
|
||||
fi
|
||||
}
|
||||
|
||||
main() {
|
||||
set_runtime_class
|
||||
set_kernel_path
|
||||
set_initrd_path
|
||||
}
|
||||
|
||||
main "$@"
|
||||
|
||||
@@ -55,6 +55,8 @@ For further details see the [time tests documentation](time).
|
||||
Tests that measure the size and overheads of the runtime. Generally this is looking at
|
||||
memory footprint sizes, but could also cover disk space or even CPU consumption.
|
||||
|
||||
For further details see the [density tests documentation](density).
|
||||
|
||||
### Networking
|
||||
|
||||
Tests relating to networking. General items could include:
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# This file contains baseline expectations
|
||||
# for checked results by checkmetrics tool.
|
||||
#
|
||||
# values set specifically for packet.com c1.small worker.
|
||||
|
||||
[[metric]]
|
||||
name = "boot-times"
|
||||
type = "json"
|
||||
description = "measure container lifecycle timings"
|
||||
# Min and Max values to set a 'range' that
|
||||
# the median of the CSV Results data must fall
|
||||
# within (inclusive)
|
||||
checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
|
||||
checktype = "mean"
|
||||
midval = 0.42
|
||||
minpercent = 20.0
|
||||
maxpercent = 20.0
|
||||
|
||||
[[metric]]
|
||||
name = "memory-footprint"
|
||||
type = "json"
|
||||
description = "measure memory usage"
|
||||
# Min and Max values to set a 'range' that
|
||||
# the median of the CSV Results data must fall
|
||||
# within (inclusive)
|
||||
checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
|
||||
checktype = "mean"
|
||||
midval = 2518364.00
|
||||
minpercent = 20.0
|
||||
maxpercent = 20.0
|
||||
@@ -0,0 +1,34 @@
|
||||
# Copyright (c) 2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# This file contains baseline expectations
|
||||
# for checked results by checkmetrics tool.
|
||||
#
|
||||
# values set specifically for Equinix m3.small.x86.
|
||||
|
||||
[[metric]]
|
||||
name = "boot-times"
|
||||
type = "json"
|
||||
description = "measure container lifecycle timings"
|
||||
# Min and Max values to set a 'range' that
|
||||
# the median of the CSV Results data must fall
|
||||
# within (inclusive)
|
||||
checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
|
||||
checktype = "mean"
|
||||
midval = 0.61
|
||||
minpercent = 20.0
|
||||
maxpercent = 20.0
|
||||
|
||||
[[metric]]
|
||||
name = "memory-footprint"
|
||||
type = "json"
|
||||
description = "measure memory usage"
|
||||
# Min and Max values to set a 'range' that
|
||||
# the median of the CSV Results data must fall
|
||||
# within (inclusive)
|
||||
checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
|
||||
checktype = "mean"
|
||||
midval = 2435844.00
|
||||
minpercent = 20.0
|
||||
maxpercent = 20.0
|
||||
53
tests/metrics/density/README.md
Normal file
53
tests/metrics/density/README.md
Normal file
@@ -0,0 +1,53 @@
|
||||
# Kata Containers density metrics tests
|
||||
|
||||
This directory contains a number of tests to help measure container
|
||||
memory footprint. Some measures are based around the
|
||||
[PSS](https://en.wikipedia.org/wiki/Proportional_set_size) of the runtime
|
||||
components, and others look at the system level (`free` and `/proc/meminfo`
|
||||
for instance) impact.
|
||||
|
||||
## `memory_usage`
|
||||
|
||||
This test measures the PSS footprint of the runtime components whilst
|
||||
launching a number of small ([BusyBox](https://hub.docker.com/_/busybox/)) containers
|
||||
using ctr.
|
||||
|
||||
## `fast_footprint`
|
||||
|
||||
This test takes system level resource measurements after launching a number of
|
||||
containers in parallel and optionally waiting for KSM to settle its memory
|
||||
compaction cycles.
|
||||
|
||||
The script is quite configurable via environment variables, including:
|
||||
|
||||
* Which container workload to run.
|
||||
* How many containers to launch.
|
||||
* How many containers are launched in parallel.
|
||||
* How long to wait until taking the measures.
|
||||
|
||||
See the script itself for more details.
|
||||
|
||||
This test shares many config options with the `footprint_data` test. Thus, referring
|
||||
to the [footprint test documentation](footprint_data.md) may be useful.
|
||||
|
||||
> *Note:* If this test finds KSM is enabled on the host, it will wait for KSM
|
||||
> to "settle" before taking the final measurement. If your KSM is not configured
|
||||
> to process all the allocated VM memory fast enough, the test will hit a timeout
|
||||
> and proceed to take the final measurement anyway.
|
||||
|
||||
## `footprint_data`
|
||||
|
||||
Similar to the `fast_footprint` test, but this test launches the containers
|
||||
sequentially and takes a system level measurement between each launch. Thus,
|
||||
this test provides finer grained information on system scaling, but takes
|
||||
significantly longer to run than the `fast_footprint` test. If you are only
|
||||
interested in the final figure or the average impact, you may be better running
|
||||
the `fast_footprint` test.
|
||||
|
||||
For more details see the [footprint test documentation](footprint_data.md).
|
||||
|
||||
## `memory_usage_inside_container`
|
||||
|
||||
Measures the memory statistics *inside* the container. This allows evaluation of
|
||||
the overhead the VM kernel and rootfs are having on the memory that was requested
|
||||
by the container co-ordination system, and thus supplied to the VM.
|
||||
433
tests/metrics/density/fast_footprint.sh
Executable file
433
tests/metrics/density/fast_footprint.sh
Executable file
@@ -0,0 +1,433 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2017-2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# A script to gather memory 'footprint' information as we launch more
|
||||
# and more containers
|
||||
#
|
||||
# The script gathers information about both user and kernel space consumption
|
||||
# Output is into a .json file, named using some of the config component names
|
||||
# (such as footprint-busybox.json)
|
||||
|
||||
# Pull in some common, useful, items
|
||||
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
|
||||
source "${SCRIPT_PATH}/../lib/common.bash"
|
||||
|
||||
# Note that all vars that can be set from outside the script (that is,
|
||||
# passed in the ENV), use the ':-' setting to allow being over-ridden
|
||||
|
||||
# Default sleep, in seconds, to let containers come up and finish their
|
||||
# initialisation before we take the measures. Some of the larger
|
||||
# containers can take a number of seconds to get running.
|
||||
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
|
||||
|
||||
# How long, in seconds, do we wait for KSM to 'settle down', before we
|
||||
# timeout and just continue anyway.
|
||||
KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}"
|
||||
|
||||
# How long, in seconds, do we poll for ctr to complete launching all the
|
||||
# containers?
|
||||
CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}"
|
||||
|
||||
# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP
|
||||
# nap
|
||||
PARALLELISM="${PARALLELISM:-10}"
|
||||
|
||||
### The default config - run a small busybox image
|
||||
# Define what we will be running (app under test)
|
||||
# Default is we run busybox, as a 'small' workload
|
||||
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
|
||||
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
|
||||
|
||||
###
|
||||
# which RUNTIME we use is picked up from the env in
|
||||
# common.bash. You can over-ride by setting RUNTIME in your env
|
||||
|
||||
###
|
||||
# Define the cutoff checks for when we stop running the test
|
||||
# Run up to this many containers
|
||||
NUM_CONTAINERS="${NUM_CONTAINERS:-100}"
|
||||
# Run until we have consumed this much memory (from MemFree)
|
||||
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}"
|
||||
# Run until we have this much MemFree left
|
||||
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
|
||||
|
||||
# Tools we need to have installed in order to operate
|
||||
REQUIRED_COMMANDS="smem awk"
|
||||
|
||||
# If we 'dump' the system caches before we measure then we get less
|
||||
# noise in the results - they show more what our un-reclaimable footprint is
|
||||
DUMP_CACHES="${DUMP_CACHES:-1}"
|
||||
|
||||
# Affects the name of the file to store the results in
|
||||
TEST_NAME="${TEST_NAME:-fast-footprint-busybox}"
|
||||
|
||||
############# end of configurable items ###################
|
||||
|
||||
# vars to remember where we started so we can calc diffs
|
||||
base_mem_avail=0
|
||||
base_mem_free=0
|
||||
|
||||
# dump the kernel caches, so we get a more precise (or just different)
|
||||
# view of what our footprint really is.
|
||||
function dump_caches() {
|
||||
sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
|
||||
}
|
||||
|
||||
function init() {
|
||||
restart_containerd_service
|
||||
|
||||
check_cmds $REQUIRED_COMMANDS
|
||||
sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
|
||||
|
||||
# Modify the test name if running with KSM enabled
|
||||
check_for_ksm
|
||||
|
||||
# Use the common init func to get to a known state
|
||||
init_env
|
||||
|
||||
# Prepare to start storing results
|
||||
metrics_json_init
|
||||
|
||||
# Store up baseline measures
|
||||
base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
|
||||
base_mem_free=$(get_memfree)
|
||||
|
||||
# Store our configuration for this run
|
||||
save_config
|
||||
}
|
||||
|
||||
save_config(){
|
||||
metrics_json_start_array
|
||||
|
||||
local json="$(cat << EOF
|
||||
{
|
||||
"testname": "${TEST_NAME}",
|
||||
"payload": "${PAYLOAD}",
|
||||
"payload_args": "${PAYLOAD_ARGS}",
|
||||
"payload_sleep": ${PAYLOAD_SLEEP},
|
||||
"ksm_settle_time": ${KSM_WAIT_TIME},
|
||||
"num_containers": ${NUM_CONTAINERS},
|
||||
"parallelism": ${PARALLELISM},
|
||||
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
|
||||
"min_memory_free": "${MIN_MEMORY_FREE}",
|
||||
"dump_caches": "${DUMP_CACHES}"
|
||||
}
|
||||
EOF
|
||||
)"
|
||||
metrics_json_add_array_element "$json"
|
||||
metrics_json_end_array "Config"
|
||||
}
|
||||
|
||||
# Persist the collected metrics and tear down all test containers.
function cleanup() {
	# Finish storing the results
	metrics_json_save

	clean_env_ctr
}
|
||||
|
||||
# helper function to get USS of process in arg1
function get_proc_uss() {
	# totals row of 'smem -t', column 4 is USS (KB); scale to bytes
	item=$(($(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}') * 1024))
	echo $item
}
|
||||
|
||||
# helper function to get PSS of process in arg1
function get_proc_pss() {
	# totals row of 'smem -t', column 5 is PSS (KB); scale to bytes
	item=$(($(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}') * 1024))
	echo $item
}
|
||||
|
||||
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system, for instance
# the container manager grows as we launch containers, so we should account for
# that in our total memory breakdown
function grab_all_pss() {
	# totals row of 'smem -t', column 5 is PSS (KB); scale to bytes
	item=$(sudo smem -t | tail -1 | awk '{print $5}')
	((item*=1024))

	local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Record the "userspace" total reported by 'smem -w' as a JSON fragment.
function grab_user_smem() {
	# userspace
	# 'smem -w' row 5, column 3 holds the userspace figure (KB)
	item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
	((item*=1024))

	local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Record the kernel slab total as a JSON fragment.
function grab_slab() {
	# Grabbing slab total from meminfo is easier than doing the math
	# on slabinfo
	item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
	# /proc/meminfo reports KB; scale to bytes
	((item*=1024))

	local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Print the free-memory figure from 'smem -w' (row 6, column 4), in bytes.
function get_memfree() {
	mem_free=$(($(sudo smem -w | head -6 | tail -1 | awk '{print $4}') * 1024))
	echo $mem_free
}
|
||||
|
||||
# Collect system-wide memory stats (from 'free', smem and /proc/meminfo)
# into a JSON fragment and register it with the current results array.
# Sources reporting KB are scaled to bytes.
function grab_system() {

	# avail memory, from 'free'
	local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
	local avail_decr=$((base_mem_avail-avail))

	# cached memory, from 'free'
	local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')

	# free memory from smem
	local smem_free=$(get_memfree)
	# Fix: diff against the value just measured (smem_free), not the stale
	# global 'item' left over from whichever grab_* function ran last.
	local free_decr=$((base_mem_free-smem_free))

	# Anon pages
	local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
	((anon*=1024))

	# Mapped pages
	local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
	((mapped*=1024))

	# Cached
	local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
	((meminfo_cached*=1024))

	local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Take one full snapshot of user/system/kernel memory stats and close the
# current results array element.
function grab_stats() {
	# If configured, dump the caches so we get a more stable
	# view of what our static footprint really is
	# Fix: honour DUMP_CACHES=0 — the old non-empty test dumped caches
	# for any value, so the flag could never be switched off.
	if [[ -n "$DUMP_CACHES" && "$DUMP_CACHES" != "0" ]] ; then
		dump_caches
	fi

	# user space data
	# PSS taken all userspace
	grab_all_pss
	# user as reported by smem
	grab_user_smem

	# System overview data
	# System free and cached
	grab_system

	# kernel data
	# The 'total kernel space taken' we can work out as:
	# ktotal = ((free-avail)-user)
	# So, we don't grab that number from smem, as that is what it does
	# internally anyhow.
	# Still try to grab any finer kernel details that we can though

	# totals from slabinfo
	grab_slab

	metrics_json_close_array_element
}
|
||||
|
||||
# Print "1" when either memory cutoff (MIN_MEMORY_FREE floor or
# MAX_MEMORY_CONSUMED ceiling) has been crossed, "0" otherwise.
function check_limits() {
	local verdict=0
	mem_free=$(get_memfree)
	if ((mem_free <= MIN_MEMORY_FREE)); then
		verdict=1
	else
		mem_consumed=$((base_mem_avail-mem_free))
		if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
			verdict=1
		fi
	fi
	echo $verdict
}
|
||||
|
||||
# Launch NUM_CONTAINERS detached containers, PARALLELISM at a time, stopping
# early if check_limits() reports a resource cutoff was hit. Generated
# container names are appended to the global 'containers' array.
launch_containers() {
	local parloops leftovers

	(( parloops=${NUM_CONTAINERS}/${PARALLELISM} ))
	(( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) ))

	# Fix: "etras" -> "extras" in the progress message
	echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} extras"

	containers=()

	local iter n
	for iter in $(seq 1 $parloops); do
		echo "Launch iteration ${iter}"
		for n in $(seq 1 $PARALLELISM); do
			containers+=($(random_name))
			sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
		done

		# let the batch settle before measuring/limit-checking
		if [[ $PAYLOAD_SLEEP ]]; then
			sleep $PAYLOAD_SLEEP
		fi

		# check if we have hit one of our limits and need to wrap up the tests
		if (($(check_limits))); then
			echo "Ran out of resources, check_limits failed"
			return
		fi
	done

	# launch whatever remains after the full parallel batches
	for n in $(seq 1 $leftovers); do
		containers+=($(random_name))
		sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
	done
}
|
||||
|
||||
# Poll 'ctr c list' every 3s until NUM_CONTAINERS containers exist, or die
# after CTR_POLL_TIMEOUT seconds (results are saved via cleanup first).
wait_containers() {
	local t numcontainers
	# nap 3s between checks
	local step=3

	for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do

		numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l)

		if (( numcontainers >= ${NUM_CONTAINERS} )); then
			echo "All containers now launched (${t}s)"
			return
		else
			echo "Waiting for containers to launch (${numcontainers} at ${t}s)"
		fi
		sleep ${step}
	done

	echo "Timed out waiting for containers to launch (${t}s)"
	# save what we have before bailing out
	cleanup
	die "Timed out waiting for containers to launch (${t}s)"
}
|
||||
|
||||
# Run the test body: baseline stats, launch all containers, optionally wait
# for KSM to settle, then take the final stats snapshot.
function go() {
	# Init the json cycle for this save
	metrics_json_start_array

	# Grab the first set of stats before we run any containers.
	grab_stats

	launch_containers
	wait_containers

	# Quote ksm_on: it may be unset when check_for_ksm found no KSM support,
	# which would otherwise break the '[' test.
	if [ "$ksm_on" == "1" ]; then
		# Fix: "Wating" -> "Waiting"
		echo "Waiting for KSM to settle..."
		wait_ksm_settle ${KSM_WAIT_TIME}
	fi

	grab_stats

	# Wrap up the results array
	metrics_json_end_array "Results"
}
|
||||
|
||||
# Print every tunable environment variable with its current value and a
# one-line description (used by help()).
function show_vars()
{
	# Fix: "Evironment" -> "Environment"
	echo -e "\nEnvironment variables:"
	echo -e "\tName (default)"
	echo -e "\t\tDescription"
	echo -e "\tPAYLOAD (${PAYLOAD})"
	echo -e "\t\tThe ctr image to run"
	echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
	# Fix: this script drives ctr, not docker
	echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
	echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
	echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
	echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})"
	echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure"
	echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})"
	echo -e "\t\tSeconds to poll for ctr to finish launching containers"
	echo -e "\tPARALLELISM (${PARALLELISM})"
	echo -e "\t\tNumber of containers we launch in parallel"
	echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})"
	echo -e "\t\tThe total number of containers to run"
	echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
	echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
	echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
	echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
	echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
	echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
	echo -e "\tTEST_NAME (${TEST_NAME})"
	echo -e "\t\tCan be set to over-ride the default JSON results filename"

}
|
||||
|
||||
# Print usage, then the tunable variables via show_vars().
function help()
{
	usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
	echo "$usage"
	show_vars
}
|
||||
|
||||
# Entry point: parse options (-h only), then run init -> go -> cleanup.
function main() {

	local OPTIND
	while getopts "h" opt;do
		case ${opt} in
		h)
			# print usage plus variable defaults, then quit
			help
			exit 0;
			;;
		esac
	done
	shift $((OPTIND-1))

	init
	go
	cleanup
}

main "$@"
|
||||
87
tests/metrics/density/footprint_data.md
Normal file
87
tests/metrics/density/footprint_data.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# Footprint data script details
|
||||
|
||||
The `footprint_data.sh` script runs a number of identical containers sequentially
|
||||
via ctr and takes a number of memory related measurements after each
|
||||
launch. The script is generally not used in a CI type environment, but is intended
|
||||
to be run and analyzed manually.
|
||||
|
||||
You can configure the script by setting a number of environment variables.
|
||||
|
||||
The following sections list details of the configurable variables, along with a
|
||||
small example invocation script.
|
||||
|
||||
## Variables
|
||||
Environment variables can take effect in two ways.
|
||||
|
||||
Some variables affect how the payload is executed. The `RUNTIME` and `PAYLOAD`
|
||||
arguments directly affect the payload execution with the following line in
|
||||
the script:
|
||||
|
||||
`$ ctr run --memory-limit $PAYLOAD_RUNTIME_ARGS --rm --runtime=$CONTAINERD_RUNTIME $PAYLOAD $NAME sh -c $PAYLOAD_ARGS`
|
||||
|
||||
Other settings affect how memory footprint is measured and the test termination
|
||||
conditions.
|
||||
|
||||
| Variable | Function
|
||||
| -------- | --------
|
||||
| `PAYLOAD` | The ctr image to run
|
||||
| `PAYLOAD_ARGS` | Any arguments passed into the ctr image
|
||||
| `PAYLOAD_RUNTIME_ARGS` | Any extra arguments passed into the ctr `run` command
|
||||
| `PAYLOAD_SLEEP` | Seconds to sleep between launch and measurement, to allow settling
|
||||
| `MAX_NUM_CONTAINERS` | The maximum number of containers to run before terminating
|
||||
| `MAX_MEMORY_CONSUMED` | The maximum amount of memory to be consumed before terminating
|
||||
| `MIN_MEMORY_FREE` | The minimum amount of memory allowed to be free before terminating
|
||||
| `DUMP_CACHES` | A flag to note if the system caches should be dumped before capturing stats
|
||||
| `TEST_NAME` | Can be set to over-ride the default JSON results filename
|
||||
|
||||
## Output files
|
||||
The names of the JSON files generated by the test are dictated by some of the parameters
|
||||
the test is utilising. The default filename is generated in the form of:
|
||||
`footprint-${PAYLOAD}[-ksm].json`
|
||||
|
||||
## Measurements
|
||||
The test measures, calculates, and stores a number of data items:
|
||||
|
||||
| Item | Description
|
||||
| ---- | -----------
|
||||
| `uss` | USS for all the VM runtime components
|
||||
| `pss` | PSS for all the VM runtime components
|
||||
| `all_pss` | PSS of all of userspace - to monitor if we had other impact on the system
|
||||
| `user_smem` | `smem` "userspace" consumption value
|
||||
| `avail` | "available" memory from `free`
|
||||
| `avail_decr` | "available" memory decrease since start of test
|
||||
| `cached` | "Cached" memory from `/proc/meminfo`
|
||||
| `smem_free` | Free memory as reported by `smem`
|
||||
| `free_decr` | Decrease in Free memory reported by `smem` since start of test
|
||||
| `anon` | `AnonPages` as reported from `/proc/meminfo`
|
||||
| `mapped` | Mapped pages as reported from `/proc/meminfo`
|
||||
| `meminfo_cached` | Cached pages as reported from `/proc/meminfo`
|
||||
| `slab` | Slab as reported from `/proc/meminfo`
|
||||
|
||||
## Example script
|
||||
The following script is an example of how to configure the environment variables and
|
||||
invoke the test script to run a number of different container tests.
|
||||
|
||||
```
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
set -x
|
||||
|
||||
export MAX_NUM_CONTAINERS=10
|
||||
export MAX_MEMORY_CONSUMED=6*1024*1024*1024
|
||||
|
||||
function run() {
|
||||
###
|
||||
# Define what we will be running (app under test)
|
||||
# Default is we run busybox, as a 'small' workload
|
||||
export PAYLOAD="quay.io/prometheus/busybox:latest"
|
||||
export PAYLOAD_ARGS="tail -f /dev/null"
|
||||
export PAYLOAD_SLEEP=10
|
||||
export PAYLOAD_RUNTIME_ARGS="5120"
|
||||
sudo -E bash $(pwd)/density/footprint_data.sh
|
||||
}
|
||||
|
||||
export CONTAINERD_RUNTIME=io.containerd.kata.v2
|
||||
run
|
||||
```
|
||||
360
tests/metrics/density/footprint_data.sh
Executable file
360
tests/metrics/density/footprint_data.sh
Executable file
@@ -0,0 +1,360 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2017-2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# A script to gather memory 'footprint' information as we launch more
|
||||
# and more containers
|
||||
#
|
||||
# The script gathers information about both user and kernel space consumption
|
||||
# Output is into a .json file, named using some of the config component names
|
||||
# (such as footprint-busybox.json)
|
||||
|
||||
# Pull in some common, useful, items
|
||||
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
|
||||
source "${SCRIPT_PATH}/../lib/common.bash"
|
||||
|
||||
KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
|
||||
|
||||
# Note that all vars that can be set from outside the script (that is,
|
||||
# passed in the ENV), use the ':-' setting to allow being over-ridden
|
||||
|
||||
# Default sleep for 10s to let containers come up and finish their
# initialisation before we take the measures. Some of the larger
# containers can take a number of seconds to get running.
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"

### The default config - run a small busybox image
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"

###
# Define the cutoff checks for when we stop running the test
# Run up to this many containers
MAX_NUM_CONTAINERS="${MAX_NUM_CONTAINERS:-10}"
# Run until we have consumed this much memory (from MemFree)
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-6*1024*1024*1024}"
# Run until we have this much MemFree left
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"

# Tools we need to have installed in order to operate
REQUIRED_COMMANDS="smem awk"

# If we 'dump' the system caches before we measure then we get less
# noise in the results - they show more what our un-reclaimable footprint is
# (1 = dump caches before each stats grab; consumed by grab_stats())
DUMP_CACHES="${DUMP_CACHES:-1}"

# Affects the name of the file to store the results in
TEST_NAME="${TEST_NAME:-footprint-busybox}"

############# end of configurable items ###################

# vars to remember where we started so we can calc diffs
# (captured in bytes by init(); used for the *_decr results)
base_mem_avail=0
base_mem_free=0
|
||||
|
||||
# dump the kernel caches, so we get a more precise (or just different)
# view of what our footprint really is.
function dump_caches() {
	# write via tee so only the write itself runs with elevated rights
	echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
}
|
||||
|
||||
# One-time setup: restart containerd, verify required tools, pre-pull the
# payload image, adjust TEST_NAME when KSM is on, reset the environment and
# record the baseline memory figures used later for the *_decr calculations.
function init() {
	restart_containerd_service

	check_cmds $REQUIRED_COMMANDS
	sudo -E "${CTR_EXE}" image pull "$PAYLOAD"

	# Modify the test name if running with KSM enabled
	check_for_ksm

	# Use the common init func to get to a known state
	init_env

	# Prepare to start storing results
	metrics_json_init

	# Store up baseline measures
	# 'free -b' Mem: row, column 7 is "available" (bytes)
	base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
	base_mem_free=$(get_memfree)

	# Store our configuration for this run
	save_config
}
|
||||
|
||||
# Record this run's configuration as the "Config" array of the metrics JSON.
save_config(){
	metrics_json_start_array

	local json="$(cat << EOF
{
"testname": "${TEST_NAME}",
"payload": "${PAYLOAD}",
"payload_args": "${PAYLOAD_ARGS}",
"payload_sleep": ${PAYLOAD_SLEEP},
"max_containers": ${MAX_NUM_CONTAINERS},
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
"min_memory_free": "${MIN_MEMORY_FREE}",
"dump_caches": "${DUMP_CACHES}"
}
EOF
)"
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Config"
}
|
||||
|
||||
# Persist the collected metrics and tear down all test containers.
function cleanup() {
	# Finish storing the results
	metrics_json_save

	clean_env_ctr
}
|
||||
|
||||
# helper function to get USS of process in arg1
function get_proc_uss() {
	# totals row of 'smem -t', column 4 is USS (KB); scale to bytes
	item=$(($(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}') * 1024))
	echo $item
}
|
||||
|
||||
# helper function to get PSS of process in arg1
function get_proc_pss() {
	# totals row of 'smem -t', column 5 is PSS (KB); scale to bytes
	item=$(($(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}') * 1024))
	echo $item
}
|
||||
|
||||
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system, for instance
# containerd grows as we launch containers, so we should account for that in our total
# memory breakdown
function grab_all_pss() {
	# totals row of 'smem -t', column 5 is PSS (KB); scale to bytes
	item=$(sudo smem -t | tail -1 | awk '{print $5}')
	((item*=1024))

	local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Record the "userspace" total reported by 'smem -w' as a JSON fragment.
function grab_user_smem() {
	# userspace
	# 'smem -w' row 5, column 3 holds the userspace figure (KB)
	item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
	((item*=1024))

	local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Record the kernel slab total as a JSON fragment.
function grab_slab() {
	# Grabbing slab total from meminfo is easier than doing the math
	# on slabinfo
	item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
	# /proc/meminfo reports KB; scale to bytes
	((item*=1024))

	local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Print the free-memory figure from 'smem -w' (row 6, column 4), in bytes.
function get_memfree() {
	mem_free=$(($(sudo smem -w | head -6 | tail -1 | awk '{print $4}') * 1024))
	echo $mem_free
}
|
||||
|
||||
# Collect system-wide memory stats (from 'free', smem and /proc/meminfo)
# into a JSON fragment and register it with the current results array.
# Sources reporting KB are scaled to bytes.
function grab_system() {
	# avail memory, from 'free'
	local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
	local avail_decr=$((base_mem_avail-avail))

	# cached memory, from 'free'
	local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')

	# free memory from smem
	local smem_free=$(get_memfree)
	# Fix: diff against the value just measured (smem_free), not the stale
	# global 'item' left over from whichever grab_* function ran last.
	local free_decr=$((base_mem_free-smem_free))

	# Anon pages
	local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
	((anon*=1024))

	# Mapped pages
	local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
	((mapped*=1024))

	# Cached
	local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
	((meminfo_cached*=1024))

	local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "KB"
}
EOF
)"

	metrics_json_add_array_fragment "$json"
}
|
||||
|
||||
# Take one full snapshot of user/system/kernel memory stats and close the
# current results array element.
function grab_stats() {
	# If configured, dump the caches so we get a more stable
	# view of what our static footprint really is
	# Fix: honour DUMP_CACHES=0 — the old non-empty test dumped caches
	# for any value, so the flag could never be switched off.
	if [[ -n "$DUMP_CACHES" && "$DUMP_CACHES" != "0" ]] ; then
		dump_caches
	fi

	# user space data
	# PSS taken all userspace
	grab_all_pss
	# user as reported by smem
	grab_user_smem

	# System overview data
	# System free and cached
	grab_system

	# kernel data
	# The 'total kernel space taken' we can work out as:
	# ktotal = ((free-avail)-user)
	# So, we don't grab that number from smem, as that is what it does
	# internally anyhow.
	# Still try to grab any finer kernel details that we can though

	# totals from slabinfo
	grab_slab

	metrics_json_close_array_element
}
|
||||
|
||||
# Print "1" when either memory cutoff (MIN_MEMORY_FREE floor or
# MAX_MEMORY_CONSUMED ceiling) has been crossed, "0" otherwise.
function check_limits() {
	local verdict=0
	mem_free=$(get_memfree)
	if ((mem_free <= MIN_MEMORY_FREE)); then
		verdict=1
	else
		mem_consumed=$((base_mem_avail-mem_free))
		if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
			verdict=1
		fi
	fi
	echo $verdict
}
|
||||
|
||||
# Launch containers one at a time, snapshotting memory stats after each,
# and stop early once check_limits() reports a cutoff was crossed.
function go() {
	# Init the json cycle for this save
	metrics_json_start_array

	containers=()

	for i in $(seq 1 $MAX_NUM_CONTAINERS); do
		containers+=($(random_name))
		sudo -E "${CTR_EXE}" run --rm --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS

		# let the container settle before measuring
		if [[ $PAYLOAD_SLEEP ]]; then
			sleep $PAYLOAD_SLEEP
		fi

		grab_stats

		# check if we have hit one of our limits and need to wrap up the tests
		if (($(check_limits))); then
			break
		fi
	done

	# Wrap up the results array
	metrics_json_end_array "Results"
}
|
||||
|
||||
|
||||
# Print every tunable environment variable with its current value and a
# one-line description (used by help()).
function show_vars()
{
	# Fix: "Evironment" -> "Environment"
	echo -e "\nEnvironment variables:"
	echo -e "\tName (default)"
	echo -e "\t\tDescription"
	echo -e "\tPAYLOAD (${PAYLOAD})"
	echo -e "\t\tThe ctr image to run"
	echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
	echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
	echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
	echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
	echo -e "\tMAX_NUM_CONTAINERS (${MAX_NUM_CONTAINERS})"
	echo -e "\t\tThe maximum number of containers to run before terminating"
	echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
	echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
	echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
	# Fix: the old text described a ctr binary path, which belongs to a
	# different variable; this matches the MIN_MEMORY_FREE cutoff above.
	echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
	echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
	echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
	echo -e "\tTEST_NAME (${TEST_NAME})"
	echo -e "\t\tCan be set to over-ride the default JSON results filename"

}
|
||||
|
||||
# Print usage, then the tunable variables via show_vars().
function help()
{
	usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
	echo "$usage"
	show_vars
}
|
||||
|
||||
# Entry point: parse options (-h only), then run init -> go -> cleanup.
function main() {

	local OPTIND
	while getopts "h" opt;do
		case ${opt} in
		h)
			# print usage plus variable defaults, then quit
			help
			exit 0;
			;;
		esac
	done
	shift $((OPTIND-1))

	init
	go
	cleanup
}

main "$@"
|
||||
383
tests/metrics/density/memory_usage.sh
Executable file
383
tests/metrics/density/memory_usage.sh
Executable file
@@ -0,0 +1,383 @@
|
||||
#!/bin/bash
|
||||
# Copyright (c) 2017-2023 Intel Corporation
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Description of the test:
|
||||
# This test launches a number of containers in idle mode,
|
||||
# It will then sleep for a configurable period of time to allow
|
||||
# any memory optimisations to 'settle, and then checks the
|
||||
# amount of memory used by all the containers to come up with
|
||||
# an average (using the PSS measurements)
|
||||
# This test uses smem tool to get the memory used.
|
||||
|
||||
# Abort on the first command failure.
set -e

SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"

# Busybox image: Choose a small workload image, this is
# in order to measure the runtime footprint, not the workload
# footprint.
IMAGE='quay.io/prometheus/busybox:latest'

CMD='tail -f /dev/null'
# Positional parameters: container count, settle wait, optional "auto"
# KSM-settle mode (see help()).
NUM_CONTAINERS="$1"
WAIT_TIME="$2"
AUTO_MODE="$3"
TEST_NAME="memory footprint"
SMEM_BIN="smem"
KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
# Scratch files holding raw smem/process data; removed on EXIT (see trap).
MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX)
PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX)
|
||||
|
||||
# Delete the temporary data files created at startup.
function remove_tmp_file() {
	rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}"
}

# Ensure scratch files are cleaned up however the script exits.
trap remove_tmp_file EXIT
|
||||
|
||||
# Show help about this script
function help(){
cat << EOF
Usage: $0 <count> <wait_time> [auto]
Description:
<count> : Number of containers to run.
<wait_time> : Time in seconds to wait before taking
metrics.
[auto] : Optional 'auto KSM settle' mode
waits for ksm pages_shared to settle down
EOF
}
|
||||
|
||||
|
||||
# Measure the average PSS of the runc shim processes
# (delegates to get_pss_memory with the shim binary path).
function get_runc_pss_memory(){
	ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2"
	get_pss_memory "${ctr_runc_shim_path}"
}
|
||||
|
||||
# Emit the per-container raw runc PSS values (collected earlier into
# MEM_TMP_FILE by get_pss_memory) as the "Raw results" JSON array.
function get_runc_individual_memory() {
	runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g')

	# Verify runc process result
	if [ -z "${runc_process_result}" ];then
		die "Runc process not found"
	fi

	read -r -a runc_values <<< "${runc_process_result}"

	metrics_json_start_array

	local json="$(cat << EOF
{
"runc individual results": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
printf '%s\n\t\t\t' "${runc_values[i]}"
done)
]
}
EOF
)"
	# NOTE(review): NUM_CONTAINERS is a scalar used with [@] expansion here —
	# works in bash but reads oddly; verify intent before changing.
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Raw results"
}
|
||||
|
||||
# This function measures the PSS average
# memory of a process.
function get_pss_memory(){
	ps="$1"
	mem_amount=0
	count=0
	avg=0

	if [ -z "${ps}" ]; then
		die "No argument to get_pss_memory()"
	fi

	# Save all the processes names
	# This will be help us to retrieve raw information
	echo "${ps}" >> "${PS_TMP_FILE}"

	# one PSS value (KB) per matching process, space separated
	data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//')

	# Save all the smem results
	# This will help us to retrieve raw information
	echo "${data}" >> "${MEM_TMP_FILE}"

	# sum of all matching processes' PSS
	gral_data=$(echo "${data// /+}" | bc)
	# NOTE(review): "${gral_data}" is quoted, so this loop iterates exactly
	# once over the summed total (count becomes 1 and avg equals the sum) —
	# confirm whether a per-process average was intended before changing.
	for i in "${gral_data}"; do
		if (( $i > 0 ));then
			mem_amount=$(( i + mem_amount ))
			(( count++ ))
		fi
	done

	if (( "${count}" > 0 ));then
		avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
	fi

	echo "${avg}"
}
|
||||
|
||||
# Print the parent PID of the given PID, stripped of blanks; with no
# argument, 'ps -p nopid' fails and the output is empty.
function ppid() {
	local parent
	parent=$(ps -p "${1:-nopid}" -o ppid=)
	echo "${parent//[[:blank:]]/}"
}
|
||||
|
||||
# This function measures the PSS average
# memory of virtiofsd.
# It is a special case of get_pss_memory,
# virtiofsd forks itself so, smem sees the process
# two times, this function sum both pss values:
# pss_virtiofsd=pss_fork + pss_parent
function get_pss_memory_virtiofsd() {
	mem_amount=0
	count=0
	avg=0

	virtiofsd_path=${1:-}
	if [ -z "${virtiofsd_path}" ]; then
		die "virtiofsd_path not provided"
	fi

	# record the process name for later raw-data retrieval
	echo "${virtiofsd_path}" >> "${PS_TMP_FILE}"

	# NOTE(review): head -1 keeps only the first virtiofsd PID, and the
	# quoted "${virtiofsd_pids}" loop below runs once — verify multi-VM
	# behaviour is as intended.
	virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}' | head -1)
	data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss")

	for p in "${virtiofsd_pids}"; do
		parent_pid=$(ppid "${p}")
		cmd="$(cat /proc/${p}/cmdline | tr -d '\0')"
		cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')"
		# only handle the parent whose own parent is a different command
		# (i.e. skip the self-forked child here; it is added as pss_fork)
		if [ "${cmd}" != "${cmd_parent}" ]; then
			pss_parent=$(printf "%s" "${data}" | grep "\s^${p}" | awk '{print $2}')

			fork=$(pgrep -P "${p}")

			pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}')
			pss_process=$((pss_fork + pss_parent))

			# Save all the smem results
			# This will help us to retrieve raw information
			echo "${pss_process}" >>"${MEM_TMP_FILE}"

			if ((pss_process > 0)); then
				mem_amount=$((pss_process + mem_amount))
				((count++))
			fi
		fi
	done

	if (( "${count}" > 0 ));then
		avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
	fi
	echo "${avg}"
}
|
||||
|
||||
# Emit the raw per-container PSS values of the three measured VM runtime
# components (lines 1-3 of PS_TMP_FILE / MEM_TMP_FILE, written earlier by
# get_pss_memory*) as the "Raw results" JSON array.
function get_individual_memory(){
	# Getting all the individual container information
	first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
	first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g')

	second_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
	second_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==2' | sed 's/ /, /g')

	third_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
	third_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==3' | sed 's/ /, /g')

	read -r -a first_values <<< "${first_process_result}"
	read -r -a second_values <<< "${second_process_result}"
	read -r -a third_values <<< "${third_process_result}"

	metrics_json_start_array

	local json="$(cat << EOF
{
"${first_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${first_values[i]}" ] &&
printf '%s\n\t\t\t' "${first_values[i]}"
done)
],
"${second_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${second_values[i]}" ] &&
printf '%s\n\t\t\t' "${second_values[i]}"
done)
],
"${third_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${third_values[i]}" ] &&
printf '%s\n\t\t\t' "${third_values[i]}"
done)
]
}
EOF
)"
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Raw results"
}
|
||||
|
||||
# Try to work out the 'average memory footprint' of a container.
function get_memory_usage(){
	hypervisor_mem=0
	virtiofsd_mem=0
	shim_mem=0
	memory_usage=0

	containers=()

	# Launch the workload containers whose footprint we will measure.
	info "Creating ${NUM_CONTAINERS} containers"
	for ((i=1; i<="${NUM_CONTAINERS}"; i++)); do
		containers+=($(random_name))
		sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" sh -c "${CMD}"
	done

	if [ "${AUTO_MODE}" == "auto" ]; then
		# Auto mode only makes sense when KSM is active.
		if (( ksm_on != 1 )); then
			die "KSM not enabled, cannot use auto mode"
		fi

		echo "Entering KSM settle auto detect mode..."
		wait_ksm_settle "${WAIT_TIME}"
	else
		# If KSM is enabled, then you normally want to sleep long enough to
		# let it do its work and for the numbers to 'settle'.
		echo "napping ${WAIT_TIME} s"
		sleep "${WAIT_TIME}"
	fi

	metrics_json_start_array
	# Check the runtime in order to determine which process will
	# be measured about PSS
	if [ "${RUNTIME}" == "runc" ]; then
		runc_workload_mem="$(get_runc_pss_memory)"
		memory_usage="${runc_workload_mem}"

		local json="$(cat << EOF
	{
		"average": {
			"Result": ${memory_usage},
			"Units" : "KB"
		},
		"runc": {
			"Result": ${runc_workload_mem},
			"Units" : "KB"
		}
	}
EOF
)"

	else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ]
		# Get PSS memory of VM runtime components.
		# And check that the smem search has found the process - we get a "0"
		# back if that procedure fails (such as if a process has changed its name
		# or is not running when expected to be so)
		# As an added bonus - this script must be run as root.
		# Now if you do not have enough rights
		# the smem failure to read the stats will also be trapped.

		hypervisor_mem="$(get_pss_memory ${HYPERVISOR_PATH})"
		if [ "${hypervisor_mem}" == "0" ]; then
			die "Failed to find PSS for ${HYPERVISOR_PATH}"
		fi

		# virtiofsd may legitimately be absent, so only warn on failure.
		virtiofsd_mem="$(get_pss_memory_virtiofsd ${VIRTIOFSD_PATH})"
		if [ "${virtiofsd_mem}" == "0" ]; then
			echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}"
		fi
		shim_mem="$(get_pss_memory ${SHIM_PATH})"
		if [ "${shim_mem}" == "0" ]; then
			die "Failed to find PSS for ${SHIM_PATH}"
		fi

		mem_usage="$(bc -l <<< "scale=2; ${hypervisor_mem} +${virtiofsd_mem} + ${shim_mem}")"
		memory_usage="${mem_usage}"

		local json="$(cat << EOF
	{
		"average": {
			"Result": ${mem_usage},
			"Units" : "KB"
		},
		"qemus": {
			"Result": ${hypervisor_mem},
			"Units" : "KB"
		},
		"virtiofsds": {
			"Result": ${virtiofsd_mem},
			"Units" : "KB"
		},
		"shims": {
			"Result": ${shim_mem},
			"Units" : "KB"
		}
	}
EOF
)"
	fi

	metrics_json_add_array_element "$json"
	metrics_json_end_array "Results"

	clean_env_ctr
}
|
||||
|
||||
# Record the test configuration (container count, KSM state, timing,
# image and workload command) as the "Config" JSON section.
function save_config(){
	metrics_json_start_array

	local json="$(cat << EOF
	{
		"containers": "${NUM_CONTAINERS}",
		"ksm": "${ksm_on}",
		"auto": "${AUTO_MODE}",
		"waittime": "${WAIT_TIME}",
		"image": "${IMAGE}",
		"command": "${CMD}"
	}
EOF
)"
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Config"
}
|
||||
|
||||
# Entry point: validate arguments, detect KSM and the container runtime,
# then run the memory footprint measurements and save the JSON results.
function main(){
	# Verify we received an acceptable number of arguments.
	# Fix: the previous message claimed "Not enough arguments" even when
	# too many were supplied.
	if [ $# != 2 ] && [ $# != 3 ];then
		echo >&2 "error: Unexpected number of arguments [$@]"
		help
		exit 1
	fi

	#Check for KSM before reporting test name, as it can modify it
	check_for_ksm

	init_env

	check_cmds "${SMEM_BIN}" bc
	check_images "${IMAGE}"

	# Map the containerd runtime id onto the internal RUNTIME name used
	# by the measurement helpers.
	if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then
		export RUNTIME="kata-runtime"
	elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then
		export RUNTIME="runc"
	else
		die "Unknown runtime ${CTR_RUNTIME}"
	fi

	metrics_json_init
	save_config
	get_memory_usage

	# Per-process breakdown depends on the runtime in use.
	if [ "$RUNTIME" == "runc" ]; then
		get_runc_individual_memory
	elif [ "$RUNTIME" == "kata-runtime" ]; then
		get_individual_memory
	fi

	metrics_json_save
}

main "$@"
|
||||
134
tests/metrics/density/memory_usage_inside_container.sh
Executable file
134
tests/metrics/density/memory_usage_inside_container.sh
Executable file
@@ -0,0 +1,134 @@
|
||||
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description of the test:
# This test launches a busybox container and inside
# memory free, memory available and total memory
# is measured by using /proc/meminfo.

set -e

# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"

TEST_NAME="memory footprint inside container"
VERSIONS_FILE="${SCRIPT_PATH}/../../versions.yaml"
IMAGE='quay.io/prometheus/busybox:latest'
CMD="sleep 10; cat /proc/meminfo"
# We specify here in 'k', as that then matches the results we get from the meminfo,
# which makes later direct comparison easier.
MEMSIZE=${MEMSIZE:-$((2048*1024))}

# this variable determines the number of attempts when a test
# result is considered not valid (a zero value or a negative value)
MAX_FAILED_ATTEMPTS=3

# Running sums of the meminfo readings; averaged over count_iters at the end.
memtotalAvg=0
units_memtotal=""
memfreeAvg=0
units_memfree=""
memavailableAvg=0
units_memavailable=""

# count_iters: is the index of the current iteration
count_iters=0

# valid_result: if value stored is '1' the result is valid, '0' otherwise
valid_result=0
|
||||
|
||||
# Parse one run of /proc/meminfo output and fold the MemTotal / MemFree /
# MemAvailable readings into the running global sums.
#
# $1 - raw meminfo output captured from the container
# $2 - accumulated MemTotal so far (defaults to 0)
# $3 - accumulated MemFree so far (defaults to 0)
# $4 - accumulated MemAvailable so far (defaults to 0)
#
# Side effects: updates memtotalAvg / memfreeAvg / memavailableAvg and the
# units_* globals. Sets valid_result=1 on success; on an invalid reading
# (any value <= 0) sets valid_result=0 and consumes one retry credit from
# MAX_FAILED_ATTEMPTS.
parse_results() {
	local raw_results="${1}"

	# Variables used for sum cummulative values in the case of two or more reps.
	# and used to compute average results for 'json' output format.
	local memtotal_acu="${2:-0}"
	local memfree_acu="${3:-0}"
	local memavailable_acu="${4:-0}"

	local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}')
	units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}')

	local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}')
	units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}')

	local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}')
	units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}')

	# check results: if any result is zero or negative, it is considered as invalid, and the test will be repeated.
	if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then
		MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1))
		valid_result=0
		info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
		return 0
	fi

	# Accumulate valid readings on top of the previous totals.
	memtotalAvg=$((memtotal+memtotal_acu))
	memfreeAvg=$((memfree+memfree_acu))
	memavailableAvg=$((memavailable+memavailable_acu))
	valid_result=1
	info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
}
|
||||
|
||||
# Average the accumulated sums over the completed iterations, emit the
# final "Results" JSON section and persist the metrics file.
# Assumes count_iters > 0 (main only calls this after >= 1 valid iteration).
store_results_json() {
	metrics_json_start_array

	# Convert the running sums into averages (2 decimal places).
	memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc)
	memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc)
	memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc)

	local json="$(cat << EOF
	{
		"memrequest": {
			"Result" : ${MEMSIZE},
			"Units" : "Kb"
		},
		"memtotal": {
			"Result" : ${memtotalAvg},
			"Units" : "${units_memtotal}"
		},
		"memfree": {
			"Result" : ${memfreeAvg},
			"Units" : "${units_memfree}"
		},
		"memavailable": {
			"Result" : ${memavailableAvg},
			"Units" : "${units_memavailable}"
		},
		"repetitions": {
			"Result" : ${count_iters}
		}
	}
EOF
)"
	metrics_json_add_array_element "$json"
	metrics_json_end_array "Results"
	metrics_json_save
}
|
||||
|
||||
# Run the in-container meminfo measurement $1 times (default 1), retrying
# invalid readings until MAX_FAILED_ATTEMPTS is exhausted, then store the
# averaged results as JSON.
function main() {
	# switch to select output format
	local num_iterations=${1:-1}
	info "Iterations: $num_iterations"

	# Check tools/commands dependencies
	cmds=("awk" "ctr")
	init_env
	check_cmds "${cmds[@]}"
	check_images "${IMAGE}"
	metrics_json_init
	while [ $count_iters -lt $num_iterations ]; do
		# ctr takes the memory limit in bytes; MEMSIZE is in KiB.
		local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1)
		parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}"

		# quit if number of attempts exceeds the allowed value.
		[ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded."
		# Only valid runs advance the iteration counter.
		[ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1))
	done
	store_results_json
	clean_env_ctr
}

# Parameters
# @1: num_iterations {integer}
main "$@"
|
||||
@@ -9,24 +9,28 @@ set -o errexit
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
||||
kata_tarball_dir=${2:-kata-artifacts}
|
||||
kata_tarball_dir="${2:-kata-artifacts}"
|
||||
metrics_dir="$(dirname "$(readlink -f "$0")")"
|
||||
source "${metrics_dir}/../common.bash"
|
||||
source "${metrics_dir}/lib/common.bash"
|
||||
|
||||
create_symbolic_links() {
|
||||
hypervisor="${1:-qemu}"
|
||||
declare -r results_dir="${metrics_dir}/results"
|
||||
declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics"
|
||||
declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker"
|
||||
|
||||
function create_symbolic_links() {
|
||||
local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml"
|
||||
local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml"
|
||||
local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml"
|
||||
|
||||
if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then
|
||||
die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name."
|
||||
if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then
|
||||
die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name."
|
||||
fi
|
||||
|
||||
sudo ln -sf "${source_configuration_file}" "${link_configuration_file}"
|
||||
}
|
||||
|
||||
# Configures containerd
|
||||
overwrite_containerd_config() {
|
||||
function overwrite_containerd_config() {
|
||||
containerd_config="/etc/containerd/config.toml"
|
||||
sudo rm "${containerd_config}"
|
||||
sudo tee "${containerd_config}" << EOF
|
||||
@@ -44,7 +48,7 @@ version = 2
|
||||
EOF
|
||||
}
|
||||
|
||||
install_kata() {
|
||||
function install_kata() {
|
||||
local kata_tarball="kata-static.tar.xz"
|
||||
declare -r katadir="/opt/kata"
|
||||
declare -r destdir="/"
|
||||
@@ -53,7 +57,7 @@ install_kata() {
|
||||
# Removing previous kata installation
|
||||
sudo rm -rf "${katadir}"
|
||||
|
||||
pushd ${kata_tarball_dir}
|
||||
pushd "${kata_tarball_dir}"
|
||||
sudo tar -xvf "${kata_tarball}" -C "${destdir}"
|
||||
popd
|
||||
|
||||
@@ -64,17 +68,26 @@ install_kata() {
|
||||
|
||||
check_containerd_config_for_kata
|
||||
restart_containerd_service
|
||||
install_checkmetrics
|
||||
}
|
||||
|
||||
check_containerd_config_for_kata() {
|
||||
function install_checkmetrics() {
|
||||
# Ensure we have the latest checkmetrics
|
||||
pushd "${checkmetrics_dir}"
|
||||
make
|
||||
sudo make install
|
||||
popd
|
||||
}
|
||||
|
||||
function check_containerd_config_for_kata() {
|
||||
# check containerd config
|
||||
declare -r line1="default_runtime_name = \"kata\""
|
||||
declare -r line2="runtime_type = \"io.containerd.kata.v2\""
|
||||
declare -r num_lines_containerd=2
|
||||
declare -r containerd_path="/etc/containerd/config.toml"
|
||||
local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path})
|
||||
local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}")
|
||||
|
||||
if [ $count_matches = $num_lines_containerd ]; then
|
||||
if [ "${count_matches}" = "${num_lines_containerd}" ]; then
|
||||
info "containerd ok"
|
||||
else
|
||||
info "overwriting containerd configuration w/ a valid one"
|
||||
@@ -82,21 +95,62 @@ check_containerd_config_for_kata() {
|
||||
fi
|
||||
}
|
||||
|
||||
function check_metrics() {
|
||||
local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml"
|
||||
checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}"
|
||||
cm_result=$?
|
||||
if [ "${cm_result}" != 0 ]; then
|
||||
die "run-metrics-ci: checkmetrics FAILED (${cm_result})"
|
||||
fi
|
||||
}
|
||||
|
||||
function make_tarball_results() {
|
||||
compress_metrics_results_dir "${metrics_dir}/results" "${GITHUB_WORKSPACE}/results-${KATA_HYPERVISOR}.tar.gz"
|
||||
}
|
||||
|
||||
function run_test_launchtimes() {
|
||||
hypervisor="${1}"
|
||||
info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
|
||||
|
||||
info "Running Launch Time test using ${hypervisor} hypervisor"
|
||||
|
||||
create_symbolic_links "${hypervisor}"
|
||||
create_symbolic_links
|
||||
bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
|
||||
}
|
||||
|
||||
function run_test_memory_usage() {
|
||||
info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor"
|
||||
|
||||
create_symbolic_links
|
||||
bash tests/metrics/density/memory_usage.sh 20 5
|
||||
|
||||
check_metrics
|
||||
}
|
||||
|
||||
function run_test_memory_usage_inside_container() {
|
||||
info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor"
|
||||
|
||||
# ToDo: remove the exit once the metrics workflow is stable
|
||||
exit 0
|
||||
create_symbolic_links
|
||||
bash tests/metrics/density/memory_usage_inside_container.sh 5
|
||||
}
|
||||
|
||||
function run_test_blogbench() {
|
||||
info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor"
|
||||
|
||||
# ToDo: remove the exit once the metrics workflow is stable
|
||||
exit 0
|
||||
create_symbolic_links
|
||||
bash tests/metrics/storage/blogbench.sh
|
||||
}
|
||||
|
||||
function main() {
|
||||
action="${1:-}"
|
||||
case "${action}" in
|
||||
install-kata) install_kata ;;
|
||||
run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;;
|
||||
run-test-launchtimes-clh) run_test_launchtimes "clh" ;;
|
||||
make-tarball-results) make_tarball_results ;;
|
||||
run-test-launchtimes) run_test_launchtimes ;;
|
||||
run-test-memory-usage) run_test_memory_usage ;;
|
||||
run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;;
|
||||
run-test-blogbench) run_test_blogbench ;;
|
||||
*) >&2 die "Invalid argument" ;;
|
||||
esac
|
||||
}
|
||||
|
||||
@@ -47,14 +47,14 @@ quay.io/libpod"
|
||||
#
|
||||
# cmds=(“cmd1” “cmd2”)
|
||||
# check_cmds "${cmds[@]}"
|
||||
check_cmds()
|
||||
function check_cmds()
|
||||
{
|
||||
local cmd req_cmds=( "$@" )
|
||||
for cmd in "${req_cmds[@]}"; do
|
||||
if ! command -v "$cmd" > /dev/null 2>&1; then
|
||||
die "command $cmd not available"
|
||||
fi
|
||||
echo "command: $cmd: yes"
|
||||
info "command: $cmd: yes"
|
||||
done
|
||||
}
|
||||
|
||||
@@ -68,19 +68,20 @@ check_cmds()
|
||||
#
|
||||
# images=(“img1” “img2”)
|
||||
# check_imgs "${images[@]}"
|
||||
check_images()
|
||||
function check_images()
|
||||
{
|
||||
local img req_images=( "$@" )
|
||||
for img in "${req_images[@]}"; do
|
||||
echo "ctr pull'ing: $img"
|
||||
info "ctr pull'ing: $img"
|
||||
if ! sudo "${CTR_EXE}" image pull "$img"; then
|
||||
die "Failed to pull image $img"
|
||||
fi
|
||||
echo "ctr pull'd: $img"
|
||||
info "ctr pull'd: $img"
|
||||
done
|
||||
}
|
||||
|
||||
generate_build_dockerfile() {
|
||||
function generate_build_dockerfile()
|
||||
{
|
||||
local dockerfile="$1"
|
||||
local image="$2"
|
||||
local map_key="$3"
|
||||
@@ -99,14 +100,14 @@ generate_build_dockerfile() {
|
||||
# This function performs a build on the image names
|
||||
# passed in, to ensure that we have the latest changes from
|
||||
# the dockerfiles
|
||||
build_dockerfile_image()
|
||||
function build_dockerfile_image()
|
||||
{
|
||||
local image="$1"
|
||||
local dockerfile_path="$2"
|
||||
local dockerfile_dir=${2%/*}
|
||||
|
||||
if [ -f "$dockerfile_path" ]; then
|
||||
echo "docker building $image"
|
||||
info "docker building $image"
|
||||
if ! sudo "${DOCKER_EXE}" build --build-arg http_proxy="${http_proxy}" --build-arg https_proxy="${https_proxy}" --label "$image" --tag "${image}" -f "$dockerfile_path" "$dockerfile_dir"; then
|
||||
die "Failed to docker build image $image"
|
||||
fi
|
||||
@@ -119,7 +120,7 @@ build_dockerfile_image()
|
||||
|
||||
# This function removes the ctr image, builds a new one using a dockerfile
|
||||
# and imports the image from docker to ctr
|
||||
check_ctr_images()
|
||||
function check_ctr_images()
|
||||
{
|
||||
local ctr_image="$1"
|
||||
local dockerfile_path="$2"
|
||||
@@ -138,7 +139,7 @@ check_ctr_images()
|
||||
|
||||
# A one time (per uber test cycle) init that tries to get the
|
||||
# system to a 'known state' as much as possible
|
||||
metrics_onetime_init()
|
||||
function metrics_onetime_init()
|
||||
{
|
||||
# The onetime init must be called once, and only once
|
||||
if [ ! -z "$onetime_init_done" ]; then
|
||||
@@ -155,14 +156,14 @@ metrics_onetime_init()
|
||||
|
||||
# Print a banner to the logs noting clearly which test
|
||||
# we are about to run
|
||||
test_banner()
|
||||
function test_banner()
|
||||
{
|
||||
echo -e "\n===== starting test [$1] ====="
|
||||
info -e "\n===== starting test [$1] ====="
|
||||
}
|
||||
|
||||
# Initialization/verification environment. This function makes
|
||||
# minimal steps for metrics/tests execution.
|
||||
init_env()
|
||||
function init_env()
|
||||
{
|
||||
test_banner "${TEST_NAME}"
|
||||
|
||||
@@ -183,7 +184,8 @@ init_env()
|
||||
# This function checks if there are containers or
|
||||
# shim/proxy/hypervisor processes up, if found, they are
|
||||
# killed to start test with clean environment.
|
||||
kill_processes_before_start() {
|
||||
function kill_processes_before_start()
|
||||
{
|
||||
DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q)
|
||||
[[ -n "${DOCKER_PROCS}" ]] && clean_env
|
||||
|
||||
@@ -195,26 +197,29 @@ kill_processes_before_start() {
|
||||
|
||||
# Generate a random name - generally used when creating containers, but can
|
||||
# be used for any other appropriate purpose
|
||||
random_name() {
|
||||
function random_name()
|
||||
{
|
||||
mktemp -u kata-XXXXXX
|
||||
}
|
||||
|
||||
show_system_ctr_state() {
|
||||
echo "Showing system state:"
|
||||
echo " --Check containers--"
|
||||
function show_system_ctr_state()
|
||||
{
|
||||
info "Showing system state:"
|
||||
info " --Check containers--"
|
||||
sudo "${CTR_EXE}" c list
|
||||
echo " --Check tasks--"
|
||||
info " --Check tasks--"
|
||||
sudo "${CTR_EXE}" task list
|
||||
|
||||
local processes="containerd-shim-kata-v2"
|
||||
|
||||
for p in ${processes}; do
|
||||
echo " --pgrep ${p}--"
|
||||
info " --pgrep ${p}--"
|
||||
pgrep -a ${p}
|
||||
done
|
||||
}
|
||||
|
||||
common_init(){
|
||||
function common_init()
|
||||
{
|
||||
if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then
|
||||
extract_kata_env
|
||||
else
|
||||
@@ -225,17 +230,18 @@ common_init(){
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
# Save the current KSM settings so we can restore them later
|
||||
save_ksm_settings(){
|
||||
echo "saving KSM settings"
|
||||
function save_ksm_settings()
|
||||
{
|
||||
info "saving KSM settings"
|
||||
ksm_stored_run=$(cat ${KSM_ENABLE_FILE})
|
||||
ksm_stored_pages=$(cat ${KSM_ENABLE_FILE})
|
||||
ksm_stored_sleep=$(cat ${KSM_ENABLE_FILE})
|
||||
}
|
||||
|
||||
set_ksm_aggressive(){
|
||||
echo "setting KSM to aggressive mode"
|
||||
function set_ksm_aggressive()
|
||||
{
|
||||
info "setting KSM to aggressive mode"
|
||||
# Flip the run off/on to ensure a restart/rescan
|
||||
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
|
||||
sudo bash -c "echo ${KSM_AGGRESIVE_PAGES} > ${KSM_PAGES_FILE}"
|
||||
@@ -245,7 +251,7 @@ set_ksm_aggressive(){
|
||||
if [ "${KATA_HYPERVISOR}" == "qemu" ]; then
|
||||
# Disable virtio-fs and save whether it was enabled previously
|
||||
set_virtio_out=$(sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-9p)
|
||||
echo "${set_virtio_out}"
|
||||
info "${set_virtio_out}"
|
||||
grep -q "already" <<< "${set_virtio_out}" || was_virtio_fs=true;
|
||||
fi
|
||||
}
|
||||
@@ -256,8 +262,9 @@ restore_virtio_fs(){
|
||||
info "Not restoring virtio-fs since it wasn't enabled previously"
|
||||
}
|
||||
|
||||
restore_ksm_settings(){
|
||||
echo "restoring KSM settings"
|
||||
function restore_ksm_settings()
|
||||
{
|
||||
info "restoring KSM settings"
|
||||
# First turn off the run to ensure if we are then re-enabling
|
||||
# that any changes take effect
|
||||
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
|
||||
@@ -267,15 +274,17 @@ restore_ksm_settings(){
|
||||
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
|
||||
}
|
||||
|
||||
disable_ksm(){
|
||||
echo "disabling KSM"
|
||||
function disable_ksm()
|
||||
{
|
||||
info "disabling KSM"
|
||||
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
|
||||
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
|
||||
}
|
||||
|
||||
# See if KSM is enabled.
|
||||
# If so, amend the test name to reflect that
|
||||
check_for_ksm(){
|
||||
function check_for_ksm()
|
||||
{
|
||||
if [ ! -f ${KSM_ENABLE_FILE} ]; then
|
||||
return
|
||||
fi
|
||||
@@ -294,7 +303,8 @@ check_for_ksm(){
|
||||
# a full scan has managed to do few new merges)
|
||||
#
|
||||
# arg1 - timeout in seconds
|
||||
wait_ksm_settle(){
|
||||
function wait_ksm_settle()
|
||||
{
|
||||
[[ "$RUNTIME" == "runc" ]] || [[ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]] && return
|
||||
local t pcnt
|
||||
local oldscan=-1 newscan
|
||||
@@ -305,7 +315,7 @@ wait_ksm_settle(){
|
||||
# Wait some time for KSM to kick in to avoid early dismissal
|
||||
for ((t=0; t<5; t++)); do
|
||||
pages=$(cat "${KSM_PAGES_SHARED}")
|
||||
[[ "$pages" -ne 0 ]] && echo "Discovered KSM activity" && break
|
||||
[[ "$pages" -ne 0 ]] && info "Discovered KSM activity" && break
|
||||
sleep 1
|
||||
done
|
||||
|
||||
@@ -315,13 +325,13 @@ wait_ksm_settle(){
|
||||
|
||||
newscan=$(cat /sys/kernel/mm/ksm/full_scans)
|
||||
newpages=$(cat "${KSM_PAGES_SHARED}")
|
||||
[[ "$newpages" -eq 0 ]] && echo "No need to wait for KSM to settle" && return
|
||||
[[ "$newpages" -eq 0 ]] && info "No need to wait for KSM to settle" && return
|
||||
|
||||
if (( newscan != oldscan )); then
|
||||
echo -e "\nnew full_scan ($oldscan to $newscan)"
|
||||
info -e "\nnew full_scan ($oldscan to $newscan)"
|
||||
|
||||
# Do we have a previous scan to compare with
|
||||
echo "check pages $oldpages to $newpages"
|
||||
info "check pages $oldpages to $newpages"
|
||||
|
||||
if (( oldpages != -1 )); then
|
||||
# avoid divide by zero problems
|
||||
@@ -330,14 +340,14 @@ wait_ksm_settle(){
|
||||
# abs()
|
||||
pcnt=$(( $pcnt * -1 ))
|
||||
|
||||
echo "$oldpages to $newpages is ${pcnt}%"
|
||||
info "$oldpages to $newpages is ${pcnt}%"
|
||||
|
||||
if (( $pcnt <= 5 )); then
|
||||
echo "KSM stabilised at ${t}s"
|
||||
info "KSM stabilised at ${t}s"
|
||||
return
|
||||
fi
|
||||
else
|
||||
echo "$oldpages KSM pages... waiting"
|
||||
info "$oldpages KSM pages... waiting"
|
||||
fi
|
||||
fi
|
||||
oldscan=$newscan
|
||||
@@ -347,7 +357,7 @@ wait_ksm_settle(){
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "Timed out after ${1}s waiting for KSM to settle"
|
||||
info "Timed out after ${1}s waiting for KSM to settle"
|
||||
}
|
||||
|
||||
common_init
|
||||
|
||||
124
tests/metrics/storage/blogbench.sh
Executable file
124
tests/metrics/storage/blogbench.sh
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
#
# Copyright (c) 2018-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0

# Description of the test:
# This test runs the 'blogbench', and extracts the 'scores' for reads
# and writes
# Note - the scores are *not* normalised for the number of iterations run,
# they are total scores for all iterations (this is the blogbench default output)

set -e

# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"

TEST_NAME="blogbench"
IMAGE="docker.io/library/local-blogbench:latest"
DOCKERFILE="${SCRIPT_PATH}/blogbench_dockerfile/Dockerfile"

# Number of iterations for blogbench to run - note, results are not
# scaled to iterations - more iterations results in bigger results
ITERATIONS="${ITERATIONS:-30}"

# Directory to run the test on
# This is run inside of the container
TESTDIR="${TESTDIR:-/tmp}"
CMD="blogbench -i ${ITERATIONS} -d ${TESTDIR}"
|
||||
|
||||
# Run blogbench inside a container, then parse its stdout into two JSON
# sections: "Config" (run parameters echoed by blogbench) and "Results"
# (final scores plus the per-iteration table).
function main() {
	# Check tools/commands dependencies
	cmds=("awk" "docker")

	init_env
	check_cmds "${cmds[@]}"
	check_ctr_images "${IMAGE}" "${DOCKERFILE}"
	metrics_json_init

	local output=$(sudo -E ${CTR_EXE} run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test ${CMD})

	# Save configuration
	metrics_json_start_array

	# Pull the run parameters back out of the blogbench banner.
	local frequency=$(echo "${output}" | grep "Frequency" | cut -d "=" -f2 | cut -d ' ' -f2)
	local iterations=$(echo "${output}" | grep -w "iterations" | cut -d ' ' -f3)
	local spawing_writers=$(echo "${output}" | grep -w "writers" | cut -d ' ' -f2)
	local spawing_rewriters=$(echo "${output}" | grep -w "rewriters" | cut -d ' ' -f2)
	local spawing_commenters=$(echo "${output}" | grep -w "commenters" | cut -d ' ' -f2)
	local spawing_readers=$(echo "${output}" | grep -w "readers" | cut -d ' ' -f2)

	local json="$(cat << EOF
	{
		"Frequency" : ${frequency},
		"Iterations" : ${iterations},
		"Number of spawing writers" : ${spawing_writers},
		"Number of spawing rewriters" : ${spawing_rewriters},
		"Number of spawing commenters" : ${spawing_commenters},
		"Number of spawing readers" : ${spawing_readers}
	}
EOF
)"
	metrics_json_add_array_element "${json}"
	metrics_json_end_array "Config"

	# Save results
	metrics_json_start_array

	# Final scores live on the last two lines of the output.
	local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}')
	local reads=$(tail -1 <<< "${output}" | awk '{print $6}')

	# Obtaining other Blogbench results
	# (the per-iteration table, with the banner and summary lines stripped;
	# each column is joined into one comma-separated string).
	local -r data=$(echo "${output}" | tail -n +12 | head -n -3)
	local nb_blogs=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $1} ' | tr '\t' ',' | sed '$ s/.$//')
	local r_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $2} ' | tr '\t' ',' | sed '$ s/.$//')
	local w_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $3} ' | tr '\t' ',' | sed '$ s/.$//')
	local r_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $4} ' | tr '\t' ',' | sed '$ s/.$//')
	local w_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $5} ' | tr '\t' ',' | sed '$ s/.$//')
	local r_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $6} ' | tr '\t' ',' | sed '$ s/.$//')
	local w_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $7} ' | tr '\t' ',' | sed '$ s/.$//')

	local json="$(cat << EOF
	{
		"write": {
			"Result" : "${writes}",
			"Units" : "items"
		},
		"read": {
			"Result" : "${reads}",
			"Units" : "items"
		},
		"Nb blogs": {
			"Result" : "${nb_blogs}"
		},
		"R articles": {
			"Result" : "${r_articles}"
		},
		"W articles": {
			"Result" : "${w_articles}"
		},
		"R pictures": {
			"Result" : "${r_pictures}"
		},
		"W pictures": {
			"Result" : "${w_pictures}"
		},
		"R comments": {
			"Result" : "${r_comments}"
		},
		"W comments": {
			"Result" : "${w_comments}"
		}
	}
EOF
)"

	metrics_json_add_array_element "${json}"
	metrics_json_end_array "Results"
	metrics_json_save
	clean_env_ctr
}

main "$@"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user