Merge pull request #7300 from stevenhorsman/CCv0-merge-10th-july

CCv0: Merge main into CCv0 branch
Wainer Moschetta
2023-07-18 09:42:43 -03:00
committed by GitHub
113 changed files with 4281 additions and 1116 deletions


@@ -13,14 +13,15 @@ on:
required: false
type: string
default: no
commit-hash:
required: false
type: string
jobs:
build-asset:
runs-on: ubuntu-latest
strategy:
matrix:
stage:
- ${{ inputs.stage }}
asset:
- cloud-hypervisor
- cloud-hypervisor-glibc
@@ -46,9 +47,11 @@ jobs:
- shim-v2
- tdvf
- virtiofsd
stage:
- ${{ inputs.stage }}
exclude:
- stage: release
asset: cloud-hypervisor-glibc
- asset: cloud-hypervisor-glibc
stage: release
steps:
- name: Login to Kata Containers quay.io
if: ${{ inputs.push-to-registry == 'yes' }}
@@ -60,7 +63,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
@@ -88,7 +91,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:


@@ -9,6 +9,9 @@ on:
required: false
type: string
default: no
commit-hash:
required: false
type: string
jobs:
build-asset:
@@ -41,7 +44,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
run: |
@@ -72,7 +75,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:


@@ -9,6 +9,9 @@ on:
required: false
type: string
default: no
commit-hash:
required: false
type: string
jobs:
build-asset:
@@ -37,7 +40,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
fetch-depth: 0 # This is needed in order to keep the commit ids history
- name: Build ${{ matrix.asset }}
run: |
@@ -69,7 +72,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-artifacts
uses: actions/download-artifact@v3
with:

.github/workflows/ci-nightly.yaml (new file, 14 lines)

@@ -0,0 +1,14 @@
name: Kata Containers Nightly CI
on:
schedule:
- cron: '0 0 * * *'
workflow_dispatch:
jobs:
kata-containers-ci-on-push:
uses: ./.github/workflows/ci.yaml
with:
commit-hash: ${{ github.sha }}
pr-number: "nightly"
tag: ${{ github.sha }}-nightly
secrets: inherit


@@ -12,65 +12,14 @@ on:
- synchronize
- reopened
- labeled
paths-ignore:
- 'docs/**'
jobs:
build-kata-static-tarball-amd64:
kata-containers-ci-on-push:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
uses: ./.github/workflows/ci.yaml
with:
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
publish-kata-deploy-payload-amd64:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
commit-hash: ${{ github.event.pull_request.head.sha }}
pr-number: ${{ github.event.pull_request.number }}
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}
secrets: inherit
run-k8s-tests-on-aks:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
secrets: inherit
run-k8s-tests-on-sev:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-k8s-tests-on-snp:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-k8s-tests-on-tdx:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-metrics-tests:
if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }}
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-metrics.yaml
with:
tarball-suffix: -${{ github.event.pull_request.number}}-${{ github.event.pull_request.head.sha }}

.github/workflows/ci.yaml (new file, 76 lines)

@@ -0,0 +1,76 @@
name: Run the Kata Containers CI
on:
workflow_call:
inputs:
commit-hash:
required: true
type: string
pr-number:
required: true
type: string
tag:
required: true
type: string
jobs:
build-kata-static-tarball-amd64:
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
with:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}
publish-kata-deploy-payload-amd64:
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
tarball-suffix: -${{ inputs.tag }}
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
secrets: inherit
run-k8s-tests-on-aks:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-aks.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
pr-number: ${{ inputs.pr-number }}
secrets: inherit
run-k8s-tests-on-sev:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-sev.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
run-k8s-tests-on-snp:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-snp.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
run-k8s-tests-on-tdx:
needs: publish-kata-deploy-payload-amd64
uses: ./.github/workflows/run-k8s-tests-on-tdx.yaml
with:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ inputs.tag }}-amd64
commit-hash: ${{ inputs.commit-hash }}
run-metrics-tests:
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-metrics.yaml
with:
tarball-suffix: -${{ inputs.tag }}
commit-hash: ${{ inputs.commit-hash }}


@@ -9,18 +9,21 @@ jobs:
build-assets-amd64:
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
with:
commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
build-assets-arm64:
uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml
with:
commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
build-assets-s390x:
uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml
with:
commit-hash: ${{ github.sha }}
push-to-registry: yes
secrets: inherit
@@ -28,6 +31,7 @@ jobs:
needs: build-assets-amd64
uses: ./.github/workflows/publish-kata-deploy-payload-amd64.yaml
with:
commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-amd64
@@ -37,6 +41,7 @@ jobs:
needs: build-assets-arm64
uses: ./.github/workflows/publish-kata-deploy-payload-arm64.yaml
with:
commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-arm64
@@ -46,6 +51,7 @@ jobs:
needs: build-assets-s390x
uses: ./.github/workflows/publish-kata-deploy-payload-s390x.yaml
with:
commit-hash: ${{ github.sha }}
registry: quay.io
repo: kata-containers/kata-deploy-ci
tag: kata-containers-s390x


@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
kata-payload:
@@ -21,7 +24,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3


@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
kata-payload:
@@ -25,7 +28,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3


@@ -14,6 +14,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
kata-payload:
@@ -25,7 +28,7 @@ jobs:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3


@@ -72,8 +72,7 @@ jobs:
- uses: actions/checkout@v3
- name: install hub
run: |
HUB_VER=$(curl -s "https://api.github.com/repos/github/hub/releases/latest" | jq -r .tag_name | sed 's/^v//')
wget -q -O- https://github.com/github/hub/releases/download/v$HUB_VER/hub-linux-amd64-$HUB_VER.tgz | \
wget -q -O- https://github.com/mislav/hub/releases/download/v2.14.2/hub-linux-amd64-2.14.2.tgz | \
tar xz --strip-components=2 --wildcards '*/bin/hub' && sudo mv hub /usr/local/bin/hub
- name: download-artifacts-amd64


@@ -11,6 +11,12 @@ on:
tag:
required: true
type: string
pr-number:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -31,13 +37,13 @@ jobs:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ github.event.pull_request.number }}
GH_PR_NUMBER: ${{ inputs.pr-number }}
KATA_HOST_OS: ${{ matrix.host_os }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Download Azure CLI
run: bash tests/integration/gha-run.sh install-azure-cli


@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30


@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30


@@ -11,6 +11,9 @@ on:
tag:
required: true
type: string
commit-hash:
required: false
type: string
jobs:
run-k8s-tests:
@@ -29,7 +32,7 @@ jobs:
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: Run tests
timeout-minutes: 30


@@ -5,16 +5,25 @@ on:
tarball-suffix:
required: false
type: string
commit-hash:
required: false
type: string
jobs:
run-metrics:
strategy:
fail-fast: true
matrix:
vmm: ['clh', 'qemu']
max-parallel: 1
runs-on: metrics
env:
GOPATH: ${{ github.workspace }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
ref: ${{ inputs.commit-hash }}
- name: get-kata-tarball
uses: actions/download-artifact@v3
@@ -25,8 +34,25 @@ jobs:
- name: Install kata
run: bash tests/metrics/gha-run.sh install-kata kata-artifacts
- name: run launch times on qemu
run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu
- name: run launch times test
run: bash tests/metrics/gha-run.sh run-test-launchtimes
- name: run launch times on clh
run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh
- name: run memory foot print test
run: bash tests/metrics/gha-run.sh run-test-memory-usage
- name: run memory usage inside container test
run: bash tests/metrics/gha-run.sh run-test-memory-usage-inside-container
- name: run blogbench test
run: bash tests/metrics/gha-run.sh run-test-blogbench
- name: make metrics tarball ${{ matrix.vmm }}
run: bash tests/metrics/gha-run.sh make-tarball-results
- name: archive metrics results ${{ matrix.vmm }}
uses: actions/upload-artifact@v3
with:
name: metrics-artifacts-${{ matrix.vmm }}
path: results-${{ matrix.vmm }}.tar.gz
retention-days: 1
if-no-files-found: error


@@ -23,7 +23,7 @@ jobs:
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |
./ci/install_rust.sh
PATH=$PATH:"$HOME/.cargo/bin"
echo PATH="$HOME/.cargo/bin:$PATH" >> $GITHUB_ENV
- name: Run Unit Test
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |


@@ -1,6 +1,6 @@
<img src="https://object-storage-ca-ymq-1.vexxhost.net/swift/v1/6e4619c416ff4bd19e1c087f27a43eea/www-images-prod/openstack-logo/kata/SVG/kata-1.svg" width="900">
[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml)
[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml) [![Kata Containers Nightly CI](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml)
# Kata Containers


@@ -17,6 +17,7 @@ Kata Containers design documents:
- [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
- [Design for direct-assigned volume](direct-blk-device-assignment.md)
- [Design for core-scheduling](core-scheduling.md)
- [Virtualization Reference Architecture](kata-vra.md)
---
- [Design proposals](proposals)

docs/design/kata-vra.md (new file, 434 lines)

@@ -0,0 +1,434 @@
# Virtualization Reference Architecture
## Subject to Change | © 2022 by NVIDIA Corporation. All rights reserved. | _For test and development only_
Before digging deeper into the virtualization reference architecture, let's
first look at the various GPUDirect use cases in the following table. We're
distinguishing between two top-tier use cases where the devices are (1)
passthrough and (2) virtualized, where a VM gets assigned a virtual function
(VF) and not the physical function (PF). A combination of PF and VF would also
be possible.
| Device #1  (passthrough) | Device #2 (passthrough) | P2P Compatibility and Mode |
| ------------------------- | ----------------------- | -------------------------------------------- |
| GPU PF | GPU PF | GPUDirect P2P  |
| GPU PF | NIC PF | GPUDirect RDMA |
| MIG-slice | MIG-slice | _No GPUDirect P2P_ |
| MIG-slice | NIC PF | GPUDirect RDMA |
| **Device #1 (virtualized)** | **Device #2 (virtualized)** | **P2P Compatibility and Mode** |
| Time-slice vGPU VF | Time-slice vGPU VF | _No GPUDirect P2P  but NVLINK P2P available_ |
| Time-slice vGPU VF | NIC VF | GPUDirect RDMA |
| MIG-slice vGPU | MIG-slice vGPU | _No GPUDirect P2P_ |
| MIG-slice vGPU | NIC VF | GPUDirect RDMA |
In a virtualized environment we have several distinct features that may prevent
Peer-to-peer (P2P) communication of two endpoints in a PCI Express topology. The
IOMMU translates IO virtual addresses (IOVA) to physical addresses (PA). Each
device behind an IOMMU has its own IOVA memory space, usually, no two devices
share the same IOVA memory space, but it is up to the hypervisor or OS how it
chooses to map devices to IOVA spaces.  Any PCI Express DMA transactions will
use IOVAs, which the IOMMU must translate. By default, all the traffic is routed
to the root complex and not issued directly to the peer device.
An IOMMU can be used to isolate and protect devices even if virtualization is
not used; since devices can only access memory regions that are mapped for it, a
DMA from one device to another is not possible. DPDK uses the IOMMU to have
better isolation between devices, another benefit is that IOVA space can be
represented as a contiguous memory even if the PA space is heavily scattered.
In the case of virtualization, the IOMMU is responsible for isolating the device
and memory between VMs for safe device assignment without compromising the host
and other guest OSes. Without an IOMMU, any device can access the entire system
and perform DMA transactions _anywhere_.
The second feature is ACS (Access Control Services), which controls which
devices are allowed to communicate with one another and thus avoids improper
routing of packets irrespective of whether the IOMMU is enabled or not.
When IOMMU is enabled, ACS is normally configured to force all PCI Express DMA
to go through the root complex so IOMMU can translate it, impacting performance
between peers with higher latency and reduced bandwidth.
A way to avoid the performance hit is to enable Address Translation Services
(ATS). ATS-capable endpoints can prefetch IOVA -> PA translations from the IOMMU
and then perform DMA transactions directly to another endpoint. Hypervisors
enable this by enabling ATS in such endpoints, configuring ACS to enable Direct
Translated P2P, and configuring the IOMMU to allow Address Translation requests.
Another important factor is that the NVIDIA driver stack will use the PCI
Express topology of the system it is running on to determine whether the
hardware is capable of supporting P2P. The driver stack qualifies specific
chipsets, and PCI Express switches for use with GPUDirect P2P. In virtual
environments, the PCI Express topology is flattened and obfuscated to present a
uniform environment to the software inside the VM, which breaks the GPUDirect
P2P use case.
On a bare metal machine, the driver stack groups GPUs into cliques that can
perform GPUDirect P2P communication, excluding peer mappings where P2P
communication is not possible, most prominently if GPUs are attached to multiple
CPU sockets.
CPUs and local memory banks are referred to as NUMA nodes. In a two-socket
server, each of the CPUs has a local memory bank for a total of two NUMA nodes.
Some servers provide the ability to configure additional NUMA nodes per CPU,
which means a CPU socket can have two NUMA nodes  (some servers support four
NUMA nodes per socket) with local memory banks and L3 NUMA domains for improved
performance.
One of the current solutions is that the hypervisor provides additional topology
information that the driver stack can pick up and enable GPUDirect P2P between
GPUs, even if the virtualized environment does not directly expose it. The PCI
Express virtual P2P approval capability structure in the PCI configuration space
is entirely emulated by the hypervisor of passthrough GPU devices.
A clique ID is provided where GPUs with the same clique ID belong to a group of
GPUs capable of P2P communication.
On vSphere, Azure, and other CSPs, the hypervisor lays down a `topologies.xml`
which NCCL can pick up to deduce the right P2P level[^1]. NCCL leverages
InfiniBand (IB) and/or Unified Communication X (UCX) for communication, and
GPUDirect P2P and GPUDirect RDMA should just work in this case. The only caveat
is that software or applications that do not use the XML file to deduce the
topology will fail to enable GPUDirect ([`nccl-p2p-level`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-p2p-level)).
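For software that does not consume the hypervisor-provided `topologies.xml`, the
P2P level can also be pinned explicitly through that NCCL environment variable.
The following is only a minimal sketch of a Pod spec doing so; the runtime class
name and container image are illustrative placeholders, not something defined in
this document.

```yaml
# Sketch only: pin NCCL's P2P level for a workload that does not read the
# hypervisor-provided topologies.xml. Runtime class and image are placeholders.
apiVersion: v1
kind: Pod
metadata:
  name: nccl-p2p-example
spec:
  runtimeClassName: kata            # illustrative Kata runtime class
  containers:
  - name: workload
    image: example.com/nccl-workload:latest   # placeholder image
    env:
    - name: NCCL_P2P_LEVEL          # NCCL environment variable linked above
      value: "NVL"                  # allow P2P across NVLink; see the NCCL docs for other levels
    resources:
      limits:
        nvidia.com/gpu: 2           # assumes an NVIDIA device plugin is deployed
```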
## Hypervisor PCI Express Topology
To enable every part of the accelerator stack, we propose a virtualized
reference architecture to enable GPUDirect P2P and GPUDirect RDMA for any
hypervisor. The idea is split into two parts to enable the right PCI Express
topology. The first part builds upon extending the PCI Express virtual P2P
approval capability structure to every device that wants to do P2P in some way
and groups devices by clique ID. The other part involves replicating a subset of
the host topology so that applications running in the VM do not need to read
additional information and enable the P2P capability like in the bare-metal use
case described above. The driver stack can then deduce automatically if the
topology presented in the VM is capable of P2P communication.
We will work with the following host topology in the sections below. It is
a system with two converged DPUs, each having an `A100X` GPU and two `ConnectX-6`
network ports connected to the downstream ports of a PCI Express switch.
```sh
+-00.0-[d8-df]----00.0-[d9-df]--+-00.0-[da-db]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
\-01.0-[dc-df]----00.0-[dd-df]----08.0-[de-df]----00.0 NVIDIA Corporation GA100 [A100X]
+-00.0-[3b-42]----00.0-[3c-42]--+-00.0-[3d-3e]--+-00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| +-00.1 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx network
| \-00.2 Mellanox Tech MT42822 BlueField-2 SoC Management Interface
\-01.0-[3f-42]----00.0-[40-42]----08.0-[41-42]----00.0 NVIDIA Corporation GA100 [A100X]
```
The green path highlighted above is the optimal and preferred path for
efficient P2P communication.
## PCI Express Virtual P2P Approval Capability
Most of the time, the PCI Express topology is flattened and obfuscated to ensure
easy migration of the VM image between different physical hardware topologies.
In Kata, we can configure the hypervisor to use PCI Express root ports to
hotplug the VFIO devices one is passing through. A user can select how many PCI
Express root ports to allocate depending on how many devices are passed through.
A recent addition to Kata detects the number of PCI Express devices
that need hotplugging and bails out if the number of root ports is insufficient.
Kata does not automatically increase the number of root ports; we want the
user to be in full control of the topology.
```toml
# /etc/kata-containers/configuration.toml
# VFIO devices are hotplugged on a bridge by default.
# Enable hot-plugging on the root bus. This may be required for devices with
# a large PCI bar, as this is a current limitation with hot-plugging on
# a bridge.
# Default "bridge-port"
hotplug_vfio = "root-port"
# Before hot plugging a PCIe device, you need to add a pcie_root_port device.
# Use this parameter when using some large PCI bar devices, such as NVIDIA GPU
# The value means the number of pcie_root_port
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
# Default 0
pcie_root_port = 8
```
VFIO devices are hotplugged on a PCIe-PCI bridge by default. Hotplug of PCI
Express devices is only supported on PCI Express root or downstream ports. With
this configuration set, if we start up a Kata container, we can inspect our
topology and see the allocated PCI Express root ports and the hotplugged
devices.
```sh
$ lspci -tv
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+-01.0 Red Hat, Inc. Virtio console
+-02.0 Red Hat, Inc. Virtio SCSI
+-03.0 Red Hat, Inc. Virtio RNG
+-04.0-[01]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+-05.0-[02]----00.0 Mellanox Technologies MT42822 BlueField-2 integrated ConnectX-6
+-06.0-[03]----00.0 NVIDIA Corporation Device 20b8
+-07.0-[04]----00.0 NVIDIA Corporation Device 20b8
+-08.0-[05]--
+-09.0-[06]--
+-0a.0-[07]--
+-0b.0-[08]--
+-0c.0 Red Hat, Inc. Virtio socket
+-0d.0 Red Hat, Inc. Virtio file system
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
For devices with huge BARs (Base Address Registers) like the GPU (we need to
configure the PCI Express root port properly and allocate enough memory for
mapping), we have added a heuristic to Kata to deduce the right settings. Hence,
the BARs can be mapped correctly. This functionality is added to
[`go-nvlib`](https://gitlab.com/nvidia/cloud-native/go-nvlib), which is part
of Kata now.
```sh
$ sudo dmesg | grep BAR
[ 0.179960] pci 0000:00:04.0: BAR 7: assigned [io 0x1000-0x1fff]
[ 0.179962] pci 0000:00:05.0: BAR 7: assigned [io 0x2000-0x2fff]
[ 0.179963] pci 0000:00:06.0: BAR 7: assigned [io 0x3000-0x3fff]
[ 0.179964] pci 0000:00:07.0: BAR 7: assigned [io 0x4000-0x4fff]
[ 0.179966] pci 0000:00:08.0: BAR 7: assigned [io 0x5000-0x5fff]
[ 0.179967] pci 0000:00:09.0: BAR 7: assigned [io 0x6000-0x6fff]
[ 0.179968] pci 0000:00:0a.0: BAR 7: assigned [io 0x7000-0x7fff]
[ 0.179969] pci 0000:00:0b.0: BAR 7: assigned [io 0x8000-0x8fff]
[ 2.115912] pci 0000:01:00.0: BAR 0: assigned [mem 0x13000000000-0x13001ffffff 64bit pref]
[ 2.116203] pci 0000:01:00.0: BAR 2: assigned [mem 0x13002000000-0x130027fffff 64bit pref]
[ 2.683132] pci 0000:02:00.0: BAR 0: assigned [mem 0x12000000000-0x12001ffffff 64bit pref]
[ 2.683419] pci 0000:02:00.0: BAR 2: assigned [mem 0x12002000000-0x120027fffff 64bit pref]
[ 2.959155] pci 0000:03:00.0: BAR 1: assigned [mem 0x11000000000-0x117ffffffff 64bit pref]
[ 2.959345] pci 0000:03:00.0: BAR 3: assigned [mem 0x11800000000-0x11801ffffff 64bit pref]
[ 2.959523] pci 0000:03:00.0: BAR 0: assigned [mem 0xf9000000-0xf9ffffff]
[ 2.966119] pci 0000:04:00.0: BAR 1: assigned [mem 0x10000000000-0x107ffffffff 64bit pref]
[ 2.966295] pci 0000:04:00.0: BAR 3: assigned [mem 0x10800000000-0x10801ffffff 64bit pref]
[ 2.966472] pci 0000:04:00.0: BAR 0: assigned [mem 0xf7000000-0xf7ffffff]
```
The NVIDIA driver stack in this case would refuse to do P2P communication since
(1) the topology is not what it expects and (2) we do not have a qualified
chipset. Since our P2P devices are not connected to a PCI Express switch port,
we need to provide additional information to support the P2P functionality. One
way of providing such meta information would be to annotate the container; most
of the settings in Kata's configuration file can be overridden via annotations,
but this limits the flexibility, and a user would need to update all the
containers they want to run with Kata. The goal is to make such things as
transparent as possible, so we also introduced
[CDI](https://github.com/container-orchestrated-devices/container-device-interface)
(Container Device Interface) to Kata. CDI is a
[specification](https://github.com/container-orchestrated-devices/container-device-interface/blob/master/SPEC.md)
for container runtimes to support third-party devices.
As written before, we can provide a clique ID for the devices that belong
together and are capable of doing P2P. This information is provided to the
hypervisor, which will set up things in the VM accordingly. Let's suppose the
user wanted to do GPUDirect RDMA with the first GPU and the NIC that reside on
the same DPU; one could provide the specification telling the hypervisor that
they belong to the same clique.
```yaml
# /etc/cdi/nvidia.yaml
cdiVersion: 0.4.0
kind: nvidia.com/gpu
devices:
- name: gpu0
  annotations:
    bdf: "41:00.0"
    clique-id: "0"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/71"

# /etc/cdi/mellanox.yaml
cdiVersion: 0.4.0
kind: mellanox.com/nic
devices:
- name: nic0
  annotations:
    bdf: "3d:00.0"
    clique-id: "0"
    attach-pci: "true"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/66"
```
Since this setting is bound to the device and not the container, we do not need
to alter the container; we just allocate the right resource and GPUDirect RDMA is
set up correctly. Rather than exposing them separately, an idea would be to
expose a GPUDirect RDMA device via NFD (Node Feature Discovery) that combines
both of them; this way, we could make sure that the right pair is allocated and
used, as sketched below; more on the Kubernetes deployment in the next section.
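One way to picture such a combined allocation unit is a single CDI device that
injects both VFIO nodes of a clique at once. This is only a sketch in the CDI
format used above; the `kind`, device name, and paths are illustrative and reuse
the example values from the specifications shown earlier.

```yaml
# Sketch only: a combined "GPUDirect RDMA" CDI device that injects both the
# GPU and the NIC VFIO nodes of clique 0. Kind, names, and paths are
# illustrative, reusing the gpu0/nic0 examples above.
cdiVersion: 0.4.0
kind: nvidia.com/gpudirect-rdma
devices:
- name: rdma0
  annotations:
    clique-id: "0"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/71"   # GPU (gpu0 above)
    - path: "/dev/vfio/66"   # NIC (nic0 above)
```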
The GPU driver stack is leveraging the PCI Express virtual P2P approval
capability, but the NIC stack does not use it yet. One of the action items is
to enable MOFED to read the P2P approval capability and enable ATS and ACS
settings as described above.
This way, we could enable GPUDirect P2P and GPUDirect RDMA on any topology
presented to the VM application. It is the responsibility of the administrator
or infrastructure engineer to provide the right information either via
annotations or a CDI specification.
## Host Topology Replication
The other way to represent the PCI Express topology in the VM is to replicate a
subset of the topology needed to support the P2P use case inside the VM. Similar
to the configuration for the root ports, we can easily configure the usage of
PCI Express switch ports to hotplug the devices.
```toml
# /etc/kata-containers/configuration.toml
# VFIO devices are hotplugged on a bridge by default.
# Enable hot plugging on the root bus. This may be required for devices with
# a large PCI bar, as this is a current limitation with hot plugging on
# a bridge.
# Default "bridge-port"
hotplug_vfio = "switch-port"
# Before hot plugging a PCIe device, you need to add a pcie_switch_port device.
# Use this parameter when using some large PCI bar devices, such as an NVIDIA GPU
# The value means the number of pcie_switch_port
# This value is valid when hotplug_vfio_on_root_bus is true and machine_type is "q35"
# Default 0
pcie_switch_port = 8
```
Each device that is passed through is attached to a PCI Express downstream port
as illustrated below. We can even replicate the host's two DPU topologies with
added metadata through CDI. Most of the time, a container only needs one
pair of GPU and NIC for GPUDirect RDMA; this is more of a showcase of what we
can do with the power of Kata and CDI. One could even think of adding groups of
devices that support P2P, even from different CPU sockets or NUMA nodes, into
one container; indeed, the first group is NUMA node 0 (red), and the second
group is NUMA node 1 (green). Since they are grouped correctly, P2P would be
enabled naturally inside a group, i.e., a clique.
```sh
$ lspci -tv
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+-01.0 Red Hat, Inc. Virtio console
+-02.0 Red Hat, Inc. Virtio SCSI
+-03.0 Red Hat, Inc. Virtio RNG
+-04.0-[01-04]----00.0-[02-04]--+-00.0-[03]----00.0 NVIDIA Corporation Device 20b8
| \-01.0-[04]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
+-05.0-[05-08]----00.0-[06-08]--+-00.0-[07]----00.0 Mellanox Tech MT42822 BlueField-2 integrated ConnectX-6 Dx
| \-01.0-[08]----00.0 NVIDIA Corporation Device 20b8
+-06.0 Red Hat, Inc. Virtio socket
+-07.0 Red Hat, Inc. Virtio file system
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
The configuration of using either the root port or the switch port can be applied
on a per-container or per-Pod basis, meaning we can switch PCI Express topologies
on each run of an application, as sketched below.
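As a sketch of what that per-Pod selection could look like: the annotation keys
below follow Kata's `io.katacontainers.config.<section>.<option>` pattern and
assume the corresponding hypervisor options are listed in `enable_annotations`
in the Kata configuration; the keys, runtime class, and image are illustrative.

```yaml
# Sketch only: request the switch-port topology for a single Pod via Kata
# annotations. Assumes hotplug_vfio and pcie_switch_port may be overridden
# through enable_annotations; keys, runtime class, and image are illustrative.
apiVersion: v1
kind: Pod
metadata:
  name: gpudirect-workload
  annotations:
    io.katacontainers.config.hypervisor.hotplug_vfio: "switch-port"
    io.katacontainers.config.hypervisor.pcie_switch_port: "8"
spec:
  runtimeClassName: kata-qemu       # illustrative runtime class
  containers:
  - name: cuda
    image: example.com/cuda-app:latest   # placeholder image
    resources:
      limits:
        nvidia.com/gpu: 1
```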
## Hypervisor Resource Limits
Every hypervisor will have resource limits in terms of how many PCI Express root
ports, switch ports, or bridge ports can be created, especially with devices
that need to reserve a 4K IO range per the PCI specification. Each root
or switch port instance consumes 4K of a very limited IO space (64K in total).
Simple math brings us to the conclusion that we can have a maximum of 16 PCI
Express root ports or 16 PCI Express switch ports in QEMU if devices with IO
BARs are used in the PCI Express hierarchy.
Additionally, one can have 32 slots on the PCI root bus and a maximum of 256
slots for the complete PCI(e) topology.
By default, QEMU will attach a multi-function device in the last slot of the
PCI root bus:
```sh
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller [AHCI mode]
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
Kata will additionally add `virtio-xxx-pci` devices consuming five slots, plus a
PCIe-PCI bridge (1 slot) and a DRAM controller (1 slot), meaning that by default
eight slots are already used. This leaves 24 slots for adding other devices to
the root bus.
The problem arises with a customer use case that runs recent
RTX GPUs with Kata. The user wanted to pass eight of these GPUs through into one
container and ran into issues: those cards often consist of
four individual devices: GPU, audio, and two USB controllers (some
cards have a USB-C output).
These devices are grouped into one IOMMU group. Since one needs to pass through
the complete IOMMU group into the VM, we need to allocate 32 PCI Express root
ports or 32 PCI Express switch ports, which is technically impossible due to the
resource limits outlined above. Since all the devices appear as PCI Express
devices, we need to hotplug those into a root or switch port.
The solution to this problem is leveraging CDI. For each device, add the
information on whether it is going to be hotplugged as a PCI Express or a PCI
device, which results in either using a PCI Express root/switch port or an
ordinary PCI bridge. PCI bridges are not affected by the limited IO range. This
way, the GPU is attached as a PCI Express device to a root/switch port and the
other three PCI devices to a PCI bridge, leaving enough resources to create the
needed PCI Express root/switch ports. For example, we're going to attach the
GPUs to a PCI Express root port and the NICs to a PCI bridge.
```yaml
# /etc/cdi/mellanox.yaml
cdiVersion: 0.4.0
kind: mellanox.com/nic
devices:
- name: nic0
  annotations:
    bdf: "3d:00.0"
    clique-id: "0"
    attach-pci: "true"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/66"
- name: nic1
  annotations:
    bdf: "3d:00.1"
    clique-id: "1"
    attach-pci: "true"
  containerEdits:
    deviceNodes:
    - path: "/dev/vfio/67"
```
The configuration is set to use eight root ports for the GPUs and to attach the
NICs to a PCI bridge, which in turn is connected to a PCIe-to-PCI bridge; this is
the preferred way of introducing a PCI topology in a PCI Express machine.
```sh
$ lspci -tv
-[0000:00]-+-00.0 Intel Corporation 82G33/G31/P35/P31 Express DRAM Controller
+-01.0 Red Hat, Inc. Virtio console
+-02.0 Red Hat, Inc. Virtio SCSI
+-03.0 Red Hat, Inc. Virtio RNG
+-04.0-[01]----00.0 NVIDIA Corporation Device 20b8
+-05.0-[02]----00.0 NVIDIA Corporation Device 20b8
+-06.0-[03]--
+-07.0-[04]--
+-08.0-[05]--
+-09.0-[06]--
+-0a.0-[07]--
+-0b.0-[08]--
+-0c.0-[09-0a]----00.0-[0a]--+-00.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
| \-01.0 Mellanox Tech MT42822 BlueField-2 ConnectX-6
+-0d.0 Red Hat, Inc. Virtio socket
+-0e.0 Red Hat, Inc. Virtio file system
+-1f.0 Intel Corporation 82801IB (ICH9) LPC Interface Controller
+-1f.2 Intel Corporation 82801IR/IO/IH (ICH9R/DO/DH) 6 port SATA Controller
\-1f.3 Intel Corporation 82801I (ICH9 Family) SMBus Controller
```
The PCI devices each consume one of the 256 slots available in the PCI(e)
topology, leaving the scarce IO resources for the PCI Express devices that need
them.


@@ -39,11 +39,9 @@ use std::path::Path;
const GUEST_CPUS_PATH: &str = "/sys/devices/system/cpu/online";
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "cgroups"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups"))
}
macro_rules! get_controller_or_return_singular_none {
@@ -82,7 +80,7 @@ impl CgroupManager for Manager {
fn set(&self, r: &LinuxResources, update: bool) -> Result<()> {
info!(
sl!(),
sl(),
"cgroup manager set resources for container. Resources input {:?}", r
);
@@ -120,7 +118,7 @@ impl CgroupManager for Manager {
// set devices resources
set_devices_resources(&self.cgroup, &r.devices, res);
info!(sl!(), "resources after processed {:?}", res);
info!(sl(), "resources after processed {:?}", res);
// apply resources
self.cgroup.apply(res)?;
@@ -197,7 +195,7 @@ impl CgroupManager for Manager {
if guest_cpuset.is_empty() {
return Ok(());
}
info!(sl!(), "update_cpuset_path to: {}", guest_cpuset);
info!(sl(), "update_cpuset_path to: {}", guest_cpuset);
let h = cgroups::hierarchies::auto();
let root_cg = h.root_control_group();
@@ -205,12 +203,12 @@ impl CgroupManager for Manager {
let root_cpuset_controller: &CpuSetController = root_cg.controller_of().unwrap();
let path = root_cpuset_controller.path();
let root_path = Path::new(path);
info!(sl!(), "root cpuset path: {:?}", &path);
info!(sl(), "root cpuset path: {:?}", &path);
let container_cpuset_controller: &CpuSetController = self.cgroup.controller_of().unwrap();
let path = container_cpuset_controller.path();
let container_path = Path::new(path);
info!(sl!(), "container cpuset path: {:?}", &path);
info!(sl(), "container cpuset path: {:?}", &path);
let mut paths = vec![];
for ancestor in container_path.ancestors() {
@@ -219,7 +217,7 @@ impl CgroupManager for Manager {
}
paths.push(ancestor);
}
info!(sl!(), "parent paths to update cpuset: {:?}", &paths);
info!(sl(), "parent paths to update cpuset: {:?}", &paths);
let mut i = paths.len();
loop {
@@ -233,7 +231,7 @@ impl CgroupManager for Manager {
.to_str()
.unwrap()
.trim_start_matches(root_path.to_str().unwrap());
info!(sl!(), "updating cpuset for parent path {:?}", &r_path);
info!(sl(), "updating cpuset for parent path {:?}", &r_path);
let cg = new_cgroup(cgroups::hierarchies::auto(), r_path)?;
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
cpuset_controller.set_cpus(guest_cpuset)?;
@@ -241,7 +239,7 @@ impl CgroupManager for Manager {
if !container_cpuset.is_empty() {
info!(
sl!(),
sl(),
"updating cpuset for container path: {:?} cpuset: {}",
&container_path,
container_cpuset
@@ -276,7 +274,7 @@ fn set_network_resources(
network: &LinuxNetwork,
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set network");
info!(sl(), "cgroup manager set network");
// set classid
// description can be found at https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/net_cls.html
@@ -303,7 +301,7 @@ fn set_devices_resources(
device_resources: &[LinuxDeviceCgroup],
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set devices");
info!(sl(), "cgroup manager set devices");
let mut devices = vec![];
for d in device_resources.iter() {
@@ -332,7 +330,7 @@ fn set_hugepages_resources(
hugepage_limits: &[LinuxHugepageLimit],
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set hugepage");
info!(sl(), "cgroup manager set hugepage");
let mut limits = vec![];
let hugetlb_controller = cg.controller_of::<HugeTlbController>();
@@ -346,7 +344,7 @@ fn set_hugepages_resources(
limits.push(hr);
} else {
warn!(
sl!(),
sl(),
"{} page size support cannot be verified, dropping requested limit", l.page_size
);
}
@@ -359,7 +357,7 @@ fn set_block_io_resources(
blkio: &LinuxBlockIo,
res: &mut cgroups::Resources,
) {
info!(sl!(), "cgroup manager set block io");
info!(sl(), "cgroup manager set block io");
res.blkio.weight = blkio.weight;
res.blkio.leaf_weight = blkio.leaf_weight;
@@ -387,13 +385,13 @@ fn set_block_io_resources(
}
fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
info!(sl!(), "cgroup manager set cpu");
info!(sl(), "cgroup manager set cpu");
let cpuset_controller: &CpuSetController = cg.controller_of().unwrap();
if !cpu.cpus.is_empty() {
if let Err(e) = cpuset_controller.set_cpus(&cpu.cpus) {
warn!(sl!(), "write cpuset failed: {:?}", e);
warn!(sl(), "write cpuset failed: {:?}", e);
}
}
@@ -424,7 +422,7 @@ fn set_cpu_resources(cg: &cgroups::Cgroup, cpu: &LinuxCpu) -> Result<()> {
}
fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool) -> Result<()> {
info!(sl!(), "cgroup manager set memory");
info!(sl(), "cgroup manager set memory");
let mem_controller: &MemController = cg.controller_of().unwrap();
if !update {
@@ -493,7 +491,7 @@ fn set_memory_resources(cg: &cgroups::Cgroup, memory: &LinuxMemory, update: bool
}
fn set_pids_resources(cg: &cgroups::Cgroup, pids: &LinuxPids) -> Result<()> {
info!(sl!(), "cgroup manager set pids");
info!(sl(), "cgroup manager set pids");
let pid_controller: &PidController = cg.controller_of().unwrap();
let v = if pids.limit > 0 {
MaxValue::Value(pids.limit)
@@ -965,7 +963,7 @@ pub fn get_paths() -> Result<HashMap<String, String>> {
for l in fs::read_to_string(PATHS)?.lines() {
let fl: Vec<&str> = l.split(':').collect();
if fl.len() != 3 {
info!(sl!(), "Corrupted cgroup data!");
info!(sl(), "Corrupted cgroup data!");
continue;
}
@@ -986,7 +984,7 @@ pub fn get_mounts(paths: &HashMap<String, String>) -> Result<HashMap<String, Str
let post: Vec<&str> = p[1].split(' ').collect();
if post.len() != 3 {
warn!(sl!(), "can't parse {} line {:?}", MOUNTS, l);
warn!(sl(), "can't parse {} line {:?}", MOUNTS, l);
continue;
}


@@ -16,11 +16,9 @@ use inotify::{Inotify, WatchMask};
use tokio::io::AsyncReadExt;
use tokio::sync::mpsc::{channel, Receiver};
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups_notifier"))
}
pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
@@ -38,7 +36,7 @@ pub async fn notify_oom(cid: &str, cg_dir: String) -> Result<Receiver<String>> {
fn get_value_from_cgroup(path: &Path, key: &str) -> Result<i64> {
let content = fs::read_to_string(path)?;
info!(
sl!(),
sl(),
"get_value_from_cgroup file: {:?}, content: {}", &path, &content
);
@@ -67,11 +65,11 @@ async fn register_memory_event_v2(
let event_control_path = Path::new(&cg_dir).join(memory_event_name);
let cgroup_event_control_path = Path::new(&cg_dir).join(cgroup_event_name);
info!(
sl!(),
sl(),
"register_memory_event_v2 event_control_path: {:?}", &event_control_path
);
info!(
sl!(),
sl(),
"register_memory_event_v2 cgroup_event_control_path: {:?}", &cgroup_event_control_path
);
@@ -82,8 +80,8 @@ async fn register_memory_event_v2(
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
let cg_wd = inotify.add_watch(&cgroup_event_control_path, WatchMask::MODIFY)?;
info!(sl!(), "ev_wd: {:?}", ev_wd);
info!(sl!(), "cg_wd: {:?}", cg_wd);
info!(sl(), "ev_wd: {:?}", ev_wd);
info!(sl(), "cg_wd: {:?}", cg_wd);
let (sender, receiver) = channel(100);
let containere_id = containere_id.to_string();
@@ -97,17 +95,17 @@ async fn register_memory_event_v2(
while let Some(event_or_error) = stream.next().await {
let event = event_or_error.unwrap();
info!(
sl!(),
sl(),
"container[{}] get event for container: {:?}", &containere_id, &event
);
// info!("is1: {}", event.wd == wd1);
info!(sl!(), "event.wd: {:?}", event.wd);
info!(sl(), "event.wd: {:?}", event.wd);
if event.wd == ev_wd {
let oom = get_value_from_cgroup(&event_control_path, "oom_kill");
if oom.unwrap_or(0) > 0 {
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
error!(sl!(), "send containere_id failed, error: {:?}", e);
error!(sl(), "send containere_id failed, error: {:?}", e);
});
return;
}
@@ -171,13 +169,13 @@ async fn register_memory_event(
let mut buf = [0u8; 8];
match eventfd_stream.read(&mut buf).await {
Err(err) => {
warn!(sl!(), "failed to read from eventfd: {:?}", err);
warn!(sl(), "failed to read from eventfd: {:?}", err);
return;
}
Ok(_) => {
let content = fs::read_to_string(path.clone());
info!(
sl!(),
sl(),
"cgroup event for container: {}, path: {:?}, content: {:?}",
&containere_id,
&path,
@@ -193,7 +191,7 @@ async fn register_memory_event(
}
let _ = sender.send(containere_id.clone()).await.map_err(|e| {
error!(sl!(), "send containere_id failed, error: {:?}", e);
error!(sl(), "send containere_id failed, error: {:?}", e);
});
}
});


@@ -1596,10 +1596,8 @@ mod tests {
use tempfile::tempdir;
use test_utils::skip_if_not_root;
macro_rules! sl {
() => {
slog_scope::logger()
};
fn sl() -> slog::Logger {
slog_scope::logger()
}
#[test]
@@ -1854,7 +1852,7 @@ mod tests {
let _ = new_linux_container_and_then(|mut c: LinuxContainer| {
c.processes.insert(
1,
Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap(),
Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap(),
);
let p = c.get_process("123");
assert!(p.is_ok(), "Expecting Ok, Got {:?}", p);
@@ -1881,7 +1879,7 @@ mod tests {
let (c, _dir) = new_linux_container();
let ret = c
.unwrap()
.start(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
.start(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
.await;
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}
@@ -1891,7 +1889,7 @@ mod tests {
let (c, _dir) = new_linux_container();
let ret = c
.unwrap()
.run(Process::new(&sl!(), &oci::Process::default(), "123", true, 1).unwrap())
.run(Process::new(&sl(), &oci::Process::default(), "123", true, 1).unwrap())
.await;
assert!(ret.is_err(), "Expecting Err, Got {:?}", ret);
}


@@ -161,7 +161,7 @@ impl Process {
pub fn notify_term_close(&mut self) {
let notify = self.term_exit_notifier.clone();
notify.notify_waiters();
notify.notify_one();
}
pub fn close_stdin(&mut self) {


@@ -26,11 +26,9 @@ use oci::{LinuxDeviceCgroup, LinuxResources, Spec};
use protocols::agent::Device;
use tracing::instrument;
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "device"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "device"))
}
const VM_ROOTFS: &str = "/";
@@ -78,7 +76,7 @@ where
{
let syspci = Path::new(&syspci);
let drv = drv.as_ref();
info!(sl!(), "rebind_pci_driver: {} => {:?}", dev, drv);
info!(sl(), "rebind_pci_driver: {} => {:?}", dev, drv);
let devpath = syspci.join("devices").join(dev.to_string());
let overridepath = &devpath.join("driver_override");
@@ -606,7 +604,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -
let host_minor = specdev.minor;
info!(
sl!(),
sl(),
"update_spec_devices() updating device";
"container_path" => &specdev.path,
"type" => &specdev.r#type,
@@ -659,7 +657,7 @@ fn update_spec_devices(spec: &mut Spec, mut updates: HashMap<&str, DevUpdate>) -
if let Some(update) = res_updates.get(&(host_type.as_str(), host_major, host_minor))
{
info!(
sl!(),
sl(),
"update_spec_devices() updating resource";
"type" => &host_type,
"host_major" => host_major,
@@ -923,7 +921,7 @@ pub async fn add_devices(
#[instrument]
async fn add_device(device: &Device, sandbox: &Arc<Mutex<Sandbox>>) -> Result<SpecUpdate> {
// log before validation to help with debugging gRPC protocol version differences.
info!(sl!(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
info!(sl(), "device-id: {}, device-type: {}, device-vm-path: {}, device-container-path: {}, device-options: {:?}",
device.id, device.type_, device.vm_path, device.container_path, device.options);
if device.type_.is_empty() {


@@ -38,11 +38,9 @@ const KATA_CC_IMAGE_WORK_DIR: &str = "/run/image/";
const KATA_CC_PAUSE_BUNDLE: &str = "/pause_bundle";
const CONFIG_JSON: &str = "config.json";
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger()
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "cgroups"))
}
pub struct ImageService {
@@ -57,18 +55,17 @@ impl ImageService {
env::set_var("CC_IMAGE_WORK_DIR", KATA_CC_IMAGE_WORK_DIR);
let mut image_client = ImageClient::default();
let image_policy_file = &AGENT_CONFIG.read().await.image_policy_file;
let image_policy_file = &AGENT_CONFIG.image_policy_file;
if !image_policy_file.is_empty() {
image_client.config.file_paths.sigstore_config = image_policy_file.clone();
}
let simple_signing_sigstore_config =
&AGENT_CONFIG.read().await.simple_signing_sigstore_config;
let simple_signing_sigstore_config = &AGENT_CONFIG.simple_signing_sigstore_config;
if !simple_signing_sigstore_config.is_empty() {
image_client.config.file_paths.sigstore_config = simple_signing_sigstore_config.clone();
}
let image_registry_auth_file = &AGENT_CONFIG.read().await.image_registry_auth_file;
let image_registry_auth_file = &AGENT_CONFIG.image_registry_auth_file;
if !image_registry_auth_file.is_empty() {
image_client.config.file_paths.auth_file = image_registry_auth_file.clone();
}
@@ -88,7 +85,7 @@ impl ImageService {
return Err(anyhow!("Pause image not present in rootfs"));
}
info!(sl!(), "use guest pause image cid {:?}", cid);
info!(sl(), "use guest pause image cid {:?}", cid);
let pause_bundle = Path::new(CONTAINER_BASE).join(cid);
let pause_rootfs = pause_bundle.join("rootfs");
let pause_config = pause_bundle.join(CONFIG_JSON);
@@ -159,12 +156,12 @@ impl ImageService {
async fn pull_image(&self, req: &image::PullImageRequest) -> Result<String> {
env::set_var("OCICRYPT_KEYPROVIDER_CONFIG", OCICRYPT_CONFIG_PATH);
let https_proxy = &AGENT_CONFIG.read().await.https_proxy;
let https_proxy = &AGENT_CONFIG.https_proxy;
if !https_proxy.is_empty() {
env::set_var("HTTPS_PROXY", https_proxy);
}
let no_proxy = &AGENT_CONFIG.read().await.no_proxy;
let no_proxy = &AGENT_CONFIG.no_proxy;
if !no_proxy.is_empty() {
env::set_var("NO_PROXY", no_proxy);
}
@@ -179,7 +176,7 @@ impl ImageService {
return Ok(image.to_owned());
}
let aa_kbc_params = &AGENT_CONFIG.read().await.aa_kbc_params;
let aa_kbc_params = &AGENT_CONFIG.aa_kbc_params;
if !aa_kbc_params.is_empty() {
match self.attestation_agent_started.compare_exchange_weak(
false,
@@ -188,22 +185,21 @@ impl ImageService {
Ordering::SeqCst,
) {
Ok(_) => Self::init_attestation_agent()?,
Err(_) => info!(sl!(), "Attestation Agent already running"),
Err(_) => info!(sl(), "Attestation Agent already running"),
}
}
// If the attestation-agent is being used, then enable the authenticated credentials support
info!(
sl!(),
sl(),
"image_client.config.auth set to: {}",
!aa_kbc_params.is_empty()
);
self.image_client.lock().await.config.auth = !aa_kbc_params.is_empty();
// Read enable signature verification from the agent config and set it in the image_client
let enable_signature_verification =
&AGENT_CONFIG.read().await.enable_signature_verification;
let enable_signature_verification = &AGENT_CONFIG.enable_signature_verification;
info!(
sl!(),
sl(),
"enable_signature_verification set to: {}", enable_signature_verification
);
self.image_client.lock().await.config.security_validate = *enable_signature_verification;
@@ -215,7 +211,7 @@ impl ImageService {
let decrypt_config = format!("provider:attestation-agent:{}", aa_kbc_params);
info!(sl!(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
info!(sl(), "pull image {:?}, bundle path {:?}", cid, bundle_path);
// Image layers will store at KATA_CC_IMAGE_WORK_DIR, generated bundles
// with rootfs and config.json will store under CONTAINER_BASE/cid.
let res = self
@@ -228,13 +224,13 @@ impl ImageService {
match res {
Ok(image) => {
info!(
sl!(),
sl(),
"pull and unpack image {:?}, cid: {:?}, with image-rs succeed. ", image, cid
);
}
Err(e) => {
error!(
sl!(),
sl(),
"pull and unpack image {:?}, cid: {:?}, with image-rs failed with {:?}. ",
image,
cid,


@@ -65,7 +65,7 @@ use tokio::{
io::AsyncWrite,
sync::{
watch::{channel, Receiver},
Mutex, RwLock,
Mutex,
},
task::JoinHandle,
};
@@ -84,12 +84,11 @@ cfg_if! {
const NAME: &str = "kata-agent";
lazy_static! {
static ref AGENT_CONFIG: Arc<RwLock<AgentConfig>> = Arc::new(RwLock::new(
static ref AGENT_CONFIG: AgentConfig =
// Note: We can't do AgentOpts.parse() here to send through the processed arguments to AgentConfig
// clap::Parser::parse() greedily process all command line input including cargo test parameters,
// so should only be used inside main.
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap()
));
AgentConfig::from_cmdline("/proc/cmdline", env::args().collect()).unwrap();
}
#[derive(Parser)]
@@ -182,13 +181,13 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
lazy_static::initialize(&AGENT_CONFIG);
init_agent_as_init(&logger, AGENT_CONFIG.read().await.unified_cgroup_hierarchy)?;
init_agent_as_init(&logger, AGENT_CONFIG.unified_cgroup_hierarchy)?;
drop(logger_async_guard);
} else {
lazy_static::initialize(&AGENT_CONFIG);
}
let config = AGENT_CONFIG.read().await;
let config = &AGENT_CONFIG;
let log_vport = config.log_vport as u32;
let log_handle = tokio::spawn(create_logger_task(rfd, log_vport, shutdown_rx.clone()));
@@ -201,7 +200,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let (logger, logger_async_guard) =
logging::create_logger(NAME, "agent", config.log_level, writer);
announce(&logger, &config);
announce(&logger, config);
// This variable is required as it enables the global (and crucially static) logger,
// which is required to satisfy the the lifetime constraints of the auto-generated gRPC code.
@@ -229,7 +228,7 @@ async fn real_main() -> std::result::Result<(), Box<dyn std::error::Error>> {
let span_guard = root_span.enter();
// Start the sandbox and wait for its ttRPC server to end
start_sandbox(&logger, &config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
start_sandbox(&logger, config, init_mode, &mut tasks, shutdown_rx.clone()).await?;
// Install a NOP logger for the remainder of the shutdown sequence
// to ensure any log calls made by local crates using the scope logger


@@ -15,11 +15,9 @@ use tracing::instrument;
const NAMESPACE_KATA_AGENT: &str = "kata_agent";
const NAMESPACE_KATA_GUEST: &str = "kata_guest";
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "metrics"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "metrics"))
}
lazy_static! {
@@ -139,7 +137,7 @@ fn update_agent_metrics() -> Result<()> {
Ok(p) => p,
Err(e) => {
// FIXME: return Ok for all errors?
warn!(sl!(), "failed to create process instance: {:?}", e);
warn!(sl(), "failed to create process instance: {:?}", e);
return Ok(());
}
@@ -160,7 +158,7 @@ fn update_agent_metrics() -> Result<()> {
// io
match me.io() {
Err(err) => {
info!(sl!(), "failed to get process io stat: {:?}", err);
info!(sl(), "failed to get process io stat: {:?}", err);
}
Ok(io) => {
set_gauge_vec_proc_io(&AGENT_IO_STAT, &io);
@@ -169,7 +167,7 @@ fn update_agent_metrics() -> Result<()> {
match me.stat() {
Err(err) => {
info!(sl!(), "failed to get process stat: {:?}", err);
info!(sl(), "failed to get process stat: {:?}", err);
}
Ok(stat) => {
set_gauge_vec_proc_stat(&AGENT_PROC_STAT, &stat);
@@ -177,7 +175,7 @@ fn update_agent_metrics() -> Result<()> {
}
match me.status() {
Err(err) => error!(sl!(), "failed to get process status: {:?}", err),
Err(err) => error!(sl(), "failed to get process status: {:?}", err),
Ok(status) => set_gauge_vec_proc_status(&AGENT_PROC_STATUS, &status),
}
@@ -189,7 +187,7 @@ fn update_guest_metrics() {
// try get load and task info
match procfs::LoadAverage::new() {
Err(err) => {
info!(sl!(), "failed to get guest LoadAverage: {:?}", err);
info!(sl(), "failed to get guest LoadAverage: {:?}", err);
}
Ok(load) => {
GUEST_LOAD
@@ -209,7 +207,7 @@ fn update_guest_metrics() {
// try to get disk stats
match procfs::diskstats() {
Err(err) => {
info!(sl!(), "failed to get guest diskstats: {:?}", err);
info!(sl(), "failed to get guest diskstats: {:?}", err);
}
Ok(diskstats) => {
for diskstat in diskstats {
@@ -221,7 +219,7 @@ fn update_guest_metrics() {
// try to get vm stats
match procfs::vmstat() {
Err(err) => {
info!(sl!(), "failed to get guest vmstat: {:?}", err);
info!(sl(), "failed to get guest vmstat: {:?}", err);
}
Ok(vmstat) => {
for (k, v) in vmstat {
@@ -233,7 +231,7 @@ fn update_guest_metrics() {
// cpu stat
match procfs::KernelStats::new() {
Err(err) => {
info!(sl!(), "failed to get guest KernelStats: {:?}", err);
info!(sl(), "failed to get guest KernelStats: {:?}", err);
}
Ok(kernel_stats) => {
set_gauge_vec_cpu_time(&GUEST_CPU_TIME, "total", &kernel_stats.total);
@@ -246,7 +244,7 @@ fn update_guest_metrics() {
// try to get net device stats
match procfs::net::dev_status() {
Err(err) => {
info!(sl!(), "failed to get guest net::dev_status: {:?}", err);
info!(sl(), "failed to get guest net::dev_status: {:?}", err);
}
Ok(devs) => {
// netdev: map[string]procfs::net::DeviceStatus
@@ -259,7 +257,7 @@ fn update_guest_metrics() {
// get statistics about memory from /proc/meminfo
match procfs::Meminfo::new() {
Err(err) => {
info!(sl!(), "failed to get guest Meminfo: {:?}", err);
info!(sl(), "failed to get guest Meminfo: {:?}", err);
}
Ok(meminfo) => {
set_gauge_vec_meminfo(&GUEST_MEMINFO, &meminfo);

File diff suppressed because it is too large

View File

@@ -69,7 +69,7 @@ macro_rules! trace_rpc_call {
propagator.extract(&extract_carrier_from_ttrpc($ctx))
});
info!(sl!(), "rpc call from shim to agent: {:?}", $name);
info!(sl(), "rpc call from shim to agent: {:?}", $name);
// generate tracing span
let rpc_span = span!(tracing::Level::INFO, $name, "mod"="rpc.rs", req=?$req);

View File

@@ -19,11 +19,9 @@ use tokio::sync::watch::Receiver;
use tokio::sync::Mutex;
use tracing::instrument;
// Convenience macro to obtain the scope logger
macro_rules! sl {
() => {
slog_scope::logger().new(o!("subsystem" => "uevent"))
};
// Convenience function to obtain the scope logger.
fn sl() -> slog::Logger {
slog_scope::logger().new(o!("subsystem" => "uevent"))
}
#[derive(Debug, Default, Clone, PartialEq, Eq)]
@@ -120,11 +118,11 @@ pub async fn wait_for_uevent(
) -> Result<Uevent> {
let logprefix = format!("Waiting for {:?}", &matcher);
info!(sl!(), "{}", logprefix);
info!(sl(), "{}", logprefix);
let mut sb = sandbox.lock().await;
for uev in sb.uevent_map.values() {
if matcher.is_match(uev) {
info!(sl!(), "{}: found {:?} in uevent map", logprefix, &uev);
info!(sl(), "{}: found {:?} in uevent map", logprefix, &uev);
return Ok(uev.clone());
}
}
@@ -139,9 +137,9 @@ pub async fn wait_for_uevent(
sb.uevent_watchers.push(Some((Box::new(matcher), tx)));
drop(sb); // unlock
info!(sl!(), "{}: waiting on channel", logprefix);
info!(sl(), "{}: waiting on channel", logprefix);
let hotplug_timeout = AGENT_CONFIG.read().await.hotplug_timeout;
let hotplug_timeout = AGENT_CONFIG.hotplug_timeout;
let uev = match tokio::time::timeout(hotplug_timeout, rx).await {
Ok(v) => v?,
@@ -157,7 +155,7 @@ pub async fn wait_for_uevent(
}
};
info!(sl!(), "{}: found {:?} on channel", logprefix, &uev);
info!(sl(), "{}: found {:?} on channel", logprefix, &uev);
Ok(uev)
}

View File

@@ -341,7 +341,10 @@ impl DragonballInner {
// cannot exceed maximum value
if new_vcpus > self.config.cpu_info.default_maxvcpus {
return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus"));
warn!(
sl!(),
"Cannot allocate more vcpus than the max allowed number of vcpus. The maximum allowed amount of vcpus will be used instead.");
return Ok((current_vcpus, self.config.cpu_info.default_maxvcpus));
}
Ok((current_vcpus, new_vcpus))

View File

@@ -105,7 +105,12 @@ impl InitialSizeManager {
hv.cpu_info.default_vcpus = self.resource.vcpu as i32
}
if self.resource.mem_mb > 0 {
hv.memory_info.default_memory = self.resource.mem_mb;
// Since the memory overhead introduced by the kata-agent and system components
// eats into the amount of memory the user can actually use, we choose to
// add to default_memory here instead of overriding it.
// (If we overrode default_memory here and user applications still used as much
// memory as they originally expected, they could easily hit OOM.)
hv.memory_info.default_memory += self.resource.mem_mb;
}
Ok(())
}

View File

@@ -17,7 +17,7 @@ import (
"github.com/prometheus/procfs"
"github.com/urfave/cli"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
@@ -113,8 +113,8 @@ type HypervisorInfo struct {
SocketPath string
Msize9p uint32
MemorySlots uint32
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
Debug bool
}
@@ -317,9 +317,9 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) {
EntropySource: config.HypervisorConfig.EntropySource,
SharedFS: config.HypervisorConfig.SharedFS,
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
SocketPath: socketPath,
}, nil
}

View File

@@ -19,12 +19,12 @@ import (
"testing"
"github.com/BurntSushi/toml"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcUtils "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/urfave/cli"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
@@ -74,8 +74,9 @@ func createConfig(configPath string, fileData string) error {
return nil
}
func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeConfig, err error) {
var coldPlugVFIO hv.PCIePort
func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.RuntimeConfig, err error) {
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
const logPath = "/log/path"
hypervisorPath := filepath.Join(prefixDir, "hypervisor")
kernelPath := filepath.Join(prefixDir, "kernel")
@@ -87,8 +88,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
blockStorageDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.NoPort
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.NoPort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := filepath.Join(prefixDir, "virtiofsd")
@@ -132,8 +133,8 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
BlockDeviceDriver: blockStorageDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: hypConfig.NumVCPUs,
DefaultMaxVCPUCount: hypConfig.DefaultMaxVCPUs,
@@ -156,12 +157,12 @@ func makeRuntimeConfig(prefixDir string) (configFile string, config oci.RuntimeC
return "", oci.RuntimeConfig{}, err
}
_, config, err = katautils.LoadConfiguration(configFile, true)
_, ociConfig, err = katautils.LoadConfiguration(configFile, true)
if err != nil {
return "", oci.RuntimeConfig{}, err
}
return configFile, config, nil
return configFile, ociConfig, nil
}
func getExpectedAgentDetails(config oci.RuntimeConfig) (AgentInfo, error) {
@@ -277,7 +278,7 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo {
VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon,
HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: config.HypervisorConfig.PCIeRootPort,
HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO,
ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO,
}

View File

@@ -131,6 +131,11 @@ default_maxmemory = @DEFMAXMEMSZ@
# Shared file system type:
# - virtio-fs (default)
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_CLH_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -178,6 +178,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -186,6 +186,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_SEV_VIRTIOFS@"
# Path to vhost-user-fs daemon.
@@ -669,4 +674,4 @@ service_offload = @DEFSERVICEOFFLOAD@
#
# Keys can be remotely provisioned. The Kata agent fetches them from e.g.
# a HTTPS URL:
#provision=https://my-key-broker.foo/tenant/<tenant-id>
#provision=https://my-key-broker.foo/tenant/<tenant-id>

View File

@@ -184,6 +184,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_SNP_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -172,6 +172,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_TDX_VIRTIOFS@"
# Path to vhost-user-fs daemon.

View File

@@ -206,6 +206,11 @@ disable_block_device_use = @DEFDISABLEBLOCK@
# - virtio-fs (default)
# - virtio-9p
# - virtio-fs-nydus
# - none
# WARNING: "none" should be carefully used, and only used in very few specific cases, as
# any update to the mount will *NOT* be reflected during the lifecycle of the pod, causing
# issues with rotation of secrets, certs, or configurations via kubernetes objects like
# configMaps or secrets, as those will be copied into the guest at *pod* *creation* *time*.
shared_fs = "@DEFSHAREDFS_QEMU_VIRTIOFS@"
# Path to vhost-user-fs daemon.
@@ -380,8 +385,15 @@ pflashes = []
# Default false
#hotplug_vfio_on_root_bus = true
# Enable hot-plugging of VFIO devices to a bridge-port,
# root-port or switch-port.
# The default setting is "no-port"
#hot_plug_vfio = "root-port"
# In a confidential compute environment hot-plugging can compromise
# security. Enable cold-plugging of VFIO devices to a root-port.
# security.
# Enable cold-plugging of VFIO devices to a bridge-port,
# root-port or switch-port.
# The default setting is "no-port", which means disabled.
#cold_plug_vfio = "root-port"

View File

@@ -20,7 +20,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
vcAnnotations "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
@@ -308,8 +308,9 @@ func TestCreateContainerConfigFail(t *testing.T) {
assert.Error(err)
}
func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err error) {
var coldPlugVFIO hv.PCIePort
func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, err error) {
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
if dir == "" {
return "", fmt.Errorf("BUG: need directory")
}
@@ -330,11 +331,11 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
blockDeviceDriver := "virtio-scsi"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
coldPlugVFIO = hv.RootPort
hotPlugVFIO = config.BridgePort
coldPlugVFIO = config.RootPort
configFileOptions := ktu.RuntimeConfigOptions{
Hypervisor: "qemu",
@@ -349,10 +350,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config string, err err
BlockDeviceDriver: blockDeviceDriver,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
DisableNewNetNs: disableNewNetNs,
SharedFS: sharedFS,
VirtioFSDaemon: virtioFSdaemon,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
}

View File

@@ -81,6 +81,17 @@ const (
// VirtioFSNydus means use nydus for the shared file system
VirtioFSNydus = "virtio-fs-nydus"
// NoSharedFS means *no* shared file system solution will be used
// and files will be copied into the guest system.
//
// WARNING: This should be carefully used, and only used in very few
// specific cases, as any update to the mount will *NOT* be reflected
// during the lifecycle of the pod, causing issues with rotation of
// secrets, certs, or configurations via kubernetes objects like
// configMaps or secrets, as those will be copied into the guest at
// *pod* *creation* *time*.
NoSharedFS = "none"
)
const (
@@ -114,14 +125,117 @@ const (
// SysDevPrefix is static string of /sys/dev
var SysDevPrefix = "/sys/dev"
// SysIOMMUPath is static string of /sys/kernel/iommu_groups
var SysIOMMUPath = "/sys/kernel/iommu_groups"
// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups
var SysIOMMUGroupPath = "/sys/kernel/iommu_groups"
// SysBusPciDevicesPath is static string of /sys/bus/pci/devices
var SysBusPciDevicesPath = "/sys/bus/pci/devices"
var getSysDevPath = getSysDevPathImpl
// PCIePortBusPrefix gives us the correct bus naming depending on the port
// used to hot(cold)-plug the device
type PCIePortBusPrefix string
const (
PCIeRootPortPrefix PCIePortBusPrefix = "rp"
PCIeSwitchPortPrefix PCIePortBusPrefix = "sw"
PCIeSwitchUpstreamPortPrefix PCIePortBusPrefix = "swup"
PCIeSwitchhDownstreamPortPrefix PCIePortBusPrefix = "swdp"
PCIBridgePortPrefix PCIePortBusPrefix = "bp"
)
func (p PCIePortBusPrefix) String() string {
switch p {
case PCIeRootPortPrefix:
fallthrough
case PCIeSwitchPortPrefix:
fallthrough
case PCIeSwitchUpstreamPortPrefix:
fallthrough
case PCIeSwitchhDownstreamPortPrefix:
fallthrough
case PCIBridgePortPrefix:
return string(p)
}
return fmt.Sprintf("<unknown PCIePortBusPrefix: %s>", string(p))
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attaches VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attaches VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
// InvalidPort is for invalid port
InvalidPort = "invalid-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
fallthrough
case InvalidPort:
return string(p)
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
var PCIePortPrefixMapping = map[PCIePort]PCIePortBusPrefix{
RootPort: PCIeRootPortPrefix,
SwitchPort: PCIeSwitchhDownstreamPortPrefix,
BridgePort: PCIBridgePortPrefix,
}
func (p PCIePort) Invalid() bool {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
return false
}
return true
}
func (p PCIePort) Valid() bool {
switch p {
case RootPort:
fallthrough
case SwitchPort:
fallthrough
case BridgePort:
fallthrough
case NoPort:
return true
}
return false
}
type PCIePortMapping map[string]bool
var (
// Each of these structures keeps track of the devices attached to the
// different types of PCI ports. We can deduce the Bus number from it
// and eliminate duplicate assignments.
PCIeDevices = map[PCIePort]PCIePortMapping{}
)
// DeviceInfo is an embedded type that contains device data common to all types of devices.
type DeviceInfo struct {
// DriverOptions is specific options for each device driver
@@ -167,6 +281,9 @@ type DeviceInfo struct {
// ColdPlug specifies whether the device must be cold plugged (true)
// or hot plugged (false).
ColdPlug bool
// Specifies the PCIe port type to which the device is attached
Port PCIePort
}
// BlockDrive represents a block storage drive which may be used in case the storage
@@ -268,14 +385,8 @@ const (
VFIOAPDeviceMediatedType
)
type VFIODev interface {
GetID() *string
GetType() VFIODeviceType
GetSysfsDev() *string
}
// VFIOPCIDev represents a VFIO PCI device used for hotplugging
type VFIOPCIDev struct {
// VFIODev represents a VFIO PCI device used for hotplugging
type VFIODev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
@@ -305,44 +416,15 @@ type VFIOPCIDev struct {
// IsPCIe specifies device is PCIe or PCI
IsPCIe bool
}
func (d VFIOPCIDev) GetID() *string {
return &d.ID
}
func (d VFIOPCIDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOPCIDev) GetSysfsDev() *string {
return &d.SysfsDev
}
type VFIOAPDev struct {
// ID is used to identify this drive in the hypervisor options.
ID string
// sysfsdev of VFIO mediated device
SysfsDev string
// APDevices are the Adjunct Processor devices assigned to the mdev
APDevices []string
// Type of VFIO device
Type VFIODeviceType
}
// Rank identifies a device in an IOMMU group
Rank int
func (d VFIOAPDev) GetID() *string {
return &d.ID
}
func (d VFIOAPDev) GetType() VFIODeviceType {
return d.Type
}
func (d VFIOAPDev) GetSysfsDev() *string {
return &d.SysfsDev
// Port is the PCIe port type to which the device is attached
Port PCIePort
}
// RNGDev represents a random number generator device
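
To illustrate how the new PCIePort helpers compose, here is a minimal, stand-alone Go sketch (the main function and its output are illustrative only; it uses just the exported names introduced in this hunk):

package main

import (
	"fmt"

	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)

func main() {
	port := config.RootPort
	// RootPort, SwitchPort, BridgePort and NoPort are considered valid.
	fmt.Println(port.Valid(), port.Invalid()) // true false
	// PCIePortPrefixMapping maps the port type to its bus-name prefix,
	// e.g. "rp" for root-port, "swdp" for switch-port, "bp" for bridge-port.
	prefix := config.PCIePortPrefixMapping[port]
	// The first device attached to this port type would land on bus "rp0".
	fmt.Printf("bus=%s%d\n", prefix, 0)
}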

View File

@@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry {
return api.DeviceLogger()
}
// Identify PCIe device by reading the size of the PCI config space
// IsPCIeDevice identifies a PCIe device by reading the size of the PCI config space.
// Plain PCI devices have 256 bytes of config space, whereas PCIe devices have 4K.
func isPCIeDevice(bdf string) bool {
func IsPCIeDevice(bdf string) bool {
if len(strings.Split(bdf, ":")) == 2 {
bdf = PCIDomain + ":" + bdf
}
@@ -157,14 +157,12 @@ func checkIgnorePCIClass(pciClass string, deviceBDF string, bitmask uint64) (boo
// GetAllVFIODevicesFromIOMMUGroup returns all the VFIO devices in the IOMMU group
// We can reuse this function at various levels, sandbox, container.
// Only the VFIO module is allowed to do bus assignments, all other modules need to
// ignore it if used as helper function to get VFIO information.
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignment bool) ([]*config.VFIODev, error) {
func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODev, error) {
vfioDevs := []*config.VFIODev{}
vfioGroup := filepath.Base(device.HostPath)
iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices")
iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices")
deviceFiles, err := os.ReadDir(iommuDevicesPath)
if err != nil {
@@ -174,7 +172,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
// Pass all devices in iommu group
for i, deviceFile := range deviceFiles {
//Get bdf of device eg 0000:00:1c.0
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(deviceFile.Name(), iommuDevicesPath)
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(deviceFile.Name(), iommuDevicesPath)
if err != nil {
return nil, err
}
@@ -196,27 +194,24 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
isPCIe := isPCIeDevice(deviceBDF)
// Do not directly assign to `vfio` -- need to access field still
vfioPCI := config.VFIOPCIDev{
vfio = config.VFIODev{
ID: id,
Type: vfioDeviceType,
BDF: deviceBDF,
SysfsDev: deviceSysfsDev,
IsPCIe: isPCIe,
IsPCIe: IsPCIeDevice(deviceBDF),
Class: pciClass,
Rank: -1,
Port: device.Port,
}
if isPCIe && !ignoreBusAssignment {
vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs))
AllPCIeDevs[deviceBDF] = true
}
vfio = vfioPCI
case config.VFIOAPDeviceMediatedType:
devices, err := GetAPVFIODevices(deviceSysfsDev)
if err != nil {
return nil, err
}
vfio = config.VFIOAPDev{
vfio = config.VFIODev{
ID: id,
SysfsDev: deviceSysfsDev,
Type: config.VFIOAPDeviceMediatedType,
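
The PCIe-versus-PCI distinction that IsPCIeDevice relies on comes down to the size of the device's config file in sysfs; a self-contained sketch of that check (the helper name and constant below are illustrative, not the exact implementation):

package pcidetect

import (
	"os"
	"path/filepath"
)

// isPCIeByConfigSpace reports whether the device at the given BDF exposes the
// 4 KiB extended (PCIe) configuration space rather than the plain 256-byte one.
func isPCIeByConfigSpace(bdf string) bool {
	const pciConfigSpaceSize = 256 // size of a plain PCI config space in bytes
	info, err := os.Stat(filepath.Join("/sys/bus/pci/devices", bdf, "config"))
	if err != nil {
		return false
	}
	return info.Size() > pciConfigSpaceSize
}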

View File

@@ -28,14 +28,9 @@ const (
vfioRemoveIDPath = "/sys/bus/pci/drivers/vfio-pci/remove_id"
iommuGroupPath = "/sys/bus/pci/devices/%s/iommu_group"
vfioDevPath = "/dev/vfio/%s"
pcieRootPortPrefix = "rp"
vfioAPSysfsDir = "/sys/devices/vfio_ap"
)
var (
AllPCIeDevs = map[string]bool{}
)
// VFIODevice is a vfio device meant to be passed to the hypervisor
// to be used by the Virtual Machine.
type VFIODevice struct {
@@ -70,10 +65,17 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece
}
}()
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo, false)
device.VfioDevs, err = GetAllVFIODevicesFromIOMMUGroup(*device.DeviceInfo)
if err != nil {
return err
}
for _, vfio := range device.VfioDevs {
if vfio.IsPCIe {
busIndex := len(config.PCIeDevices[vfio.Port])
vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], busIndex)
config.PCIeDevices[vfio.Port][vfio.BDF] = true
}
}
coldPlug := device.DeviceInfo.ColdPlug
deviceLogger().WithField("cold-plug", coldPlug).Info("Attaching VFIO device")
@@ -169,23 +171,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
for _, dev := range ds.VFIODevs {
var vfio config.VFIODev
vfioDeviceType := (*device.VfioDevs[0]).GetType()
switch vfioDeviceType {
switch dev.Type {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
bdf := ""
if pciDev, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDev.BDF
}
vfio = config.VFIOPCIDev{
ID: *(*dev).GetID(),
Type: config.VFIODeviceType((*dev).GetType()),
BDF: bdf,
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
Type: config.VFIODeviceType(dev.Type),
BDF: dev.BDF,
SysfsDev: dev.SysfsDev,
}
case config.VFIOAPDeviceMediatedType:
vfio = config.VFIOAPDev{
ID: *(*dev).GetID(),
SysfsDev: *(*dev).GetSysfsDev(),
vfio = config.VFIODev{
ID: dev.ID,
SysfsDev: dev.SysfsDev,
}
default:
deviceLogger().WithError(
@@ -200,7 +197,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) {
// It should implement GetAttachCount() and DeviceID() as api.Device implementation
// here it shares function from *GenericDevice so we don't need duplicate codes
func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) {
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr)
if err != nil {
@@ -210,14 +207,18 @@ func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceS
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:
// Get bdf of device eg. 0000:00:1c.0
deviceBDF = getBDF(deviceFileName)
// OLD IMPL: deviceBDF = getBDF(deviceFileName)
// The old implementation did not consider the case where
// VFIO devices are located on different root busses. The
// kata-agent handles that case now, so use the full PCI address here.
deviceBDF = deviceFileName
// Get sysfs path used by cloud-hypervisor
deviceSysfsDev = filepath.Join(config.SysBusPciDevicesPath, deviceFileName)
case config.VFIOPCIDeviceMediatedType:
// Get sysfsdev of device eg. /sys/devices/pci0000:00/0000:00:02.0/f79944e4-5a3d-11e8-99ce-479cbab002e4
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
deviceBDF = getBDF(getMediatedBDF(deviceSysfsDev))
deviceBDF = GetBDF(getMediatedBDF(deviceSysfsDev))
case config.VFIOAPDeviceMediatedType:
sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName)
deviceSysfsDev, err = GetSysfsDev(sysfsDevStr)
@@ -240,7 +241,7 @@ func getMediatedBDF(deviceSysfsDev string) string {
// getBDF returns the BDF of pci device
// Expected input string format is [<domain>]:[<bus>][<slot>].[<func>] eg. 0000:02:10.0
func getBDF(deviceSysStr string) string {
func GetBDF(deviceSysStr string) string {
tokens := strings.SplitN(deviceSysStr, ":", 2)
if len(tokens) == 1 {
return ""
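
For context, the bus-assignment bookkeeping that the new Attach code performs can be summarised as follows (a sketch only: assignBus is a made-up wrapper, and config.PCIeDevices is assumed to have been initialised by the device manager as shown in a later hunk):

package vfiosketch

import (
	"fmt"

	"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)

// assignBus mirrors the hunk above: the bus index is the number of devices
// already tracked for the device's port type, and the BDF is recorded so the
// same device is not assigned twice.
func assignBus(dev *config.VFIODev) {
	if !dev.IsPCIe {
		return
	}
	busIndex := len(config.PCIeDevices[dev.Port])
	dev.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[dev.Port], busIndex)
	config.PCIeDevices[dev.Port][dev.BDF] = true
}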

View File

@@ -20,7 +20,7 @@ func TestGetVFIODetails(t *testing.T) {
}
data := []testData{
{"0000:02:10.0", "02:10.0"},
{"0000:02:10.0", "0000:02:10.0"},
{"0000:0210.0", ""},
{"f79944e4-5a3d-11e8-99ce-", ""},
{"f79944e4-5a3d-11e8-99ce", ""},
@@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) {
}
for _, d := range data {
deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "")
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "")
switch vfioDeviceType {
case config.VFIOPCIDeviceNormalType:

View File

@@ -71,7 +71,11 @@ func NewDeviceManager(blockDriver string, vhostUserStoreEnabled bool, vhostUserS
dm.blockDriver = config.VirtioSCSI
}
drivers.AllPCIeDevs = make(map[string]bool)
config.PCIeDevices = make(map[config.PCIePort]config.PCIePortMapping)
config.PCIeDevices[config.RootPort] = make(map[string]bool)
config.PCIeDevices[config.SwitchPort] = make(map[string]bool)
config.PCIeDevices[config.BridgePort] = make(map[string]bool)
for _, dev := range devices {
dm.devices[dev.DeviceID()] = dev
@@ -118,7 +122,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device
}
if IsVFIO(devInfo.HostPath) {
return drivers.NewVFIODevice(&devInfo), nil
} else if isVhostUserBlk(devInfo) {
} else if IsVhostUserBlk(devInfo) {
if devInfo.DriverOptions == nil {
devInfo.DriverOptions = make(map[string]string)
}

View File

@@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) {
_, err = os.Create(deviceConfigFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
savedSysBusPciDevicesPath := config.SysBusPciDevicesPath
config.SysBusPciDevicesPath = devicesDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
config.SysBusPciDevicesPath = savedSysBusPciDevicesPath
}()

View File

@@ -37,7 +37,7 @@ func isBlock(devInfo config.DeviceInfo) bool {
}
// isVhostUserBlk checks if the device is a VhostUserBlk device.
func isVhostUserBlk(devInfo config.DeviceInfo) bool {
func IsVhostUserBlk(devInfo config.DeviceInfo) bool {
return devInfo.DevType == "b" && devInfo.Major == config.VhostUserBlkMajor
}

View File

@@ -70,7 +70,7 @@ func TestIsVhostUserBlk(t *testing.T) {
}
for _, d := range data {
isVhostUserBlk := isVhostUserBlk(
isVhostUserBlk := IsVhostUserBlk(
config.DeviceInfo{
DevType: d.devType,
Major: d.major,

View File

@@ -123,6 +123,14 @@ const (
// PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port.
PCIeRootPort DeviceDriver = "pcie-root-port"
// PCIeSwitchUpstreamPort is a PCIe switch upstream port
// An upstream port connects to a PCIe Root Port
PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream"
// PCIeSwitchDownstreamPort is a PCIe switch downstream port
// PCIe devices can be hot-plugged to the downstream port.
PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream"
// Loader is the Loader device driver.
Loader DeviceDriver = "loader"
@@ -236,6 +244,7 @@ const (
// SecExecGuest represents an s390x Secure Execution (Protected Virtualization in QEMU) object
SecExecGuest ObjectType = "s390-pv-guest"
// PEFGuest represent ppc64le PEF(Protected Execution Facility) object.
PEFGuest ObjectType = "pef-guest"
)
@@ -410,7 +419,6 @@ func (object Object) QemuParams(config *Config) []string {
deviceParams = append(deviceParams, string(object.Driver))
deviceParams = append(deviceParams, fmt.Sprintf("id=%s", object.DeviceID))
deviceParams = append(deviceParams, fmt.Sprintf("host-path=%s", object.File))
}
if len(deviceParams) > 0 {
@@ -1722,6 +1730,106 @@ func (b PCIeRootPortDevice) Valid() bool {
return true
}
// PCIeSwitchUpstreamPortDevice is the port connecting to the root port
type PCIeSwitchUpstreamPortDevice struct {
ID string // format: sup{n}, n>=0
Bus string // default is rp0
}
// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice.
func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string {
var qemuParams []string
var deviceParams []string
driver := PCIeSwitchUpstreamPort
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete.
func (b PCIeSwitchUpstreamPortDevice) Valid() bool {
if b.ID == "" {
return false
}
if b.Bus == "" {
return false
}
return true
}
// PCIeSwitchDownstreamPortDevice is a switch port to which PCIe devices can be hot-plugged; it connects to the switch's upstream port
type PCIeSwitchDownstreamPortDevice struct {
ID string // format: sup{n}, n>=0
Bus string // default is rp0
Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00
Slot string // >=0, default is 0x00
// For this to work, QEMU patches are needed
BusReserve string
// Pref64 and Pref32 are not allowed to be set simultaneously
Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit
Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit
MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only*
IOReserve string // IO reservation
}
// QemuParams returns the qemu parameters built out of the PCIeSwitchDownstreamPortDevice.
func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string {
var qemuParams []string
var deviceParams []string
driver := PCIeSwitchDownstreamPort
deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID))
deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus))
deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis))
deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot))
if b.BusReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve))
}
if b.Pref64Reserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve))
}
if b.Pref32Reserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve))
}
if b.MemReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve))
}
if b.IOReserve != "" {
deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve))
}
qemuParams = append(qemuParams, "-device")
qemuParams = append(qemuParams, strings.Join(deviceParams, ","))
return qemuParams
}
// Valid returns true if the PCIeSwitchDownstreamPortDevice structure is valid and complete.
func (b PCIeSwitchDownstreamPortDevice) Valid() bool {
if b.ID == "" {
return false
}
if b.Bus == "" {
return false
}
if b.Chassis == "" {
return false
}
if b.Slot == "" {
return false
}
return true
}
// VFIODevice represents a qemu vfio device meant for direct access by guest OS.
type VFIODevice struct {
// Bus-Device-Function of device
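
As a usage illustration for the two new switch-port device types (the IDs and bus names here are made up; the resulting -device arguments follow directly from the QemuParams implementations above):

package main

import (
	"fmt"

	govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)

func main() {
	up := govmmQemu.PCIeSwitchUpstreamPortDevice{ID: "swup0", Bus: "rp0"}
	down := govmmQemu.PCIeSwitchDownstreamPortDevice{
		ID:      "swdp0",
		Bus:     "swup0", // downstream ports hang off the switch upstream port
		Chassis: "0x00",
		Slot:    "0x00",
	}
	// Roughly:
	//   -device x3130-upstream,id=swup0,bus=rp0
	//   -device xio3130-downstream,id=swdp0,bus=swup0,chassis=0x00,slot=0x00
	fmt.Println(append(up.QemuParams(nil), down.QemuParams(nil)...))
}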

View File

@@ -5,7 +5,7 @@
package hypervisors
import "fmt"
import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
// Bridge is a bridge where devices can be hot plugged
type Bridge struct {
@@ -28,37 +28,8 @@ type CPUDevice struct {
ID string
}
// PCIePort distinguish only between root and switch port
type PCIePort string
const (
// RootPort attach VFIO devices to a root-port
RootPort PCIePort = "root-port"
// SwitchPort attach VFIO devices to a switch-port
SwitchPort = "switch-port"
// BridgePort is the default
BridgePort = "bridge-port"
// NoPort is for disabling VFIO hotplug/coldplug
NoPort = "no-port"
)
func (p PCIePort) String() string {
switch p {
case RootPort:
return "root-port"
case SwitchPort:
return "switch-port"
case BridgePort:
return "bridge-port"
case NoPort:
return "no-port"
}
return fmt.Sprintf("<unknown PCIePort: %s>", string(p))
}
type HypervisorState struct {
BlockIndexMap map[int]struct{}
// Type of hypervisor, E.g. qemu/firecracker/acrn.
Type string
UUID string
@@ -74,7 +45,7 @@ type HypervisorState struct {
HotpluggedMemory int
VirtiofsDaemonPid int
Pid int
PCIeRootPort int
ColdPlugVFIO PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
HotplugVFIOOnRootBus bool
}

View File

@@ -14,7 +14,7 @@ import (
"strconv"
"testing"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/assert"
)
@@ -224,8 +224,8 @@ type RuntimeConfigOptions struct {
JaegerUser string
JaegerPassword string
PFlash []string
PCIeRootPort uint32
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
DefaultVCPUCount uint32
DefaultMaxVCPUCount uint32
DefaultMemSize uint32
@@ -318,7 +318,6 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string {
disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + `
enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + `
hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + `
pcie_root_port = ` + strconv.FormatUint(uint64(config.PCIeRootPort), 10) + `
cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `"
msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + `
enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + `

View File

@@ -10,7 +10,7 @@
package katautils
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
config "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
)
// name is the name of the runtime
@@ -82,7 +82,6 @@ const defaultEnableDebug bool = false
const defaultDisableNestingChecks bool = false
const defaultMsize9p uint32 = 8192
const defaultHotplugVFIOOnRootBus bool = false
const defaultPCIeRootPort = 0
const defaultEntropySource = "/dev/urandom"
const defaultGuestHookPath string = ""
const defaultVirtioFSCacheMode = "never"
@@ -115,4 +114,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock"
// Default config file used by stateless systems.
var defaultRuntimeConfiguration = "@CONFIG_PATH@"
const defaultColdPlugVFIO = hv.NoPort
const defaultHotPlugVFIO = config.NoPort
const defaultColdPlugVFIO = config.NoPort

View File

@@ -20,7 +20,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -79,98 +78,98 @@ type factory struct {
}
type hypervisor struct {
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
SEVCertChainPath string `toml:"sev_cert_chain"`
BlockDeviceAIO string `toml:"block_device_aio"`
RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
PCIeRootPort uint32 `toml:"pcie_root_port"`
GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
ColdPlugVFIO hv.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
GuestPreAttestation bool `toml:"guest_pre_attestation"`
Path string `toml:"path"`
JailerPath string `toml:"jailer_path"`
Kernel string `toml:"kernel"`
CtlPath string `toml:"ctlpath"`
Initrd string `toml:"initrd"`
Image string `toml:"image"`
RootfsType string `toml:"rootfs_type"`
Firmware string `toml:"firmware"`
FirmwareVolume string `toml:"firmware_volume"`
MachineAccelerators string `toml:"machine_accelerators"`
CPUFeatures string `toml:"cpu_features"`
KernelParams string `toml:"kernel_params"`
MachineType string `toml:"machine_type"`
BlockDeviceDriver string `toml:"block_device_driver"`
EntropySource string `toml:"entropy_source"`
SharedFS string `toml:"shared_fs"`
VirtioFSDaemon string `toml:"virtio_fs_daemon"`
VirtioFSCache string `toml:"virtio_fs_cache"`
VhostUserStorePath string `toml:"vhost_user_store_path"`
FileBackedMemRootDir string `toml:"file_mem_backend"`
GuestHookPath string `toml:"guest_hook_path"`
GuestMemoryDumpPath string `toml:"guest_memory_dump_path"`
SeccompSandbox string `toml:"seccompsandbox"`
GuestPreAttestationURI string `toml:"guest_pre_attestation_kbs_uri"`
GuestPreAttestationMode string `toml:"guest_pre_attestation_kbs_mode"`
GuestPreAttestationKeyset string `toml:"guest_pre_attestation_keyset"`
SEVCertChainPath string `toml:"sev_cert_chain"`
BlockDeviceAIO string `toml:"block_device_aio"`
RemoteHypervisorSocket string `toml:"remote_hypervisor_socket"`
HypervisorPathList []string `toml:"valid_hypervisor_paths"`
JailerPathList []string `toml:"valid_jailer_paths"`
CtlPathList []string `toml:"valid_ctlpaths"`
VirtioFSDaemonList []string `toml:"valid_virtio_fs_daemon_paths"`
VirtioFSExtraArgs []string `toml:"virtio_fs_extra_args"`
PFlashList []string `toml:"pflashes"`
VhostUserStorePathList []string `toml:"valid_vhost_user_store_paths"`
FileBackedMemRootList []string `toml:"valid_file_mem_backends"`
EntropySourceList []string `toml:"valid_entropy_sources"`
EnableAnnotations []string `toml:"enable_annotations"`
RxRateLimiterMaxRate uint64 `toml:"rx_rate_limiter_max_rate"`
TxRateLimiterMaxRate uint64 `toml:"tx_rate_limiter_max_rate"`
MemOffset uint64 `toml:"memory_offset"`
DefaultMaxMemorySize uint64 `toml:"default_maxmemory"`
DiskRateLimiterBwMaxRate int64 `toml:"disk_rate_limiter_bw_max_rate"`
DiskRateLimiterBwOneTimeBurst int64 `toml:"disk_rate_limiter_bw_one_time_burst"`
DiskRateLimiterOpsMaxRate int64 `toml:"disk_rate_limiter_ops_max_rate"`
DiskRateLimiterOpsOneTimeBurst int64 `toml:"disk_rate_limiter_ops_one_time_burst"`
NetRateLimiterBwMaxRate int64 `toml:"net_rate_limiter_bw_max_rate"`
NetRateLimiterBwOneTimeBurst int64 `toml:"net_rate_limiter_bw_one_time_burst"`
NetRateLimiterOpsMaxRate int64 `toml:"net_rate_limiter_ops_max_rate"`
NetRateLimiterOpsOneTimeBurst int64 `toml:"net_rate_limiter_ops_one_time_burst"`
VirtioFSCacheSize uint32 `toml:"virtio_fs_cache_size"`
VirtioFSQueueSize uint32 `toml:"virtio_fs_queue_size"`
DefaultMaxVCPUs uint32 `toml:"default_maxvcpus"`
MemorySize uint32 `toml:"default_memory"`
MemSlots uint32 `toml:"memory_slots"`
DefaultBridges uint32 `toml:"default_bridges"`
Msize9p uint32 `toml:"msize_9p"`
GuestPreAttestationGRPCTimeout uint32 `toml:"guest_pre_attestation_grpc_timeout"`
SEVGuestPolicy uint32 `toml:"sev_guest_policy"`
SNPGuestPolicy uint64 `toml:"snp_guest_policy"`
RemoteHypervisorTimeout uint32 `toml:"remote_hypervisor_timeout"`
NumVCPUs int32 `toml:"default_vcpus"`
BlockDeviceCacheSet bool `toml:"block_device_cache_set"`
BlockDeviceCacheDirect bool `toml:"block_device_cache_direct"`
BlockDeviceCacheNoflush bool `toml:"block_device_cache_noflush"`
EnableVhostUserStore bool `toml:"enable_vhost_user_store"`
VhostUserDeviceReconnect uint32 `toml:"vhost_user_reconnect_timeout_sec"`
DisableBlockDeviceUse bool `toml:"disable_block_device_use"`
MemPrealloc bool `toml:"enable_mem_prealloc"`
HugePages bool `toml:"enable_hugepages"`
VirtioMem bool `toml:"enable_virtio_mem"`
IOMMU bool `toml:"enable_iommu"`
IOMMUPlatform bool `toml:"enable_iommu_platform"`
Debug bool `toml:"enable_debug"`
DisableNestingChecks bool `toml:"disable_nesting_checks"`
EnableIOThreads bool `toml:"enable_iothreads"`
DisableImageNvdimm bool `toml:"disable_image_nvdimm"`
HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"`
HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"`
ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"`
DisableVhostNet bool `toml:"disable_vhost_net"`
GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"`
ConfidentialGuest bool `toml:"confidential_guest"`
SevSnpGuest bool `toml:"sev_snp_guest"`
GuestSwap bool `toml:"enable_guest_swap"`
Rootless bool `toml:"rootless"`
DisableSeccomp bool `toml:"disable_seccomp"`
DisableSeLinux bool `toml:"disable_selinux"`
DisableGuestSeLinux bool `toml:"disable_guest_selinux"`
LegacySerial bool `toml:"use_legacy_serial"`
GuestPreAttestation bool `toml:"guest_pre_attestation"`
}
type runtime struct {
@@ -298,12 +297,18 @@ func (h hypervisor) firmware() (string, error) {
return ResolvePath(p)
}
func (h hypervisor) coldPlugVFIO() hv.PCIePort {
func (h hypervisor) coldPlugVFIO() config.PCIePort {
if h.ColdPlugVFIO == "" {
return defaultColdPlugVFIO
}
return h.ColdPlugVFIO
}
func (h hypervisor) hotPlugVFIO() config.PCIePort {
if h.HotPlugVFIO == "" {
return defaultHotPlugVFIO
}
return h.HotPlugVFIO
}
func (h hypervisor) firmwareVolume() (string, error) {
p := h.FirmwareVolume
@@ -523,7 +528,7 @@ func (h hypervisor) blockDeviceAIO() (string, error) {
}
func (h hypervisor) sharedFS() (string, error) {
supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus}
supportedSharedFS := []string{config.Virtio9P, config.VirtioFS, config.VirtioFSNydus, config.NoSharedFS}
if h.SharedFS == "" {
return config.VirtioFS, nil
@@ -838,6 +843,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
KernelPath: kernel,
InitrdPath: initrd,
ImagePath: image,
RootfsType: rootfsType,
FirmwarePath: firmware,
FirmwareVolumePath: firmwareVolume,
PFlash: pflashes,
@@ -880,8 +886,8 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
DisableImageNvdimm: h.DisableImageNvdimm,
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
HotPlugVFIO: h.hotPlugVFIO(),
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
DisableVhostNet: h.DisableVhostNet,
EnableVhostUserStore: h.EnableVhostUserStore,
VhostUserStorePath: h.vhostUserStorePath(),
@@ -907,7 +913,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
SNPGuestPolicy: h.getSnpGuestPolicy(),
SEVCertChainPath: h.SEVCertChainPath,
DisableGuestSeLinux: h.DisableGuestSeLinux,
RootfsType: rootfsType,
}, nil
}
@@ -1034,11 +1039,12 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
return vc.HypervisorConfig{}, err
}
if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus {
return vc.HypervisorConfig{}, errors.New("clh only support virtio-fs or virtio-fs-nydus")
if sharedFS != config.VirtioFS && sharedFS != config.VirtioFSNydus && sharedFS != config.NoSharedFS {
return vc.HypervisorConfig{},
fmt.Errorf("Cloud Hypervisor does not support %s shared filesystem option", sharedFS)
}
if h.VirtioFSDaemon == "" {
if (sharedFS == config.VirtioFS || sharedFS == config.VirtioFSNydus) && h.VirtioFSDaemon == "" {
return vc.HypervisorConfig{},
fmt.Errorf("cannot enable %s without daemon path in configuration file", sharedFS)
}
@@ -1084,7 +1090,7 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
Msize9p: h.msize9p(),
HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus,
ColdPlugVFIO: h.coldPlugVFIO(),
PCIeRootPort: h.PCIeRootPort,
HotPlugVFIO: h.hotPlugVFIO(),
DisableVhostNet: true,
GuestHookPath: h.guestHookPath(),
VirtioFSExtraArgs: h.VirtioFSExtraArgs,
@@ -1302,6 +1308,7 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
KernelPath: defaultKernelPath,
ImagePath: defaultImagePath,
InitrdPath: defaultInitrdPath,
RootfsType: defaultRootfsType,
FirmwarePath: defaultFirmwarePath,
FirmwareVolumePath: defaultFirmwareVolumePath,
MachineAccelerators: defaultMachineAccelerators,
@@ -1330,9 +1337,10 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
Msize9p: defaultMsize9p,
HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus,
ColdPlugVFIO: defaultColdPlugVFIO,
PCIeRootPort: defaultPCIeRootPort,
HotPlugVFIO: defaultHotPlugVFIO,
GuestHookPath: defaultGuestHookPath,
VhostUserStorePath: defaultVhostUserStorePath,
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
VirtioFSCache: defaultVirtioFSCacheMode,
DisableImageNvdimm: defaultDisableImageNvdimm,
RxRateLimiterMaxRate: defaultRxRateLimiterMaxRate,
@@ -1352,8 +1360,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig {
SEVGuestPolicy: defaultSEVGuestPolicy,
SNPGuestPolicy: defaultSNPGuestPolicy,
SEVCertChainPath: defaultSEVCertChainPath,
VhostUserDeviceReconnect: defaultVhostUserDeviceReconnect,
RootfsType: defaultRootfsType,
}
}
@@ -1711,9 +1717,10 @@ func checkConfig(config oci.RuntimeConfig) error {
return err
}
hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO
coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO
machineType := config.HypervisorConfig.HypervisorMachineType
if err := checkPCIeConfig(coldPlugVFIO, machineType); err != nil {
if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType); err != nil {
return err
}
@@ -1723,18 +1730,32 @@ func checkConfig(config oci.RuntimeConfig) error {
// checkPCIeConfig ensures the PCIe configuration is valid.
// Only allow one of the following settings for hot-plug or cold-plug:
// no-port, bridge-port, root-port, switch-port
func checkPCIeConfig(vfioPort hv.PCIePort, machineType string) error {
func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error {
// Currently only QEMU q35 supports advanced PCIe topologies
// firecracker, dragonball do not have right now any PCIe support
if machineType != "q35" {
return nil
}
if vfioPort == hv.NoPort || vfioPort == hv.RootPort || vfioPort == hv.SwitchPort {
if coldPlug != config.NoPort && hotPlug != config.NoPort {
return fmt.Errorf("invalid hot-plug=%s and cold-plug=%s settings, only one of them can be set", coldPlug, hotPlug)
}
if coldPlug == config.NoPort && hotPlug == config.NoPort {
return nil
}
var port config.PCIePort
if coldPlug != config.NoPort {
port = coldPlug
}
if hotPlug != config.NoPort {
port = hotPlug
}
if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort {
return nil
}
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s",
vfioPort, hv.NoPort, hv.RootPort, hv.SwitchPort)
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s",
coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort)
}
// checkNetNsConfig performs sanity checks on disable_new_netns config.
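
To make the new mutual-exclusion rule concrete, a few representative calls (an in-package sketch, since checkPCIeConfig is unexported; the expected results are noted in comments):

package katautils

import "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"

func examplePCIeConfigChecks() {
	// valid: only cold-plug requests a port
	_ = checkPCIeConfig(config.RootPort, config.NoPort, "q35") // nil
	// valid: only hot-plug requests a port
	_ = checkPCIeConfig(config.NoPort, config.BridgePort, "q35") // nil
	// invalid: hot-plug and cold-plug cannot both be enabled
	_ = checkPCIeConfig(config.RootPort, config.RootPort, "q35") // error
	// machine types other than q35 skip the check entirely
	_ = checkPCIeConfig(config.RootPort, config.RootPort, "microvm") // nil
}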

View File

@@ -18,8 +18,8 @@ import (
"syscall"
"testing"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/govmm"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
ktu "github.com/kata-containers/kata-containers/src/runtime/pkg/katatestutils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
@@ -63,15 +63,16 @@ func createConfig(configPath string, fileData string) error {
// createAllRuntimeConfigFiles creates all files necessary to call
// loadConfiguration().
func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConfig, err error) {
func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntimeConfig, err error) {
if dir == "" {
return config, fmt.Errorf("BUG: need directory")
return testConfig, fmt.Errorf("BUG: need directory")
}
if hypervisor == "" {
return config, fmt.Errorf("BUG: need hypervisor")
return testConfig, fmt.Errorf("BUG: need hypervisor")
}
var coldPlugVFIO hv.PCIePort
var hotPlugVFIO config.PCIePort
var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
kernelParams := "foo=bar xyz"
@@ -85,8 +86,8 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
blockDeviceAIO := "io_uring"
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
hotPlugVFIO = config.NoPort
coldPlugVFIO = config.BridgePort
disableNewNetNs := false
sharedFS := "virtio-9p"
virtioFSdaemon := path.Join(dir, "virtiofsd")
@@ -108,7 +109,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
BlockDeviceAIO: blockDeviceAIO,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
DisableNewNetNs: disableNewNetNs,
DefaultVCPUCount: defaultVCPUCount,
@@ -134,7 +135,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
configPath := path.Join(dir, "runtime.toml")
err = createConfig(configPath, runtimeConfigFileData)
if err != nil {
return config, err
return testConfig, err
}
configPathLink := path.Join(filepath.Dir(configPath), "link-to-configuration.toml")
@@ -142,7 +143,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create a link to the config file
err = syscall.Symlink(configPath, configPathLink)
if err != nil {
return config, err
return testConfig, err
}
files := []string{hypervisorPath, kernelPath, imagePath}
@@ -151,7 +152,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
// create the resource (which must be >0 bytes)
err := WriteFile(file, "foo", testFileMode)
if err != nil {
return config, err
return testConfig, err
}
}
@@ -172,7 +173,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
DefaultBridges: defaultBridgesCount,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
HotPlugVFIO: hotPlugVFIO,
ColdPlugVFIO: coldPlugVFIO,
Msize9p: defaultMsize9p,
MemSlots: defaultMemSlots,
@@ -217,10 +218,10 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
err = SetKernelParams(&runtimeConfig)
if err != nil {
return config, err
return testConfig, err
}
config = testRuntimeConfig{
rtimeConfig := testRuntimeConfig{
RuntimeConfig: runtimeConfig,
RuntimeConfigFile: configPath,
ConfigPath: configPath,
@@ -229,7 +230,7 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (config testRuntimeConf
LogPath: logPath,
}
return config, nil
return rtimeConfig, nil
}
// testLoadConfiguration accepts an optional function that can be used
@@ -570,6 +571,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
BlockDeviceAIO: defaultBlockDeviceAIO,
DisableGuestSeLinux: defaultDisableGuestSeLinux,
SNPGuestPolicy: defaultSNPGuestPolicy,
HotPlugVFIO: defaultHotPlugVFIO,
ColdPlugVFIO: defaultColdPlugVFIO,
}
@@ -604,7 +606,7 @@ func TestMinimalRuntimeConfig(t *testing.T) {
func TestNewQemuHypervisorConfig(t *testing.T) {
dir := t.TempDir()
var coldPlugVFIO hv.PCIePort
var coldPlugVFIO config.PCIePort
hypervisorPath := path.Join(dir, "hypervisor")
kernelPath := path.Join(dir, "kernel")
imagePath := path.Join(dir, "image")
@@ -612,8 +614,7 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
coldPlugVFIO = hv.RootPort
coldPlugVFIO = config.BridgePort
orgVHostVSockDevicePath := utils.VHostVSockDevicePath
blockDeviceAIO := "io_uring"
defer func() {
@@ -632,7 +633,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
ColdPlugVFIO: coldPlugVFIO,
RxRateLimiterMaxRate: rxRateLimiterMaxRate,
TxRateLimiterMaxRate: txRateLimiterMaxRate,
@@ -688,10 +688,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) {
t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus)
}
if config.PCIeRootPort != pcieRootPort {
t.Errorf("Expected value for PCIeRootPort %v, got %v", pcieRootPort, config.PCIeRootPort)
}
if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate {
t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate)
}
@@ -814,7 +810,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
disableBlock := true
enableIOThreads := true
hotplugVFIOOnRootBus := true
pcieRootPort := uint32(2)
hypervisor := hypervisor{
Path: hypervisorPath,
@@ -825,7 +820,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) {
DisableBlockDeviceUse: disableBlock,
EnableIOThreads: enableIOThreads,
HotplugVFIOOnRootBus: hotplugVFIOOnRootBus,
PCIeRootPort: pcieRootPort,
}
_, err := newQemuHypervisorConfig(hypervisor)

View File

@@ -460,6 +460,10 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := addHypervisorHotColdPlugVfioOverrides(ocispec, config); err != nil {
return err
}
if value, ok := ocispec.Annotations[vcAnnotations.MachineType]; ok {
if value != "" {
config.HypervisorConfig.HypervisorMachineType = value
@@ -515,12 +519,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig,
return err
}
if err := newAnnotationConfiguration(ocispec, vcAnnotations.PCIeRootPort).setUint(func(pcieRootPort uint64) {
config.HypervisorConfig.PCIeRootPort = uint32(pcieRootPort)
}); err != nil {
return err
}
if value, ok := ocispec.Annotations[vcAnnotations.EntropySource]; ok {
if !checkPathIsInGlobs(runtime.HypervisorConfig.EntropySourceList, value) {
return fmt.Errorf("entropy source %v required from annotation is not valid", value)
@@ -583,6 +581,37 @@ func addHypervisorPathOverrides(ocispec specs.Spec, config *vc.SandboxConfig, ru
return nil
}
func addHypervisorPCIePortOverride(value string) (config.PCIePort, error) {
if value == "" {
return config.NoPort, nil
}
port := config.PCIePort(value)
if port.Invalid() {
return config.InvalidPort, fmt.Errorf("Invalid PCIe port \"%v\" specified in annotation", value)
}
return port, nil
}
func addHypervisorHotColdPlugVfioOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) error {
var err error
if value, ok := ocispec.Annotations[vcAnnotations.HotPlugVFIO]; ok {
if sbConfig.HypervisorConfig.HotPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
return err
}
// If hot-plug is specified disable cold-plug and vice versa
sbConfig.HypervisorConfig.ColdPlugVFIO = config.NoPort
}
if value, ok := ocispec.Annotations[vcAnnotations.ColdPlugVFIO]; ok {
if sbConfig.HypervisorConfig.ColdPlugVFIO, err = addHypervisorPCIePortOverride(value); err != nil {
return err
}
// If cold-plug is specified disable hot-plug and vice versa
sbConfig.HypervisorConfig.HotPlugVFIO = config.NoPort
}
return nil
}
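For context, a minimal standalone sketch (not part of the diff) of how these two overrides are expected to interact: whichever annotation is present wins and resets the other side to the no-port value, making hot-plug and cold-plug mutually exclusive. The full annotation keys and the port string values are assumptions; the authoritative definitions live in the runtime's annotations and device config packages.

package main

import "fmt"

// PCIePort mirrors the runtime's config.PCIePort for illustration only.
type PCIePort string

const (
	NoPort   PCIePort = "no-port"   // assumed string value
	RootPort PCIePort = "root-port" // assumed string value
)

// applyVFIOPlugAnnotations mimics addHypervisorHotColdPlugVfioOverrides:
// the annotation that is set wins and disables the other plug mode.
func applyVFIOPlugAnnotations(annotations map[string]string, hot, cold PCIePort) (PCIePort, PCIePort) {
	if v, ok := annotations["io.katacontainers.config.hypervisor.hot_plug_vfio"]; ok { // assumed full key
		hot, cold = PCIePort(v), NoPort
	}
	if v, ok := annotations["io.katacontainers.config.hypervisor.cold_plug_vfio"]; ok { // assumed full key
		cold, hot = PCIePort(v), NoPort
	}
	return hot, cold
}

func main() {
	hot, cold := applyVFIOPlugAnnotations(
		map[string]string{"io.katacontainers.config.hypervisor.cold_plug_vfio": "root-port"},
		NoPort, NoPort)
	fmt.Println(hot, cold) // Output: no-port root-port
}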
func addHypervisorMemoryOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig, runtime RuntimeConfig) error {
if err := newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMemory).setUintWithCheck(func(memorySz uint64) error {

View File

@@ -599,7 +599,7 @@ func TestContainerPipeSizeAnnotation(t *testing.T) {
func TestAddHypervisorAnnotations(t *testing.T) {
assert := assert.New(t)
config := vc.SandboxConfig{
sbConfig := vc.SandboxConfig{
Annotations: make(map[string]string),
}
@@ -628,8 +628,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
runtimeConfig.HypervisorConfig.VirtioFSDaemonList = []string{"/bin/*ls*"}
ocispec.Annotations[vcAnnotations.KernelParams] = "vsyscall=emulate iommu=on"
addHypervisorConfigOverrides(ocispec, &config, runtimeConfig)
assert.Exactly(expectedHyperConfig, config.HypervisorConfig)
addHypervisorConfigOverrides(ocispec, &sbConfig, runtimeConfig)
assert.Exactly(expectedHyperConfig, sbConfig.HypervisorConfig)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"
@@ -660,7 +660,8 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/"
ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true"
ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true"
ocispec.Annotations[vcAnnotations.PCIeRootPort] = "2"
ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort
ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort
ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true"
ocispec.Annotations[vcAnnotations.SGXEPC] = "64Mi"
ocispec.Annotations[vcAnnotations.UseLegacySerial] = "true"
@@ -668,55 +669,58 @@ func TestAddHypervisorAnnotations(t *testing.T) {
ocispec.Annotations[vcAnnotations.RxRateLimiterMaxRate] = "10000000"
ocispec.Annotations[vcAnnotations.TxRateLimiterMaxRate] = "10000000"
addAnnotations(ocispec, &config, runtimeConfig)
assert.Equal(config.HypervisorConfig.NumVCPUs, uint32(1))
assert.Equal(config.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
assert.Equal(config.HypervisorConfig.MemorySize, uint32(1024))
assert.Equal(config.HypervisorConfig.MemSlots, uint32(20))
assert.Equal(config.HypervisorConfig.MemOffset, uint64(512))
assert.Equal(config.HypervisorConfig.VirtioMem, true)
assert.Equal(config.HypervisorConfig.MemPrealloc, true)
assert.Equal(config.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
assert.Equal(config.HypervisorConfig.HugePages, true)
assert.Equal(config.HypervisorConfig.IOMMU, true)
assert.Equal(config.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
assert.Equal(config.HypervisorConfig.BlockDeviceAIO, "io_uring")
assert.Equal(config.HypervisorConfig.DisableBlockDeviceUse, true)
assert.Equal(config.HypervisorConfig.EnableIOThreads, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheSet, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheDirect, true)
assert.Equal(config.HypervisorConfig.BlockDeviceCacheNoflush, true)
assert.Equal(config.HypervisorConfig.SharedFS, "virtio-fs")
assert.Equal(config.HypervisorConfig.VirtioFSDaemon, "/bin/false")
assert.Equal(config.HypervisorConfig.VirtioFSCache, "auto")
assert.ElementsMatch(config.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
assert.Equal(config.HypervisorConfig.Msize9p, uint32(512))
assert.Equal(config.HypervisorConfig.HypervisorMachineType, "q35")
assert.Equal(config.HypervisorConfig.MachineAccelerators, "nofw")
assert.Equal(config.HypervisorConfig.CPUFeatures, "pmu=off")
assert.Equal(config.HypervisorConfig.DisableVhostNet, true)
assert.Equal(config.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(config.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(config.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(config.HypervisorConfig.PCIeRootPort, uint32(2))
assert.Equal(config.HypervisorConfig.IOMMUPlatform, true)
assert.Equal(config.HypervisorConfig.SGXEPCSize, int64(67108864))
assert.Equal(config.HypervisorConfig.LegacySerial, true)
assert.Equal(config.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
assert.Equal(config.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
err := addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.NoError(err)
assert.Equal(sbConfig.HypervisorConfig.NumVCPUs, uint32(1))
assert.Equal(sbConfig.HypervisorConfig.DefaultMaxVCPUs, uint32(1))
assert.Equal(sbConfig.HypervisorConfig.MemorySize, uint32(1024))
assert.Equal(sbConfig.HypervisorConfig.MemSlots, uint32(20))
assert.Equal(sbConfig.HypervisorConfig.MemOffset, uint64(512))
assert.Equal(sbConfig.HypervisorConfig.VirtioMem, true)
assert.Equal(sbConfig.HypervisorConfig.MemPrealloc, true)
assert.Equal(sbConfig.HypervisorConfig.FileBackedMemRootDir, "/dev/shm")
assert.Equal(sbConfig.HypervisorConfig.HugePages, true)
assert.Equal(sbConfig.HypervisorConfig.IOMMU, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceDriver, "virtio-scsi")
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceAIO, "io_uring")
assert.Equal(sbConfig.HypervisorConfig.DisableBlockDeviceUse, true)
assert.Equal(sbConfig.HypervisorConfig.EnableIOThreads, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheSet, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheDirect, true)
assert.Equal(sbConfig.HypervisorConfig.BlockDeviceCacheNoflush, true)
assert.Equal(sbConfig.HypervisorConfig.SharedFS, "virtio-fs")
assert.Equal(sbConfig.HypervisorConfig.VirtioFSDaemon, "/bin/false")
assert.Equal(sbConfig.HypervisorConfig.VirtioFSCache, "auto")
assert.ElementsMatch(sbConfig.HypervisorConfig.VirtioFSExtraArgs, [2]string{"arg0", "arg1"})
assert.Equal(sbConfig.HypervisorConfig.Msize9p, uint32(512))
assert.Equal(sbConfig.HypervisorConfig.HypervisorMachineType, "q35")
assert.Equal(sbConfig.HypervisorConfig.MachineAccelerators, "nofw")
assert.Equal(sbConfig.HypervisorConfig.CPUFeatures, "pmu=off")
assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true)
assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/")
assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true)
assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true)
assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort))
assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort))
assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true)
assert.Equal(sbConfig.HypervisorConfig.SGXEPCSize, int64(67108864))
assert.Equal(sbConfig.HypervisorConfig.LegacySerial, true)
assert.Equal(sbConfig.HypervisorConfig.RxRateLimiterMaxRate, uint64(10000000))
assert.Equal(sbConfig.HypervisorConfig.TxRateLimiterMaxRate, uint64(10000000))
// In case an absurdly large value is provided, the config value is not overridden
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "655536"
err := addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "-1"
err = addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultVCPUs] = "1"
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "-1"
err = addAnnotations(ocispec, &config, runtimeConfig)
err = addAnnotations(ocispec, &sbConfig, runtimeConfig)
assert.Error(err)
ocispec.Annotations[vcAnnotations.DefaultMaxVCPUs] = "1"

View File

@@ -90,7 +90,7 @@ func (a *Acrn) Capabilities(ctx context.Context) types.Capabilities {
span, _ := katatrace.Trace(ctx, a.Logger(), "Capabilities", acrnTracingTags, map[string]string{"sandbox_id": a.id})
defer span.End()
return a.arch.capabilities()
return a.arch.capabilities(a.config)
}
func (a *Acrn) HypervisorConfig() HypervisorConfig {

View File

@@ -33,7 +33,7 @@ type acrnArch interface {
kernelParameters(debug bool) []Param
//capabilities returns the capabilities supported by acrn
capabilities() types.Capabilities
capabilities(config HypervisorConfig) types.Capabilities
// memoryTopology returns the memory topology using the given amount of memoryMb and hostMemoryMb
memoryTopology(memMb uint64) Memory
@@ -361,7 +361,7 @@ func (a *acrnArchBase) memoryTopology(memoryMb uint64) Memory {
return memory
}
func (a *acrnArchBase) capabilities() types.Capabilities {
func (a *acrnArchBase) capabilities(config HypervisorConfig) types.Capabilities {
var caps types.Capabilities
caps.SetBlockDeviceSupport()

View File

@@ -83,8 +83,9 @@ func TestAcrnArchBaseKernelParameters(t *testing.T) {
func TestAcrnArchBaseCapabilities(t *testing.T) {
assert := assert.New(t)
acrnArchBase := newAcrnArchBase()
config := HypervisorConfig{}
c := acrnArchBase.capabilities()
c := acrnArchBase.capabilities(config)
assert.True(c.IsBlockDeviceSupported())
assert.True(c.IsBlockDeviceHotplugSupported())
assert.False(c.IsFsSharingSupported())

View File

@@ -349,6 +349,10 @@ func (clh *cloudHypervisor) createVirtiofsDaemon(sharedPath string) (VirtiofsDae
}
func (clh *cloudHypervisor) setupVirtiofsDaemon(ctx context.Context) error {
if clh.config.SharedFS == config.NoSharedFS {
return nil
}
if clh.config.SharedFS == config.Virtio9P {
return errors.New("cloud-hypervisor only supports virtio based file sharing")
}
@@ -860,12 +864,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
defer cancel()
// Create the clh device config via the constructor to ensure default values are properly assigned
clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev())
clhDevice := *chclient.NewDeviceConfig(device.SysfsDev)
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
if err != nil {
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
}
clh.devicesIds[*(*device).GetID()] = pciInfo.GetId()
clh.devicesIds[device.ID] = pciInfo.GetId()
// clh doesn't use bridges, so the PCI path is simply the slot
// number of the device. This will break if clh starts using
@@ -882,14 +886,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf)
}
guestPciPath, err := types.PciPathFromString(tokens[0])
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
if device.Type == config.VFIOAPDeviceMediatedType {
return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device)
}
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
device.GuestPciPath, err = types.PciPathFromString(tokens[0])
return err
}
@@ -933,7 +934,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
case BlockDev:
deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index)
case VfioDev:
deviceID = *devInfo.(config.VFIODev).GetID()
deviceID = devInfo.(*config.VFIODev).ID
default:
clh.Logger().WithFields(log.Fields{"devInfo": devInfo,
"deviceType": devType}).Error("HotplugRemoveDevice: unsupported device")
@@ -1210,7 +1211,9 @@ func (clh *cloudHypervisor) Capabilities(ctx context.Context) types.Capabilities
clh.Logger().WithField("function", "Capabilities").Info("get Capabilities")
var caps types.Capabilities
caps.SetFsSharingSupport()
if clh.config.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
caps.SetBlockDeviceHotplugSupport()
return caps
}

View File

@@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
assert.NoError(err, "Hotplug remove block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev)
_, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev)
assert.NoError(err, "Hotplug remove vfio block device expected no error")
_, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev)
@@ -726,3 +726,30 @@ func TestClhSetConfig(t *testing.T) {
assert.Equal(clh.config, config)
}
func TestClhCapabilities(t *testing.T) {
assert := assert.New(t)
hConfig, err := newClhConfig()
assert.NoError(err)
clh := &cloudHypervisor{}
assert.Equal(clh.config, HypervisorConfig{})
hConfig.SharedFS = config.VirtioFS
err = clh.setConfig(&hConfig)
assert.NoError(err)
var ctx context.Context
c := clh.Capabilities(ctx)
assert.True(c.IsFsSharingSupported())
hConfig.SharedFS = config.NoSharedFS
err = clh.setConfig(&hConfig)
assert.NoError(err)
c = clh.Capabilities(ctx)
assert.False(c.IsFsSharingSupported())
}

View File

@@ -288,12 +288,12 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root port, switch, bridge or no port
HotPlugVFIO hv.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port, switch or no port
// root port, switch, bridge or no port
ColdPlugVFIO hv.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM

View File

@@ -389,7 +389,6 @@ type HypervisorConfig struct {
Gid uint32
SEVGuestPolicy uint32
SNPGuestPolicy uint64
PCIeRootPort uint32
NumVCPUs uint32
RemoteHypervisorTimeout uint32
IOMMUPlatform bool
@@ -420,7 +419,10 @@ type HypervisorConfig struct {
DisableSeLinux bool
DisableGuestSeLinux bool
LegacySerial bool
ColdPlugVFIO hv.PCIePort
HotPlugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
VFIODevices []config.DeviceInfo
VhostUserBlkDevices []config.DeviceInfo
}
// vcpu mapping from vcpu number to thread number

View File

@@ -21,6 +21,7 @@ import (
"github.com/docker/go-units"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/api"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/uuid"
@@ -1148,7 +1149,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
ContainerPath: dev.ContainerPath,
Type: kataVfioPciDevType,
Id: groupNum,
Options: nil,
Options: make([]string, len(devList)),
}
// We always pass the device information to the agent, since
@@ -1158,16 +1159,16 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c *
if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel {
kataDevice.Type = kataVfioPciGuestKernelDevType
}
for i, dev := range devList {
if dev.Type == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = dev.APDevices
} else {
if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType {
kataDevice.Type = kataVfioApDevType
kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices
} else {
kataDevice.Options = make([]string, len(devList))
for i, device := range devList {
pciDevice := (*device).(config.VFIOPCIDev)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath)
devBDF := drivers.GetBDF(dev.BDF)
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", devBDF, dev.GuestPciPath)
}
}
return kataDevice
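As a side note on the Options slice built above: each entry maps a host BDF to the guest PCI path using the "0000:%s=%s" template from the diff. A tiny sketch with made-up values follows; the exact output of drivers.GetBDF is an assumption here.

package main

import "fmt"

func main() {
	// Hypothetical values: GetBDF is assumed to return the BDF without the
	// PCI domain, and GuestPciPath is the slash-separated slot path in the guest.
	devBDF := "3b:00.0"
	guestPciPath := "01/02"

	option := fmt.Sprintf("0000:%s=%s", devBDF, guestPciPath)
	fmt.Println(option) // Output: 0000:3b:00.0=01/02
}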
@@ -1354,7 +1355,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co
if _, err = k.sendReq(ctx, req); err != nil {
return nil, err
}
return buildProcessFromExecID(req.ExecId)
}

View File

@@ -245,7 +245,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) {
DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks,
DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm,
HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus,
PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort,
BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate,
BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate,
DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet,
@@ -487,8 +486,8 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) {
DisableNestingChecks: hconf.DisableNestingChecks,
DisableImageNvdimm: hconf.DisableImageNvdimm,
HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus,
HotPlugVFIO: hconf.HotPlugVFIO,
ColdPlugVFIO: hconf.ColdPlugVFIO,
PCIeRootPort: hconf.PCIeRootPort,
BootToBeTemplate: hconf.BootToBeTemplate,
BootFromTemplate: hconf.BootFromTemplate,
DisableVhostNet: hconf.DisableVhostNet,

View File

@@ -7,7 +7,7 @@
package persistapi
import (
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/opencontainers/runc/libcontainer/configs"
specs "github.com/opencontainers/runtime-spec/specs-go"
)
@@ -131,10 +131,6 @@ type HypervisorConfig struct {
// Enable SGX. Hardware-based isolation and memory encryption.
SGXEPCSize int64
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort uint32
// NumVCPUs specifies default number of vCPUs for the VM.
NumVCPUs uint32
@@ -199,9 +195,13 @@ type HypervisorConfig struct {
// root bus instead of a bridge.
HotplugVFIOOnRootBus bool
// HotPlugVFIO is used to indicate if devices need to be hotplugged on the
// root, switch, bridge or no-port
HotPlugVFIO config.PCIePort
// ColdPlugVFIO is used to indicate if devices need to be coldplugged on the
// root port or a switch or no-port
ColdPlugVFIO hv.PCIePort
// root, bridge, switch or no-port
ColdPlugVFIO config.PCIePort
// BootToBeTemplate used to indicate if the VM is created to be a template VM
BootToBeTemplate bool

View File

@@ -143,9 +143,11 @@ const (
// root bus instead of a bridge.
HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus"
// PCIeRootPort is used to indicate the number of PCIe Root Port devices
// The PCIe Root Port device is used to hot-plug the PCIe device
PCIeRootPort = kataAnnotHypervisorPrefix + "pcie_root_port"
// ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged.
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio"
// HotPlugVFIO is a sandbox annotation used to indicate if devices need to be hotplugged.
HotPlugVFIO = kataAnnotHypervisorPrefix + "hot_plug_vfio"
// EntropySource is a sandbox annotation to specify the path to a host source of
// entropy (/dev/random, /dev/urandom or real hardware RNG device)

View File

@@ -66,6 +66,11 @@ const romFile = ""
// Default value is false.
const defaultDisableModern = false
// A PCIe topology deeper than 5 levels is already not advisable; just for the
// sake of having enough buffer we limit ourselves to 10 and exit if we reach
// the root bus
const maxPCIeTopoDepth = 10
type qmpChannel struct {
qmp *govmmQemu.QMP
ctx context.Context
@@ -76,15 +81,15 @@ type qmpChannel struct {
// QemuState keeps Qemu's state
type QemuState struct {
UUID string
Bridges []types.Bridge
// HotpluggedCPUs is the list of CPUs that were hot-added
UUID string
HotPlugVFIO config.PCIePort
Bridges []types.Bridge
HotpluggedVCPUs []hv.CPUDevice
HotpluggedMemory int
VirtiofsDaemonPid int
PCIeRootPort int
HotplugVFIOOnRootBus bool
ColdPlugVFIO hv.PCIePort
HotplugVFIO config.PCIePort
ColdPlugVFIO config.PCIePort
}
// qemu is an Hypervisor interface implementation for the Linux qemu hypervisor.
@@ -207,7 +212,7 @@ func (q *qemu) Capabilities(ctx context.Context) types.Capabilities {
span, _ := katatrace.Trace(ctx, q.Logger(), "Capabilities", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
return q.arch.capabilities()
return q.arch.capabilities(q.config)
}
func (q *qemu) HypervisorConfig() HypervisorConfig {
@@ -278,10 +283,10 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso
q.Logger().Debug("Creating UUID")
q.state.UUID = uuid.Generate().String()
q.state.HotPlugVFIO = q.config.HotPlugVFIO
q.state.ColdPlugVFIO = q.config.ColdPlugVFIO
q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
q.state.PCIeRootPort = int(q.config.PCIeRootPort)
q.state.HotPlugVFIO = q.config.HotPlugVFIO
// The path might already exist, but in case of VM templating,
// we have to create it since the sandbox has not created it yet.
@@ -727,27 +732,12 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi
}
}
// Add PCIe Root Port devices to hypervisor
// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
if hypervisorConfig.PCIeRootPort > 0 {
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit)
}
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
if machine.Type == QemuQ35 || machine.Type == QemuVirt {
if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig, machine.Type); err != nil {
q.Logger().WithError(err).Errorf("Cannot create PCIe topology")
return err
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
q.qemuConfig = qemuConfig
q.virtiofsDaemon, err = q.createVirtiofsDaemon(hypervisorConfig.SharedPath)
@@ -773,6 +763,101 @@ func (q *qemu) checkBpfEnabled() {
}
}
// If a user uses 8 GPUs with 4 devices in each IOMMU Group, that means we need
// to hotplug 32 devices. We do not have enough PCIe root bus slots to
// accomplish this task; Kata will already use some slots for vfio-xxxx-pci
// devices.
// Max PCI slots per root bus is 32
// Max PCIe root ports is 16
// Max PCIe switch ports is 16
// There is only 64kB of IO memory; each root or switch port consumes 4kB, hence
// only 16 ports are possible.
func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig, machineType string) error {
// If no-port is set, just return; there is no need to add a PCIe Root Port or PCIe Switches
if hypervisorConfig.HotPlugVFIO == config.NoPort && hypervisorConfig.ColdPlugVFIO == config.NoPort && machineType == QemuQ35 {
return nil
}
// Add PCIe Root Port or PCIe Switches to the hypervisor
// The pcie.0 bus does not support hot-plug, but PCIe devices can be hot-plugged
// into a PCIe Root Port or PCIe Switch.
// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
// Deduce the right values for mem-reserve and pref-64-reserve memory regions
memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory()
// The default OVMF MMIO aperture is too small for some PCIe devices
// with huge BARs so we need to increase it.
// memSize64bit is in bytes, convert to MB, OVMF expects MB as a string
if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") {
pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024))
fwCfg := govmmQemu.FwCfg{
Name: "opt/ovmf/X-PciMmio64Mb",
Str: pciMmio64Mb,
}
qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg)
}
// Get the number of hot(cold)-pluggable ports needed from the provided
// VFIO devices and VhostUserBlockDevices
var numOfPluggablePorts uint32 = 0
for _, dev := range hypervisorConfig.VFIODevices {
var err error
dev.HostPath, err = config.GetHostPath(dev, false, "")
if err != nil {
return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err)
}
devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev)
if err != nil {
return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err)
}
for _, vfioDevice := range devicesPerIOMMUGroup {
if drivers.IsPCIeDevice(vfioDevice.BDF) {
numOfPluggablePorts = numOfPluggablePorts + 1
}
}
}
vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus)
vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort)
numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices)
// If the number of PCIe root ports is > 16, bail out; otherwise we may
// use up all slots or IO memory on the root bus, and the vfio-XXX-pci devices
// that are crucial for Kata could not be added. Max slots on the root bus is 32;
// max slots on the complete PCI(e) topology is 256 in QEMU.
if vfioOnRootPort {
// On Arm the vhost-user-block device is a PCIe device; we need
// to account for it in the number of pluggable ports
if machineType == QemuVirt {
numOfPluggablePorts = numOfPluggablePorts + uint32(numOfVhostUserBlockDevices)
}
if numOfPluggablePorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
return nil
}
if vfioOnSwitchPort {
// On Arm the vhost-user-block device is a PCIe device; we need
// to account for it in the number of pluggable ports
if machineType == QemuVirt {
numOfPluggableRootPorts := uint32(numOfVhostUserBlockDevices)
if numOfPluggableRootPorts > maxPCIeRootPort {
return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort)
}
qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfPluggableRootPorts, memSize32bit, memSize64bit)
}
if numOfPluggablePorts > maxPCIeSwitchPort {
return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort)
}
qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfPluggablePorts, memSize32bit, memSize64bit)
return nil
}
return nil
}
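A small worked example of the port-budget arithmetic from the comment above createPCIeTopology, using illustrative values only: roughly 64kB of IO space on the root bus divided by about 4kB per port yields the 16-port ceiling, so 8 GPUs with 4 VFIO devices per IOMMU group (32 pluggable devices) cannot each receive a dedicated port and such a request would be rejected.

package main

import "fmt"

func main() {
	const ioSpaceKB, perPortKB = 64, 4
	maxPorts := ioSpaceKB / perPortKB // 16, matching the maxPCIeRootPort/maxPCIeSwitchPort limits

	gpus, devsPerGroup := 8, 4
	needed := gpus * devsPerGroup // 32 hot-pluggable PCIe devices

	fmt.Printf("ports available: %d, ports needed: %d, fits: %v\n",
		maxPorts, needed, needed <= maxPorts) // fits: false
}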
func (q *qemu) vhostFSSocketPath(id string) (string, error) {
return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket)
}
@@ -1612,6 +1697,7 @@ func (q *qemu) hotplugAddBlockDevice(ctx context.Context, drive *config.BlockDri
}
func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.VhostUserDeviceAttrs, op Operation, devID string) (err error) {
err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false, vAttr.ReconnectTime)
if err != nil {
return err
@@ -1629,18 +1715,14 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
switch machineType {
case QemuVirt:
if q.state.PCIeRootPort <= 0 {
return fmt.Errorf("Vhost-user-blk device is a PCIe device if machine type is virt. Need to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for virt")
}
//The addr of a dev is corresponding with device:function for PCIe in qemu which starting from 0
//Since the dev is the first and only one on this bus(root port), it should be 0.
addr := "00"
bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs))
drivers.AllPCIeDevs[devID] = true
bridgeID := fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, len(config.PCIeDevices[config.RootPort]))
config.PCIeDevices[config.RootPort][devID] = true
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId)
bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID)
bridgeSlot, err := q.qomGetSlot(bridgeQomPath)
if err != nil {
return err
@@ -1656,7 +1738,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V
return err
}
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil {
if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil {
return err
}
@@ -1770,41 +1852,108 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) {
// Query QMP to find a device's PCI path given its QOM path or ID
func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) {
// XXX: For now we assume there's exactly one bridge, since
// that's always how we configure qemu from Kata for now. It
// would be good to generalize this to different PCI
// topologies
var slots []types.PciSlot
devSlot, err := q.qomGetSlot(qemuID)
if err != nil {
return types.PciPath{}, err
}
slots = append(slots, devSlot)
busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus")
// This only works for Q35 and Virt
r, _ := regexp.Compile(`^/machine/.*/pcie.0`)
var parentPath = qemuID
// We do not want to use a forever loop here; a PCIe topology deeper
// than 5 is already not advisable. Just for the sake of having enough
// buffer we limit ourselves to 10 and leave the loop early if we hit
// the root bus.
for i := 1; i <= maxPCIeTopoDepth; i++ {
parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus")
if err != nil {
return types.PciPath{}, err
}
busQOM, ok := parenBusQOM.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM)
}
// If we hit /machine/q35/pcie.0 we're done: this is the root bus and
// we have climbed the complete hierarchy
if r.Match([]byte(busQOM)) {
break
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI parent_bus which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(busQOM, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM)
}
parentBus := busQOM[:idx]
parentSlot, err := q.qomGetSlot(parentBus)
if err != nil {
return types.PciPath{}, err
}
// Prepend the slots, since we're climbing the hierarchy
slots = append([]types.PciSlot{parentSlot}, slots...)
parentPath = parentBus
}
return types.PciPathFromSlots(slots...)
}
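A minimal sketch (with assumed slot values) of what the climbing loop in qomGetPciPath produces: each level of the hierarchy contributes one slot, and parent slots are prepended while climbing towards pcie.0, so the resulting path reads from the port closest to the root bus down to the device.

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Device slot discovered first, then parents while climbing towards pcie.0.
	slots := []string{"02"}                       // device behind a downstream port
	for _, parent := range []string{"00", "01"} { // downstream port, then root port
		slots = append([]string{parent}, slots...) // prepend, as the loop above does
	}
	fmt.Println(strings.Join(slots, "/")) // Output: 01/00/02
}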
func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) {
return q.executeVFIODeviceAdd(device)
}
func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI)
if err != nil {
return types.PciPath{}, err
return err
}
bus, ok := busq.(string)
if !ok {
return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq)
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(device.ID)
}
}()
return q.executePCIVFIODeviceAdd(device, addr, bridge.ID)
}
// `bus` is the QOM path of the QOM bus object, but we need
// the PCI bridge which manages that bus. There doesn't seem
// to be a way to get that other than to simply drop the last
// path component.
idx := strings.LastIndex(bus, "/")
if idx == -1 {
return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus)
func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error {
switch device.Type {
case config.VFIOPCIDeviceNormalType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile)
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
bridge := bus[:idx]
}
bridgeSlot, err := q.qomGetSlot(bridge)
if err != nil {
return types.PciPath{}, err
func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error {
switch device.Type {
case config.VFIOPCIDeviceNormalType:
return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile)
case config.VFIOPCIDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile)
case config.VFIOAPDeviceMediatedType:
return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev)
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
return types.PciPathFromSlots(bridgeSlot, devSlot)
}
func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) {
@@ -1812,109 +1961,53 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op
return err
}
devID := *(*device).GetID()
machineType := q.HypervisorConfig().HypervisorMachineType
if op == AddDevice {
buf, _ := json.Marshal(device)
q.Logger().WithFields(logrus.Fields{
"machine-type": machineType,
"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
"pcie-root-port": q.state.PCIeRootPort,
"device-info": string(buf),
"machine-type": q.HypervisorConfig().HypervisorMachineType,
"hot-plug-vfio": q.state.HotPlugVFIO,
"device-info": string(buf),
}).Info("Start hot-plug VFIO device")
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for pc machine type instead of bridge. This is useful for devices that require
// a large PCI BAR which is a currently a limitation with PCI bridges.
if q.state.HotplugVFIOOnRootBus {
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
switch machineType {
case QemuQ35:
if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 {
q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
pciDevice.Bus = ""
}
default:
pciDevice.Bus = ""
}
*device = pciDevice
if pciDevice.Type == config.VFIOPCIDeviceNormalType {
err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile)
} else {
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile)
}
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
}
// In case MachineType is q35, a PCIe device is hotplugged on
// a PCIe Root Port or alternatively on a PCIe Switch Port
if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt {
device.Bus = ""
} else {
addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI)
var err error
// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
// for the pc machine type instead of a bridge. This is useful for devices that require
// a large PCI BAR, which is currently a limitation with PCI bridges.
if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus {
err = q.hotplugVFIODeviceRootPort(ctx, device)
} else if q.state.HotPlugVFIO == config.SwitchPort {
err = q.hotplugVFIODeviceSwitchPort(ctx, device)
} else {
err = q.hotplugVFIODeviceBridgePort(ctx, device)
}
if err != nil {
return err
}
defer func() {
if err != nil {
q.arch.removeDeviceFromBridge(devID)
}
}()
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile)
case config.VFIOPCIDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile)
case config.VFIOAPDeviceMediatedType:
err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev())
default:
return fmt.Errorf("Incorrect VFIO device type found")
}
}
if err != nil {
return err
}
switch (*device).GetType() {
case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType:
pciDevice, ok := (*device).(config.VFIOPCIDev)
if !ok {
return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device)
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and or the device. For simplicity, just
// query both of them back from qemu
guestPciPath, err := q.qomGetPciPath(devID)
pciDevice.GuestPciPath = guestPciPath
*device = pciDevice
return err
}
// XXX: Depending on whether we're doing root port or
// bridge hotplug, and how the bridge is set up in
// other parts of the code, we may or may not already
// have information about the slot number of the
// bridge and/or the device. For simplicity, just
// query both of them back from qemu
device.GuestPciPath, err = q.qomGetPciPath(device.ID)
return err
} else {
q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(devID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID)
}
q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device")
if !q.state.HotplugVFIOOnRootBus {
if err := q.arch.removeDeviceFromBridge(device.ID); err != nil {
return err
}
}
return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID)
}
func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
@@ -2612,7 +2705,7 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
for i := uint32(0); i < number; i++ {
devices = append(devices,
govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
ID: fmt.Sprintf("%s%d", config.PCIeRootPortPrefix, i),
Bus: bus,
Chassis: chassis,
Slot: strconv.FormatUint(uint64(i), 10),
@@ -2626,6 +2719,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin
return devices
}
// golangci-lint enforces multi-line comments to be a block comment,
// not multiple single-line comments ...
/* pcie.0 bus
// -------------------------------------------------
// |
// -------------
// | Root Port |
// -------------
// -------------------------|------------------------
// | ----------------- |
// | PCI Express | Upstream Port | |
// | Switch ----------------- |
// | | | |
// | ------------------- ------------------- |
// | | Downstream Port | | Downstream Port | |
// | ------------------- ------------------- |
// -------------|-----------------------|------------
// ------------- --------------
// | GPU/ACCEL | | IB/ETH NIC |
// ------------- --------------
*/
// genericAppendPCIeSwitchPort adds a PCIe Switch
func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
// Q35, Virt have the correct PCIe support,
// hence ignore all other machines
if machineType != QemuQ35 && machineType != QemuVirt {
return devices
}
// Use a dedicated ID for this root port so we do not clash with already
// existing root ports, adding an "s" prefix for switch
pcieRootPort := govmmQemu.PCIeRootPortDevice{
ID: fmt.Sprintf("%s%s%d", config.PCIeSwitchPortPrefix, config.PCIeRootPortPrefix, 0),
Bus: defaultBridgeBus,
Chassis: "1",
Slot: strconv.FormatUint(uint64(0), 10),
Multifunction: false,
Addr: "0",
MemReserve: fmt.Sprintf("%dB", memSize32bit),
Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
}
devices = append(devices, pcieRootPort)
pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{
ID: fmt.Sprintf("%s%d", config.PCIeSwitchUpstreamPortPrefix, 0),
Bus: pcieRootPort.ID,
}
devices = append(devices, pcieSwitchUpstreamPort)
currentChassis, err := strconv.Atoi(pcieRootPort.Chassis)
if err != nil {
return devices
}
nextChassis := currentChassis + 1
for i := uint32(0); i < number; i++ {
pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{
ID: fmt.Sprintf("%s%d", config.PCIeSwitchhDownstreamPortPrefix, i),
Bus: pcieSwitchUpstreamPort.ID,
Chassis: fmt.Sprintf("%d", nextChassis),
Slot: strconv.FormatUint(uint64(i), 10),
// TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit),
// TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit),
}
devices = append(devices, pcieSwitchDownstreamPort)
}
return devices
}
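To make the wiring above concrete, a hedged sketch of the parent/child relationships genericAppendPCIeSwitchPort builds: one extra root port carries the switch upstream port, and every requested hot-pluggable port becomes a downstream port on that upstream port. The ID strings below are assumptions standing in for the config package prefix constants.

package main

import "fmt"

func main() {
	const (
		rootPortID       = "srp0"  // assumed: switch prefix + root port prefix + index
		upstreamID       = "swup0" // assumed upstream port prefix + index
		downstreamPrefix = "swdp"  // assumed downstream port prefix
	)

	fmt.Println("pcie.0 ->", rootPortID, "->", upstreamID)
	for i := 0; i < 3; i++ { // three requested pluggable ports
		fmt.Printf("%s -> %s%d\n", upstreamID, downstreamPrefix, i)
	}
}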
func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) {
span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
@@ -2801,7 +2967,6 @@ func (q *qemu) Save() (s hv.HypervisorState) {
s.UUID = q.state.UUID
s.HotpluggedMemory = q.state.HotpluggedMemory
s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
s.PCIeRootPort = q.state.PCIeRootPort
for _, bridge := range q.arch.getBridges() {
s.Bridges = append(s.Bridges, hv.Bridge{
@@ -2825,7 +2990,6 @@ func (q *qemu) Load(s hv.HypervisorState) {
q.state.HotpluggedMemory = s.HotpluggedMemory
q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid
q.state.PCIeRootPort = s.PCIeRootPort
for _, bridge := range s.Bridges {
q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))

View File

@@ -26,6 +26,7 @@ import (
"google.golang.org/grpc/credentials/insecure"
"github.com/intel-go/cpuid"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
)
@@ -182,7 +183,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
return q, nil
}
func (q *qemuAmd64) capabilities() types.Capabilities {
func (q *qemuAmd64) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
if q.qemuMachine.Type == QemuQ35 ||
@@ -191,7 +192,9 @@ func (q *qemuAmd64) capabilities() types.Capabilities {
}
caps.SetMultiQueueSupport()
caps.SetFsSharingSupport()
if hConfig.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
return caps
}
@@ -323,6 +326,7 @@ func (q *qemuAmd64) appendProtectionDevice(devices []govmmQemu.Device, firmware,
ReducedPhysBits: 1,
}), "", nil
case noneProtection:
return devices, firmware, nil
default:

View File

@@ -42,13 +42,14 @@ func TestQemuAmd64BadMachineType(t *testing.T) {
func TestQemuAmd64Capabilities(t *testing.T) {
assert := assert.New(t)
config := HypervisorConfig{}
amd64 := newTestQemu(assert, QemuQ35)
caps := amd64.capabilities()
caps := amd64.capabilities(config)
assert.True(caps.IsBlockDeviceHotplugSupported())
amd64 = newTestQemu(assert, QemuMicrovm)
caps = amd64.capabilities()
caps = amd64.capabilities(config)
assert.False(caps.IsBlockDeviceHotplugSupported())
}

View File

@@ -61,7 +61,7 @@ type qemuArch interface {
kernelParameters(debug bool) []Param
//capabilities returns the capabilities supported by QEMU
capabilities() types.Capabilities
capabilities(config HypervisorConfig) types.Capabilities
// bridges sets the number bridges for the machine type
bridges(number uint32)
@@ -150,6 +150,9 @@ type qemuArch interface {
// appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus
appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
// appendPCIeSwitchPortDevice appends a PCIe Switch (an upstream port plus downstream ports behind a dedicated root port)
appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device
// append vIOMMU device
appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error)
@@ -204,7 +207,8 @@ const (
defaultBridgeBus = "pcie.0"
defaultPCBridgeBus = "pci.0"
maxDevIDSize = 31
pcieRootPortPrefix = "rp"
maxPCIeRootPort = 16 // Limitation from QEMU
maxPCIeSwitchPort = 16 // Limitation from QEMU
)
// This is the PCI start address assigned to the first bridge that
@@ -313,11 +317,13 @@ func (q *qemuArchBase) kernelParameters(debug bool) []Param {
return params
}
func (q *qemuArchBase) capabilities() types.Capabilities {
func (q *qemuArchBase) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
caps.SetBlockDeviceHotplugSupport()
caps.SetMultiQueueSupport()
caps.SetFsSharingSupport()
if hConfig.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
return caps
}
@@ -708,17 +714,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm
}
func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device {
pciDevice := vfioDev.(config.VFIOPCIDev)
if pciDevice.BDF == "" {
if vfioDev.BDF == "" {
return devices
}
devices = append(devices,
govmmQemu.VFIODevice{
BDF: pciDevice.BDF,
VendorID: pciDevice.VendorID,
DeviceID: pciDevice.DeviceID,
Bus: pciDevice.Bus,
BDF: vfioDev.BDF,
VendorID: vfioDev.VendorID,
DeviceID: vfioDev.DeviceID,
Bus: vfioDev.Bus,
},
)
@@ -834,6 +840,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb
return genericAppendPCIeRootPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
// appendPCIeSwitchPortDevice appends a PCIe Switch with <number> ports
func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device {
return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit)
}
// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the
// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs.
func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) {
pci := nvpci.New()

View File

@@ -117,9 +117,16 @@ func TestQemuArchBaseKernelParameters(t *testing.T) {
func TestQemuArchBaseCapabilities(t *testing.T) {
assert := assert.New(t)
qemuArchBase := newQemuArchBase()
hConfig := HypervisorConfig{}
hConfig.SharedFS = config.VirtioFS
c := qemuArchBase.capabilities()
c := qemuArchBase.capabilities(hConfig)
assert.True(c.IsBlockDeviceHotplugSupported())
assert.True(c.IsFsSharingSupported())
hConfig.SharedFS = config.NoSharedFS
c = qemuArchBase.capabilities(hConfig)
assert.False(c.IsFsSharingSupported())
}
func TestQemuArchBaseBridges(t *testing.T) {
@@ -463,7 +470,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) {
},
}
vfDevice := config.VFIOPCIDev{
vfDevice := config.VFIODev{
BDF: bdf,
}
@@ -483,7 +490,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) {
},
}
vfDevice := config.VFIOPCIDev{
vfDevice := config.VFIODev{
BDF: bdf,
VendorID: vendorID,
DeviceID: deviceID,

View File

@@ -11,6 +11,7 @@ import (
"fmt"
"time"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
"github.com/sirupsen/logrus"
@@ -97,7 +98,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) {
return q, nil
}
func (q *qemuPPC64le) capabilities() types.Capabilities {
func (q *qemuPPC64le) capabilities(hConfig HypervisorConfig) types.Capabilities {
var caps types.Capabilities
// pseries machine type supports hotplugging drives
@@ -106,7 +107,9 @@ func (q *qemuPPC64le) capabilities() types.Capabilities {
}
caps.SetMultiQueueSupport()
caps.SetFsSharingSupport()
if hConfig.SharedFS != config.NoSharedFS {
caps.SetFsSharingSupport()
}
return caps
}

View File

@@ -111,9 +111,6 @@ func TestQemuCreateVM(t *testing.T) {
config6 := newQemuConfig()
config6.DisableGuestSeLinux = false
config7 := newQemuConfig()
config7.PCIeRootPort = 1
config8 := newQemuConfig()
config8.EnableVhostUserStore = true
config8.HugePages = true
@@ -161,7 +158,6 @@ func TestQemuCreateVM(t *testing.T) {
{config3, false, true},
{config5, false, true},
{config6, false, false},
{config7, false, true},
{config8, false, true},
{config9, true, false},
{config10, false, true},

View File

@@ -36,7 +36,6 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers"
deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager"
hv "github.com/kata-containers/kata-containers/src/runtime/pkg/hypervisors"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
resCtrl "github.com/kata-containers/kata-containers/src/runtime/pkg/resourcecontrol"
exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental"
@@ -106,15 +105,10 @@ type HypervisorPidKey struct{}
// SandboxStatus describes a sandbox status.
type SandboxStatus struct {
ContainersStatus []ContainerStatus
// Annotations allow clients to store arbitrary values,
// for example to add additional status values required
// to support particular specifications.
Annotations map[string]string
Annotations map[string]string
ID string
Hypervisor HypervisorType
ContainersStatus []ContainerStatus
State types.SandboxState
HypervisorConfig HypervisorConfig
}
@@ -530,6 +524,7 @@ func createSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Fac
return s, nil
}
//nolint:gocyclo
func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factory) (sb *Sandbox, retErr error) {
span, ctx := katatrace.Trace(ctx, nil, "newSandbox", sandboxTracingTags, map[string]string{"sandbox_id": sandboxConfig.ID})
defer span.End()
@@ -630,22 +625,49 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
// until we have TDISP/IDE PCIe support.
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != hv.NoPort)
var devs []config.DeviceInfo
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
// Aggregate all the container devices for hot-plug and use them to deduce
// the correct number of ports to reserve for the hypervisor.
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
var vfioDevices []config.DeviceInfo
// vhost-user-block device is a PCIe device in Virt, keep track of it
// for correct number of PCIe root ports.
var vhostUserBlkDevices []config.DeviceInfo
for cnt, containers := range sandboxConfig.Containers {
for dev, device := range containers.DeviceInfos {
if coldPlugVFIO && deviceManager.IsVFIO(device.ContainerPath) {
if deviceManager.IsVhostUserBlk(device) {
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
continue
}
isVFIO := deviceManager.IsVFIO(device.ContainerPath)
if hotPlugVFIO && isVFIO {
vfioDevices = append(vfioDevices, device)
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
}
if coldPlugVFIO && isVFIO {
device.ColdPlug = true
devs = append(devs, device)
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
vfioDevices = append(vfioDevices, device)
// We need to remove the devices marked for cold-plug
// otherwise at the container level the kata-agent
// will try to hot-plug them.
infos := sandboxConfig.Containers[cnt].DeviceInfos
infos = append(infos[:dev], infos[dev+1:]...)
sandboxConfig.Containers[cnt].DeviceInfos = infos
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
}
}
var filteredDevices []config.DeviceInfo
for _, device := range containers.DeviceInfos {
if device.ID != "remove-we-are-cold-plugging" {
filteredDevices = append(filteredDevices, device)
}
}
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
}
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
// store doesn't require hypervisor to be stored immediately
if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil {
@@ -660,7 +682,7 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
return s, nil
}
for _, dev := range devs {
for _, dev := range vfioDevices {
_, err := s.AddDevice(ctx, dev)
if err != nil {
s.Logger().WithError(err).Debug("Cannot cold-plug add device")
@@ -1723,7 +1745,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
defer span.End()
for i := range s.config.Containers {
c, err := newContainer(ctx, s, &s.config.Containers[i])
if err != nil {
return err
@@ -1742,7 +1763,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.updateResources(ctx); err != nil {
return err
}
if err := s.resourceControllerUpdate(ctx); err != nil {
return err
}
@@ -1754,7 +1774,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
if err := s.storeSandbox(ctx); err != nil {
return err
}
return nil
}
@@ -1918,15 +1937,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
// adding a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
}).WithError(err).Error("failed to hotplug VFIO device")
return err
}
@@ -1941,6 +1956,7 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy
return err
case config.VhostUserBlk:
vhostUserBlkDevice, ok := device.(*drivers.VhostUserBlkDevice)
if !ok {
return fmt.Errorf("device type mismatch, expect device type to be %s", devType)
}
@@ -1975,15 +1991,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de
// remove a group of VFIO devices
for _, dev := range vfioDevices {
if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil {
bdf := ""
if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok {
bdf = pciDevice.BDF
}
s.Logger().WithError(err).
WithFields(logrus.Fields{
"sandbox": s.id,
"vfio-device-ID": (*dev).GetID(),
"vfio-device-BDF": bdf,
"vfio-device-ID": dev.ID,
"vfio-device-BDF": dev.BDF,
}).Error("failed to hot unplug VFIO device")
return err
}

View File

@@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) {
_, err = os.Create(deviceFile)
assert.Nil(t, err)
savedIOMMUPath := config.SysIOMMUPath
config.SysIOMMUPath = tmpDir
savedIOMMUPath := config.SysIOMMUGroupPath
config.SysIOMMUGroupPath = tmpDir
defer func() {
config.SysIOMMUPath = savedIOMMUPath
config.SysIOMMUGroupPath = savedIOMMUPath
}()
dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil)

View File

@@ -240,3 +240,17 @@ restart_containerd_service() {
clean_env_ctr
return 0
}
# @path_results: path to the input metric-results folder
# @tarball_fname: path and filename to the output tarball
function compress_metrics_results_dir()
{
local path_results="${1:-results}"
local tarball_fname="${2:-}"
[ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect."
[ ! -d "${path_results}" ] && die "Missing path to the results folder."
cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd -
info "tarball generated: ${tarball_fname}"
}
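# Example usage, mirroring how the metrics runner packs up its results
# (the concrete paths below are illustrative, not values taken from CI):
#
#   compress_metrics_results_dir "${metrics_dir}/results" "/tmp/results-qemu.tar.gz"
#
# i.e. every *.json file under the results directory is packed into the given
# tarball and its location is reported via info().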

View File

@@ -31,13 +31,16 @@ function login_azure() {
}
function create_cluster() {
# First, ensure that the cluster didn't fail to get cleaned up from a previous run.
delete_cluster || true
az aks create \
-g "kataCI" \
-n "$(_print_cluster_name)" \
-s "Standard_D4s_v5" \
--node-count 1 \
--generate-ssh-keys \
$([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku mariner --workload-runtime KataMshvVmIsolation")
$([ "${KATA_HOST_OS}" = "cbl-mariner" ] && echo "--os-sku AzureLinux --workload-runtime KataMshvVmIsolation")
}
function install_bats() {
@@ -55,10 +58,28 @@ function get_cluster_credentials() {
-n "$(_print_cluster_name)"
}
function ensure_yq() {
: "${GOPATH:=${GITHUB_WORKSPACE}}"
export GOPATH
export PATH="${GOPATH}/bin:${PATH}"
INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh"
}
function run_tests() {
platform="${1}"
ensure_yq
# Ensure we're in the default namespace
kubectl config set-context --current --namespace=default
# Delete any spurious tests namespace that was left behind
kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then
yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS"
yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}"
fi
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml"
cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image"
@@ -80,6 +101,10 @@ function run_tests() {
sleep 60s
fi
# Create a new namespace for the tests and switch to it
kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml
kubectl config set-context --current --namespace=kata-containers-k8s-tests
pushd "${integration_dir}/kubernetes"
bash setup.sh
bash run_kubernetes_tests.sh
@@ -89,6 +114,10 @@ function run_tests() {
function cleanup() {
platform="${1}"
# Switch back to the default namespace and delete the tests one
kubectl config set-context --current --namespace=default
kubectl delete namespace kata-containers-k8s-tests
if [ "${platform}" = "tdx" ]; then
deploy_spec="-k "${tools_dir}/packaging/kata-deploy/kata-deploy/overlays/k3s""
cleanup_spec="-k "${tools_dir}/packaging/kata-deploy/kata-cleanup/overlays/k3s""
@@ -115,11 +144,12 @@ function delete_cluster() {
az aks delete \
-g "kataCI" \
-n "$(_print_cluster_name)" \
--yes \
--no-wait
--yes
}
function main() {
export KATA_HOST_OS="${KATA_HOST_OS:-}"
action="${1:-}"
case "${action}" in

View File

@@ -14,13 +14,12 @@ setup() {
@test "Pod quota" {
resource_name="pod-quota"
deployment_name="deploymenttest"
namespace="test-quota-ns"
# Create the resourcequota
kubectl create -f "${pod_config_dir}/resource-quota.yaml"
# View information about resourcequota
kubectl get -n "$namespace" resourcequota "$resource_name" \
kubectl get resourcequota "$resource_name" \
--output=yaml | grep 'pods: "2"'
# Create deployment
@@ -28,10 +27,9 @@ setup() {
# View deployment
kubectl wait --for=condition=Available --timeout=$timeout \
-n "$namespace" deployment/${deployment_name}
deployment/${deployment_name}
}
teardown() {
kubectl delete -n "$namespace" deployment "$deployment_name"
kubectl delete -f "${pod_config_dir}/resource-quota.yaml"
}

View File

@@ -54,10 +54,6 @@ else
)
fi
if [ ${KATA_HOST_OS} == "cbl-mariner" ]; then
exit 0
fi
# we may need to skip a few test cases when running on non-x86_64 arch
arch_config_file="${kubernetes_dir}/filter_out_per_arch/${TARGET_ARCH}.yaml"
if [ -f "${arch_config_file}" ]; then

View File

@@ -6,7 +6,6 @@
apiVersion: v1
kind: Pod
metadata:
namespace: default
name: custom-dns-test
spec:
terminationGracePeriodSeconds: 0

View File

@@ -8,7 +8,6 @@ apiVersion: v1
kind: Pod
metadata:
name: pod-oom
namespace: default
spec:
runtimeClassName: kata
restartPolicy: Never

View File

@@ -7,7 +7,6 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: deploymenttest
namespace: test-quota-ns
spec:
selector:
matchLabels:

View File

@@ -14,7 +14,6 @@ items:
kind: ResourceQuota
metadata:
name: pod-quota
namespace: test-quota-ns
spec:
hard:
pods: "2"

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: kata-containers-k8s-tests

View File

@@ -13,8 +13,24 @@ set_runtime_class() {
sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml
}
set_kernel_path() {
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin"
find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \;
fi
}
set_initrd_path() {
if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then
initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img"
find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \;
fi
}
main() {
set_runtime_class
set_kernel_path
set_initrd_path
}
main "$@"

View File

@@ -55,6 +55,8 @@ For further details see the [time tests documentation](time).
Tests that measure the size and overheads of the runtime. Generally this is looking at
memory footprint sizes, but could also cover disk space or even CPU consumption.
For further details see the [density tests documentation](density).
### Networking
Tests relating to networking. General items could include:

View File

@@ -0,0 +1,34 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# This file contains baseline expectations
# for checked results by checkmetrics tool.
#
# values set specifically for packet.com c1.small worker.
[[metric]]
name = "boot-times"
type = "json"
description = "measure container lifecycle timings"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
checktype = "mean"
midval = 0.42
minpercent = 20.0
maxpercent = 20.0
[[metric]]
name = "memory-footprint"
type = "json"
description = "measure memory usage"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
checktype = "mean"
midval = 2518364.00
minpercent = 20.0
maxpercent = 20.0
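# For reference (a sketch of how the window above is expected to be applied):
# with midval = 0.42 and min/max percent = 20.0, the accepted mean for
# "boot-times" falls in the inclusive range
#   [0.42 - 0.42*0.20, 0.42 + 0.42*0.20] = [0.336, 0.504] seconds,
# and for "memory-footprint" in [2014691.20, 3022036.80] KB.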

View File

@@ -0,0 +1,34 @@
# Copyright (c) 2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# This file contains baseline expectations
# for checked results by checkmetrics tool.
#
# values set specifically for Equinix m3.small.x86.
[[metric]]
name = "boot-times"
type = "json"
description = "measure container lifecycle timings"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result"
checktype = "mean"
midval = 0.61
minpercent = 20.0
maxpercent = 20.0
[[metric]]
name = "memory-footprint"
type = "json"
description = "measure memory usage"
# Min and Max values to set a 'range' that
# the median of the CSV Results data must fall
# within (inclusive)
checkvar = ".\"memory-footprint\".Results | .[] | .average.Result"
checktype = "mean"
midval = 2435844.00
minpercent = 20.0
maxpercent = 20.0

View File

@@ -0,0 +1,53 @@
# Kata Containers density metrics tests
This directory contains a number of tests to help measure container
memory footprint. Some measures are based around the
[PSS](https://en.wikipedia.org/wiki/Proportional_set_size) of the runtime
components, and others look at the system level (`free` and `/proc/meminfo`
for instance) impact.
## `memory_usage`
This test measures the PSS footprint of the runtime components whilst
launching a number of small ([BusyBox](https://hub.docker.com/_/busybox/)) containers
using ctr.
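For a quick local run, the invocation below mirrors how the CI metrics runner
drives this test (the script path and the `auto` KSM-settle flag come from the
scripts in this tree; the container count and wait time are illustrative):
```
$ sudo -E bash tests/metrics/density/memory_usage.sh 20 300 auto
```
The first argument is the number of containers to launch, the second is the
settle/wait time in seconds, and the optional `auto` flag makes the test wait
for KSM `pages_shared` to stabilise instead of sleeping the full period
(`auto` requires KSM to be enabled on the host).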
## `fast_footprint`
This test takes system level resource measurements after launching a number of
containers in parallel and optionally waiting for KSM to settle its memory
compaction cycles.
The script is quite configurable via environment variables, including:
* Which container workload to run.
* How many containers to launch.
* How many containers are launched in parallel.
* How long to wait until taking the measures.
See the script itself for more details.
This test shares many config options with the `footprint_data` test. Thus, referring
to the [footprint test documentation](footprint_data.md) may be useful.
> *Note:* If this test finds KSM is enabled on the host, it will wait for KSM
> to "settle" before taking the final measurement. If your KSM is not configured
> to process all the allocated VM memory fast enough, the test will hit a timeout
> and proceed to take the final measurement anyway.
## `footprint_data`
Similar to the `fast_footprint` test, but this test launches the containers
sequentially and takes a system level measurement between each launch. Thus,
this test provides finer grained information on system scaling, but takes
significantly longer to run than the `fast_footprint` test. If you are only
interested in the final figure or the average impact, you may be better running
the `fast_footprint` test.
For more details see the [footprint test documentation](footprint_data.md).
## `memory_usage_inside_container`
Measures the memory statistics *inside* the container. This allows evaluation of
the overhead the VM kernel and rootfs are having on the memory that was requested
by the container co-ordination system, and thus supplied to the VM.

View File

@@ -0,0 +1,433 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# A script to gather memory 'footprint' information as we launch more
# and more containers
#
# The script gathers information about both user and kernel space consumption
# Output is into a .json file, named using some of the config component names
# (such as footprint-busybox.json)
# Pull in some common, useful, items
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
# Note that all vars that can be set from outside the script (that is,
# passed in the ENV), use the ':-' setting to allow being over-ridden
# Default sleep, in seconds, to let containers come up and finish their
# initialisation before we take the measures. Some of the larger
# containers can take a number of seconds to get running.
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
# How long, in seconds, do we wait for KSM to 'settle down', before we
# timeout and just continue anyway.
KSM_WAIT_TIME="${KSM_WAIT_TIME:-300}"
# How long, in seconds, do we poll for ctr to complete launching all the
# containers?
CTR_POLL_TIMEOUT="${CTR_POLL_TIMEOUT:-300}"
# How many containers do we launch in parallel before taking the PAYLOAD_SLEEP
# nap
PARALLELISM="${PARALLELISM:-10}"
### The default config - run a small busybox image
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
###
# which RUNTIME we use is picked up from the env in
# common.bash. You can over-ride by setting RUNTIME in your env
###
# Define the cutoff checks for when we stop running the test
# Run up to this many containers
NUM_CONTAINERS="${NUM_CONTAINERS:-100}"
# Run until we have consumed this much memory (from MemFree)
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-256*1024*1024*1024}"
# Run until we have this much MemFree left
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
# Tools we need to have installed in order to operate
REQUIRED_COMMANDS="smem awk"
# If we 'dump' the system caches before we measure then we get less
# noise in the results - they show more what our un-reclaimable footprint is
DUMP_CACHES="${DUMP_CACHES:-1}"
# Affects the name of the file to store the results in
TEST_NAME="${TEST_NAME:-fast-footprint-busybox}"
############# end of configurable items ###################
# vars to remember where we started so we can calc diffs
base_mem_avail=0
base_mem_free=0
# dump the kernel caches, so we get a more precise (or just different)
# view of what our footprint really is.
function dump_caches() {
sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
}
function init() {
restart_containerd_service
check_cmds $REQUIRED_COMMANDS
sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
# Modify the test name if running with KSM enabled
check_for_ksm
# Use the common init func to get to a known state
init_env
# Prepare to start storing results
metrics_json_init
# Store up baseline measures
base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
base_mem_free=$(get_memfree)
# Store our configuration for this run
save_config
}
save_config(){
metrics_json_start_array
local json="$(cat << EOF
{
"testname": "${TEST_NAME}",
"payload": "${PAYLOAD}",
"payload_args": "${PAYLOAD_ARGS}",
"payload_sleep": ${PAYLOAD_SLEEP},
"ksm_settle_time": ${KSM_WAIT_TIME},
"num_containers": ${NUM_CONTAINERS},
"parallelism": ${PARALLELISM},
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
"min_memory_free": "${MIN_MEMORY_FREE}",
"dump_caches": "${DUMP_CACHES}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function cleanup() {
# Finish storing the results
metrics_json_save
clean_env_ctr
}
# helper function to get USS of process in arg1
function get_proc_uss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
((item*=1024))
echo $item
}
# helper function to get PSS of process in arg1
function get_proc_pss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
((item*=1024))
echo $item
}
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system, for instance
# containerd grows as we launch containers, so we should account for that in our total
# memory breakdown
function grab_all_pss() {
item=$(sudo smem -t | tail -1 | awk '{print $5}')
((item*=1024))
local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_user_smem() {
# userspace
item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
((item*=1024))
local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_slab() {
# Grabbing slab total from meminfo is easier than doing the math
# on slabinfo
item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
((item*=1024))
local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function get_memfree() {
mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
((mem_free*=1024))
echo $mem_free
}
function grab_system() {
# avail memory, from 'free'
local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
local avail_decr=$((base_mem_avail-avail))
# cached memory, from 'free'
local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
# free memory from smem
local smem_free=$(get_memfree)
local free_decr=$((base_mem_free-smem_free))
# Anon pages
local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
((anon*=1024))
# Mapped pages
local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
((mapped*=1024))
# Cached
local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
((meminfo_cached*=1024))
local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_stats() {
# If configured, dump the caches so we get a more stable
# view of what our static footprint really is
if [[ "$DUMP_CACHES" ]] ; then
dump_caches
fi
# user space data
# PSS taken all userspace
grab_all_pss
# user as reported by smem
grab_user_smem
# System overview data
# System free and cached
grab_system
# kernel data
# The 'total kernel space taken' we can work out as:
# ktotal = ((free-avail)-user)
# So, we don't grab that number from smem, as that is what it does
# internally anyhow.
# Still try to grab any finer kernel details that we can though
# totals from slabinfo
grab_slab
metrics_json_close_array_element
}
function check_limits() {
mem_free=$(get_memfree)
if ((mem_free <= MIN_MEMORY_FREE)); then
echo 1
return
fi
mem_consumed=$((base_mem_avail-mem_free))
if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
echo 1
return
fi
echo 0
}
launch_containers() {
local parloops leftovers
(( parloops=${NUM_CONTAINERS}/${PARALLELISM} ))
(( leftovers=${NUM_CONTAINERS} - (${parloops}*${PARALLELISM}) ))
echo "Launching ${parloops}x${PARALLELISM} containers + ${leftovers} etras"
containers=()
local iter n
for iter in $(seq 1 $parloops); do
echo "Launch iteration ${iter}"
for n in $(seq 1 $PARALLELISM); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
done
if [[ $PAYLOAD_SLEEP ]]; then
sleep $PAYLOAD_SLEEP
fi
# check if we have hit one of our limits and need to wrap up the tests
if (($(check_limits))); then
echo "Ran out of resources, check_limits failed"
return
fi
done
for n in $(seq 1 $leftovers); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run -d --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS &
done
}
wait_containers() {
local t numcontainers
# nap 3s between checks
local step=3
for ((t=0; t<${CTR_POLL_TIMEOUT}; t+=step)); do
numcontainers=$(sudo -E "${CTR_EXE}" c list -q | wc -l)
if (( numcontainers >= ${NUM_CONTAINERS} )); then
echo "All containers now launched (${t}s)"
return
else
echo "Waiting for containers to launch (${numcontainers} at ${t}s)"
fi
sleep ${step}
done
echo "Timed out waiting for containers to launch (${t}s)"
cleanup
die "Timed out waiting for containers to launch (${t}s)"
}
function go() {
# Init the json cycle for this save
metrics_json_start_array
# Grab the first set of stats before we run any containers.
grab_stats
launch_containers
wait_containers
if [ $ksm_on == "1" ]; then
echo "Wating for KSM to settle..."
wait_ksm_settle ${KSM_WAIT_TIME}
fi
grab_stats
# Wrap up the results array
metrics_json_end_array "Results"
}
function show_vars()
{
echo -e "\nEvironment variables:"
echo -e "\tName (default)"
echo -e "\t\tDescription"
echo -e "\tPAYLOAD (${PAYLOAD})"
echo -e "\t\tThe ctr image to run"
echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
echo -e "\t\tAny extra arguments passed into the docker 'run' command"
echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
echo -e "\tKSM_WAIT_TIME (${KSM_WAIT_TIME})"
echo -e "\t\tSeconds to wait for KSM to settle before we take the final measure"
echo -e "\tCTR_POLL_TIMEOUT (${CTR_POLL_TIMEOUT})"
echo -e "\t\tSeconds to poll for ctr to finish launching containers"
echo -e "\tPARALLELISM (${PARALLELISM})"
echo -e "\t\tNumber of containers we launch in parallel"
echo -e "\tNUM_CONTAINERS (${NUM_CONTAINERS})"
echo -e "\t\tThe total number of containers to run"
echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
echo -e "\t\tThe minimum amount of memory allowed to be free before terminating"
echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
echo -e "\tTEST_NAME (${TEST_NAME})"
echo -e "\t\tCan be set to over-ride the default JSON results filename"
}
function help()
{
usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
echo "$usage"
show_vars
}
function main() {
local OPTIND
while getopts "h" opt;do
case ${opt} in
h)
help
exit 0;
;;
esac
done
shift $((OPTIND-1))
init
go
cleanup
}
main "$@"

View File

@@ -0,0 +1,87 @@
# Footprint data script details
The `footprint_data.sh` script runs a number of identical containers sequentially
via ctr and takes a number of memory related measurements after each
launch. The script is generally not used in a CI type environment, but is intended
to be run and analyzed manually.
You can configure the script by setting a number of environment variables.
The following sections list details of the configurable variables, along with a
small example invocation script.
## Variables
Environment variables can take effect in two ways.
Some variables affect how the payload is executed. The `CONTAINERD_RUNTIME` and `PAYLOAD`
variables directly affect the payload execution via the following line in
the script:
`$ ctr run --memory-limit $PAYLOAD_RUNTIME_ARGS --rm --runtime=$CONTAINERD_RUNTIME $PAYLOAD $NAME sh -c $PAYLOAD_ARGS`
Other settings affect how memory footprint is measured and the test termination
conditions.
| Variable | Function
| -------- | --------
| `PAYLOAD` | The ctr image to run
| `PAYLOAD_ARGS` | Any arguments passed into the ctr image
| `PAYLOAD_RUNTIME_ARGS` | Any extra arguments passed into the ctr `run` command
| `PAYLOAD_SLEEP` | Seconds to sleep between launch and measurement, to allow settling
| `MAX_NUM_CONTAINERS` | The maximum number of containers to run before terminating
| `MAX_MEMORY_CONSUMED` | The maximum amount of memory to be consumed before terminating
| `MIN_MEMORY_FREE` | The minimum amount of memory allowed to be free before terminating
| `DUMP_CACHES` | A flag to note if the system caches should be dumped before capturing stats
| `DATAFILE` | Can be set to over-ride the default JSON results filename
## Output files
The names of the JSON files generated by the test are dictated by some of the parameters
the test is utilising. The default filename is generated in the form of:
`footprint-${PAYLOAD}[-ksm].json`
## Measurements
The test measures, calculates, and stores a number of data items:
| Item | Description
| ---- | -----------
| `uss` | USS for all the VM runtime components
| `pss` | PSS for all the VM runtime components
| `all_pss` | PSS of all of userspace - to monitor if we had other impact on the system
| `user_smem` | `smem` "userspace" consumption value
| `avail` | "available" memory from `free`
| `avail_decr` | "available" memory decrease since start of test
| `cached` | "Cached" memory from `/proc/meminfo`
| `smem_free` | Free memory as reported by `smem`
| `free_decr` | Decrease in Free memory reported by `smem` since start of test
| `anon` | `AnonPages` as reported from `/proc/meminfo`
| `mapped` | Mapped pages as reported from `/proc/meminfo`
| `cached` | Cached pages as reported from `/proc/meminfo`
| `slab` | Slab as reported from `/proc/meminfo`
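These items end up in the JSON results file described above. A quick,
structure-agnostic way to pull one series out for inspection is shown below
(a sketch; it assumes only that the field names in the table are used as JSON
keys, not any particular nesting produced by the metrics JSON helpers):
```
$ jq '.. | .avail_decr? // empty' footprint-busybox.json
```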
## Example script
The following script is an example of how to configure the environment variables and
invoke the test script to run a number of different container tests.
```
#!/bin/bash
set -e
set -x
export MAX_NUM_CONTAINERS=10
export MAX_MEMORY_CONSUMED=6*1024*1024*1024
function run() {
###
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
export PAYLOAD="quay.io/prometheus/busybox:latest"
export PAYLOAD_ARGS="tail -f /dev/null"
export PAYLOAD_SLEEP=10
export PAYLOAD_RUNTIME_ARGS="5120"
sudo -E bash $(pwd)/density/footprint_data.sh
}
export CONTAINERD_RUNTIME=io.containerd.kata.v2
run
```

View File

@@ -0,0 +1,360 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# A script to gather memory 'footprint' information as we launch more
# and more containers
#
# The script gathers information about both user and kernel space consumption
# Output is into a .json file, named using some of the config component names
# (such as footprint-busybox.json)
# Pull in some common, useful, items
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
# Note that all vars that can be set from outside the script (that is,
# passed in the ENV), use the ':-' setting to allow being over-ridden
# Default sleep for 10s to let containers come up and finish their
# initialisation before we take the measures. Some of the larger
# containers can take a number of seconds to get running.
PAYLOAD_SLEEP="${PAYLOAD_SLEEP:-10}"
### The default config - run a small busybox image
# Define what we will be running (app under test)
# Default is we run busybox, as a 'small' workload
PAYLOAD="${PAYLOAD:-quay.io/prometheus/busybox:latest}"
PAYLOAD_ARGS="${PAYLOAD_ARGS:-tail -f /dev/null}"
###
# Define the cutoff checks for when we stop running the test
# Run up to this many containers
MAX_NUM_CONTAINERS="${MAX_NUM_CONTAINERS:-10}"
# Run until we have consumed this much memory (from MemFree)
MAX_MEMORY_CONSUMED="${MAX_MEMORY_CONSUMED:-6*1024*1024*1024}"
# Run until we have this much MemFree left
MIN_MEMORY_FREE="${MIN_MEMORY_FREE:-2*1024*1024*1024}"
# Tools we need to have installed in order to operate
REQUIRED_COMMANDS="smem awk"
# If we 'dump' the system caches before we measure then we get less
# noise in the results - they show more what our un-reclaimable footprint is
DUMP_CACHES="${DUMP_CACHES:-1}"
# Affects the name of the file to store the results in
TEST_NAME="${TEST_NAME:-footprint-busybox}"
############# end of configurable items ###################
# vars to remember where we started so we can calc diffs
base_mem_avail=0
base_mem_free=0
# dump the kernel caches, so we get a more precise (or just different)
# view of what our footprint really is.
function dump_caches() {
sudo bash -c "echo 3 > /proc/sys/vm/drop_caches"
}
function init() {
restart_containerd_service
check_cmds $REQUIRED_COMMANDS
sudo -E "${CTR_EXE}" image pull "$PAYLOAD"
# Modify the test name if running with KSM enabled
check_for_ksm
# Use the common init func to get to a known state
init_env
# Prepare to start storing results
metrics_json_init
# Store up baseline measures
base_mem_avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
base_mem_free=$(get_memfree)
# Store our configuration for this run
save_config
}
save_config(){
metrics_json_start_array
local json="$(cat << EOF
{
"testname": "${TEST_NAME}",
"payload": "${PAYLOAD}",
"payload_args": "${PAYLOAD_ARGS}",
"payload_sleep": ${PAYLOAD_SLEEP},
"max_containers": ${MAX_NUM_CONTAINERS},
"max_memory_consumed": "${MAX_MEMORY_CONSUMED}",
"min_memory_free": "${MIN_MEMORY_FREE}",
"dump_caches": "${DUMP_CACHES}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function cleanup() {
# Finish storing the results
metrics_json_save
clean_env_ctr
}
# helper function to get USS of process in arg1
function get_proc_uss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $4}')
((item*=1024))
echo $item
}
# helper function to get PSS of process in arg1
function get_proc_pss() {
item=$(sudo smem -t -P "^$1" | tail -1 | awk '{print $5}')
((item*=1024))
echo $item
}
# Get the PSS for the whole of userspace (all processes)
# This allows us to see if we had any impact on the rest of the system, for instance
# containerd grows as we launch containers, so we should account for that in our total
# memory breakdown
function grab_all_pss() {
item=$(sudo smem -t | tail -1 | awk '{print $5}')
((item*=1024))
local json="$(cat << EOF
"all_pss": {
"pss": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_user_smem() {
# userspace
item=$(sudo smem -w | head -5 | tail -1 | awk '{print $3}')
((item*=1024))
local json="$(cat << EOF
"user_smem": {
"userspace": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_slab() {
# Grabbing slab total from meminfo is easier than doing the math
# on slabinfo
item=$(fgrep "Slab:" /proc/meminfo | awk '{print $2}')
((item*=1024))
local json="$(cat << EOF
"slab": {
"slab": $item,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function get_memfree() {
mem_free=$(sudo smem -w | head -6 | tail -1 | awk '{print $4}')
((mem_free*=1024))
echo $mem_free
}
function grab_system() {
# avail memory, from 'free'
local avail=$(free -b | head -2 | tail -1 | awk '{print $7}')
local avail_decr=$((base_mem_avail-avail))
# cached memory, from 'free'
local cached=$(free -b | head -2 | tail -1 | awk '{print $6}')
# free memory from smem
local smem_free=$(get_memfree)
local free_decr=$((base_mem_free-smem_free))
# Anon pages
local anon=$(fgrep "AnonPages:" /proc/meminfo | awk '{print $2}')
((anon*=1024))
# Mapped pages
local mapped=$(egrep "^Mapped:" /proc/meminfo | awk '{print $2}')
((mapped*=1024))
# Cached
local meminfo_cached=$(grep "^Cached:" /proc/meminfo | awk '{print $2}')
((meminfo_cached*=1024))
local json="$(cat << EOF
"system": {
"avail": $avail,
"avail_decr": $avail_decr,
"cached": $cached,
"smem_free": $smem_free,
"free_decr": $free_decr,
"anon": $anon,
"mapped": $mapped,
"meminfo_cached": $meminfo_cached,
"Units": "KB"
}
EOF
)"
metrics_json_add_array_fragment "$json"
}
function grab_stats() {
# If configured, dump the caches so we get a more stable
# view of what our static footprint really is
if [[ "$DUMP_CACHES" ]] ; then
dump_caches
fi
# user space data
# PSS taken all userspace
grab_all_pss
# user as reported by smem
grab_user_smem
# System overview data
# System free and cached
grab_system
# kernel data
# The 'total kernel space taken' we can work out as:
# ktotal = ((free-avail)-user)
# So, we don't grab that number from smem, as that is what it does
# internally anyhow.
# Still try to grab any finer kernel details that we can though
# totals from slabinfo
grab_slab
metrics_json_close_array_element
}
function check_limits() {
mem_free=$(get_memfree)
if ((mem_free <= MIN_MEMORY_FREE)); then
echo 1
return
fi
mem_consumed=$((base_mem_avail-mem_free))
if ((mem_consumed >= MAX_MEMORY_CONSUMED)); then
echo 1
return
fi
echo 0
}
function go() {
# Init the json cycle for this save
metrics_json_start_array
containers=()
for i in $(seq 1 $MAX_NUM_CONTAINERS); do
containers+=($(random_name))
sudo -E "${CTR_EXE}" run --rm --runtime=$CTR_RUNTIME $PAYLOAD ${containers[-1]} sh -c $PAYLOAD_ARGS
if [[ $PAYLOAD_SLEEP ]]; then
sleep $PAYLOAD_SLEEP
fi
grab_stats
# check if we have hit one of our limits and need to wrap up the tests
if (($(check_limits))); then
# Wrap up the results array
metrics_json_end_array "Results"
return
fi
done
# Wrap up the results array
metrics_json_end_array "Results"
}
function show_vars()
{
echo -e "\nEvironment variables:"
echo -e "\tName (default)"
echo -e "\t\tDescription"
echo -e "\tPAYLOAD (${PAYLOAD})"
echo -e "\t\tThe ctr image to run"
echo -e "\tPAYLOAD_ARGS (${PAYLOAD_ARGS})"
echo -e "\t\tAny extra arguments passed into the ctr 'run' command"
echo -e "\tPAYLOAD_SLEEP (${PAYLOAD_SLEEP})"
echo -e "\t\tSeconds to sleep between launch and measurement, to allow settling"
echo -e "\tMAX_NUM_CONTAINERS (${MAX_NUM_CONTAINERS})"
echo -e "\t\tThe maximum number of containers to run before terminating"
echo -e "\tMAX_MEMORY_CONSUMED (${MAX_MEMORY_CONSUMED})"
echo -e "\t\tThe maximum amount of memory to be consumed before terminating"
echo -e "\tMIN_MEMORY_FREE (${MIN_MEMORY_FREE})"
echo -e "\t\tThe path to the ctr binary (for 'smem' measurements)"
echo -e "\tDUMP_CACHES (${DUMP_CACHES})"
echo -e "\t\tA flag to note if the system caches should be dumped before capturing stats"
echo -e "\tTEST_NAME (${TEST_NAME})"
echo -e "\t\tCan be set to over-ride the default JSON results filename"
}
function help()
{
usage=$(cat << EOF
Usage: $0 [-h] [options]
Description:
Launch a series of workloads and take memory metric measurements after
each launch.
Options:
-h, Help page.
EOF
)
echo "$usage"
show_vars
}
function main() {
local OPTIND
while getopts "h" opt;do
case ${opt} in
h)
help
exit 0;
;;
esac
done
shift $((OPTIND-1))
init
go
cleanup
}
main "$@"

View File

@@ -0,0 +1,383 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description of the test:
# This test launches a number of containers in idle mode.
# It will then sleep for a configurable period of time to allow
# any memory optimisations to 'settle', and then checks the
# amount of memory used by all the containers to come up with
# an average (using the PSS measurements)
# This test uses smem tool to get the memory used.
set -e
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
# Busybox image: Choose a small workload image, this is
# in order to measure the runtime footprint, not the workload
# footprint.
IMAGE='quay.io/prometheus/busybox:latest'
CMD='tail -f /dev/null'
NUM_CONTAINERS="$1"
WAIT_TIME="$2"
AUTO_MODE="$3"
TEST_NAME="memory footprint"
SMEM_BIN="smem"
KSM_ENABLE_FILE="/sys/kernel/mm/ksm/run"
MEM_TMP_FILE=$(mktemp meminfo.XXXXXXXXXX)
PS_TMP_FILE=$(mktemp psinfo.XXXXXXXXXX)
function remove_tmp_file() {
rm -rf "${MEM_TMP_FILE}" "${PS_TMP_FILE}"
}
trap remove_tmp_file EXIT
# Show help about this script
function help(){
cat << EOF
Usage: $0 <count> <wait_time> [auto]
Description:
<count> : Number of containers to run.
<wait_time> : Time in seconds to wait before taking
metrics.
[auto] : Optional 'auto KSM settle' mode
waits for ksm pages_shared to settle down
EOF
}
function get_runc_pss_memory(){
ctr_runc_shim_path="/usr/local/bin/containerd-shim-runc-v2"
get_pss_memory "${ctr_runc_shim_path}"
}
function get_runc_individual_memory() {
runc_process_result=$(cat "${MEM_TMP_FILE}" | tr "\n" " " | sed -e 's/\s$//g' | sed 's/ /, /g')
# Verify runc process result
if [ -z "${runc_process_result}" ];then
die "Runc process not found"
fi
read -r -a runc_values <<< "${runc_process_result}"
metrics_json_start_array
local json="$(cat << EOF
{
"runc individual results": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
printf '%s\n\t\t\t' "${runc_values[i]}"
done)
]
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Raw results"
}
# This function measures the PSS average
# memory of a process.
function get_pss_memory(){
ps="$1"
mem_amount=0
count=0
avg=0
if [ -z "${ps}" ]; then
die "No argument to get_pss_memory()"
fi
# Save all the process names
# This will help us to retrieve raw information
echo "${ps}" >> "${PS_TMP_FILE}"
data=$(sudo "${SMEM_BIN}" --no-header -P "^${ps}" -c "pss" | sed 's/[[:space:]]//g' | tr '\n' ' ' | sed 's/[[:blank:]]*$//')
# Save all the smem results
# This will help us to retrieve raw information
echo "${data}" >> "${MEM_TMP_FILE}"
gral_data=$(echo "${data// /+}" | bc)
for i in "${gral_data}"; do
if (( $i > 0 ));then
mem_amount=$(( i + mem_amount ))
(( count++ ))
fi
done
if (( "${count}" > 0 ));then
avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
fi
echo "${avg}"
}
function ppid() {
local pid
pid=$(ps -p "${1:-nopid}" -o ppid=)
echo "${pid//[[:blank:]]/}"
}
# This function measures the PSS average
# memory of virtiofsd.
# It is a special case of get_pss_memory,
# virtiofsd forks itself, so smem sees the process
# twice; this function sums both PSS values:
# pss_virtiofsd=pss_fork + pss_parent
function get_pss_memory_virtiofsd() {
mem_amount=0
count=0
avg=0
virtiofsd_path=${1:-}
if [ -z "${virtiofsd_path}" ]; then
die "virtiofsd_path not provided"
fi
echo "${virtiofsd_path}" >> "${PS_TMP_FILE}"
virtiofsd_pids=$(ps aux | grep [v]irtiofsd | awk '{print $2}' | head -1)
data=$(sudo smem --no-header -P "^${virtiofsd_path}" -c pid -c "pid pss")
for p in "${virtiofsd_pids}"; do
parent_pid=$(ppid "${p}")
cmd="$(cat /proc/${p}/cmdline | tr -d '\0')"
cmd_parent="$(cat /proc/${parent_pid}/cmdline | tr -d '\0')"
if [ "${cmd}" != "${cmd_parent}" ]; then
pss_parent=$(printf "%s" "${data}" | grep "\s^${p}" | awk '{print $2}')
fork=$(pgrep -P "${p}")
pss_fork=$(printf "%s" "${data}" | grep "^\s*${fork}" | awk '{print $2}')
pss_process=$((pss_fork + pss_parent))
# Save all the smem results
# This will help us to retrieve raw information
echo "${pss_process}" >>"${MEM_TMP_FILE}"
if ((pss_process > 0)); then
mem_amount=$((pss_process + mem_amount))
((count++))
fi
fi
done
if (( "${count}" > 0 ));then
avg=$(bc -l <<< "scale=2; ${mem_amount} / ${count}")
fi
echo "${avg}"
}
function get_individual_memory(){
# Getting all the individual container information
first_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==1' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
first_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==1' | sed 's/ /, /g')
second_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==2' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
second_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==2' | sed 's/ /, /g')
third_process_name=$(cat "${PS_TMP_FILE}" | awk 'NR==3' | awk -F "/" '{print $NF}' | sed 's/[[:space:]]//g')
third_process_result=$(cat "${MEM_TMP_FILE}" | awk 'NR==3' | sed 's/ /, /g')
read -r -a first_values <<< "${first_process_result}"
read -r -a second_values <<< "${second_process_result}"
read -r -a third_values <<< "${third_process_result}"
metrics_json_start_array
local json="$(cat << EOF
{
"${first_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${first_values[i]}" ] &&
printf '%s\n\t\t\t' "${first_values[i]}"
done)
],
"${second_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${second_values[i]}" ] &&
printf '%s\n\t\t\t' "${second_values[i]}"
done)
],
"${third_process_name} memory": [
$(for ((i=0;i<"${NUM_CONTAINERS[@]}";++i)); do
[ -n "${third_values[i]}" ] &&
printf '%s\n\t\t\t' "${third_values[i]}"
done)
]
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Raw results"
}
# Try to work out the 'average memory footprint' of a container.
function get_memory_usage(){
hypervisor_mem=0
virtiofsd_mem=0
shim_mem=0
memory_usage=0
containers=()
info "Creating ${NUM_CONTAINERS} containers"
for ((i=1; i<="${NUM_CONTAINERS}"; i++)); do
containers+=($(random_name))
sudo "${CTR_EXE}" run --runtime "${CTR_RUNTIME}" -d "${IMAGE}" "${containers[-1]}" sh -c "${CMD}"
done
if [ "${AUTO_MODE}" == "auto" ]; then
if (( ksm_on != 1 )); then
die "KSM not enabled, cannot use auto mode"
fi
echo "Entering KSM settle auto detect mode..."
wait_ksm_settle "${WAIT_TIME}"
else
# If KSM is enabled, then you normally want to sleep long enough to
# let it do its work and for the numbers to 'settle'.
echo "napping ${WAIT_TIME} s"
sleep "${WAIT_TIME}"
fi
metrics_json_start_array
# Check the runtime in order to determine which processes will
# be measured for PSS
if [ "${RUNTIME}" == "runc" ]; then
runc_workload_mem="$(get_runc_pss_memory)"
memory_usage="${runc_workload_mem}"
local json="$(cat << EOF
{
"average": {
"Result": ${memory_usage},
"Units" : "KB"
},
"runc": {
"Result": ${runc_workload_mem},
"Units" : "KB"
}
}
EOF
)"
else [ "$RUNTIME" == "kata-runtime" ] || [ "$RUNTIME" == "kata-qemu" ]
# Get PSS memory of VM runtime components.
# And check that the smem search has found the process - we get a "0"
# back if that procedure fails (such as if a process has changed its name
# or is not running when expected to be so)
# As an added bonus - this script must be run as root.
# Now if you do not have enough rights
# the smem failure to read the stats will also be trapped.
hypervisor_mem="$(get_pss_memory ${HYPERVISOR_PATH})"
if [ "${hypervisor_mem}" == "0" ]; then
die "Failed to find PSS for ${HYPERVISOR_PATH}"
fi
virtiofsd_mem="$(get_pss_memory_virtiofsd ${VIRTIOFSD_PATH})"
if [ "${virtiofsd_mem}" == "0" ]; then
echo >&2 "WARNING: Failed to find PSS for ${VIRTIOFSD_PATH}"
fi
shim_mem="$(get_pss_memory ${SHIM_PATH})"
if [ "${shim_mem}" == "0" ]; then
die "Failed to find PSS for ${SHIM_PATH}"
fi
mem_usage="$(bc -l <<< "scale=2; ${hypervisor_mem} +${virtiofsd_mem} + ${shim_mem}")"
memory_usage="${mem_usage}"
local json="$(cat << EOF
{
"average": {
"Result": ${mem_usage},
"Units" : "KB"
},
"qemus": {
"Result": ${hypervisor_mem},
"Units" : "KB"
},
"virtiofsds": {
"Result": ${virtiofsd_mem},
"Units" : "KB"
},
"shims": {
"Result": ${shim_mem},
"Units" : "KB"
}
}
EOF
)"
fi
metrics_json_add_array_element "$json"
metrics_json_end_array "Results"
clean_env_ctr
}
function save_config(){
metrics_json_start_array
local json="$(cat << EOF
{
"containers": "${NUM_CONTAINERS}",
"ksm": "${ksm_on}",
"auto": "${AUTO_MODE}",
"waittime": "${WAIT_TIME}",
"image": "${IMAGE}",
"command": "${CMD}"
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Config"
}
function main(){
# Verify enough arguments
if [ $# != 2 ] && [ $# != 3 ];then
echo >&2 "error: Not enough arguments [$@]"
help
exit 1
fi
#Check for KSM before reporting test name, as it can modify it
check_for_ksm
init_env
check_cmds "${SMEM_BIN}" bc
check_images "${IMAGE}"
if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then
export RUNTIME="kata-runtime"
elif [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then
export RUNTIME="runc"
else
die "Unknown runtime ${CTR_RUNTIME}"
fi
metrics_json_init
save_config
get_memory_usage
if [ "$RUNTIME" == "runc" ]; then
get_runc_individual_memory
elif [ "$RUNTIME" == "kata-runtime" ]; then
get_individual_memory
fi
metrics_json_save
}
main "$@"

View File

@@ -0,0 +1,134 @@
#!/bin/bash
# Copyright (c) 2017-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description of the test:
# This test launches a busybox container and inside
# memory free, memory available and total memory
# is measured by using /proc/meminfo.
set -e
# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
TEST_NAME="memory footprint inside container"
VERSIONS_FILE="${SCRIPT_PATH}/../../versions.yaml"
IMAGE='quay.io/prometheus/busybox:latest'
CMD="sleep 10; cat /proc/meminfo"
# We specify here in 'k', as that then matches the results we get from the meminfo,
# which makes later direct comparison easier.
MEMSIZE=${MEMSIZE:-$((2048*1024))}
# this variable determines the number of attempts when a test
# result is considered not valid (a zero value or a negative value)
MAX_FAILED_ATTEMPTS=3
memtotalAvg=0
units_memtotal=""
memfreeAvg=0
units_memfree=""
memavailableAvg=0
units_memavailable=""
# count_iters: is the index of the current iteration
count_iters=0
# valid_result: if value stored is '1' the result is valid, '0' otherwise
valid_result=0
parse_results() {
local raw_results="${1}"
# Variables used to sum cumulative values in the case of two or more reps.
# and used to compute average results for 'json' output format.
local memtotal_acu="${2:-0}"
local memfree_acu="${3:-0}"
local memavailable_acu="${4:-0}"
local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}')
units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}')
local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}')
units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}')
local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}')
units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}')
# check results: if any result is zero or negative, it is considered as invalid, and the test will be repeated.
if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then
MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1))
valid_result=0
info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
return 0
fi
memtotalAvg=$((memtotal+memtotal_acu))
memfreeAvg=$((memfree+memfree_acu))
memavailableAvg=$((memavailable+memavailable_acu))
valid_result=1
info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable"
}
store_results_json() {
metrics_json_start_array
memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc)
memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc)
memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc)
local json="$(cat << EOF
{
"memrequest": {
"Result" : ${MEMSIZE},
"Units" : "Kb"
},
"memtotal": {
"Result" : ${memtotalAvg},
"Units" : "${units_memtotal}"
},
"memfree": {
"Result" : ${memfreeAvg},
"Units" : "${units_memfree}"
},
"memavailable": {
"Result" : ${memavailableAvg},
"Units" : "${units_memavailable}"
},
"repetitions": {
"Result" : ${count_iters}
}
}
EOF
)"
metrics_json_add_array_element "$json"
metrics_json_end_array "Results"
metrics_json_save
}
function main() {
# switch to select output format
local num_iterations=${1:-1}
info "Iterations: $num_iterations"
# Check tools/commands dependencies
cmds=("awk" "ctr")
init_env
check_cmds "${cmds[@]}"
check_images "${IMAGE}"
metrics_json_init
while [ $count_iters -lt $num_iterations ]; do
local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1)
parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}"
# quit if number of attempts exceeds the allowed value.
[ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded."
[ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1))
done
store_results_json
clean_env_ctr
}
# Parameters
# @1: num_iterations {integer}
main "$@"

View File

@@ -9,24 +9,28 @@ set -o errexit
set -o nounset
set -o pipefail
kata_tarball_dir=${2:-kata-artifacts}
kata_tarball_dir="${2:-kata-artifacts}"
metrics_dir="$(dirname "$(readlink -f "$0")")"
source "${metrics_dir}/../common.bash"
source "${metrics_dir}/lib/common.bash"
create_symbolic_links() {
hypervisor="${1:-qemu}"
declare -r results_dir="${metrics_dir}/results"
declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics"
declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker"
function create_symbolic_links() {
local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml"
local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${hypervisor}.toml"
local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml"
if [ ${hypervisor} != 'qemu' ] && [ ${hypervisor} != 'clh' ]; then
die "Failed to set the configuration.toml: '${hypervisor}' is not recognized as a valid hypervisor name."
if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then
die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name."
fi
sudo ln -sf "${source_configuration_file}" "${link_configuration_file}"
}
# Configures containerd
overwrite_containerd_config() {
function overwrite_containerd_config() {
containerd_config="/etc/containerd/config.toml"
sudo rm "${containerd_config}"
sudo tee "${containerd_config}" << EOF
@@ -44,7 +48,7 @@ version = 2
EOF
}
install_kata() {
function install_kata() {
local kata_tarball="kata-static.tar.xz"
declare -r katadir="/opt/kata"
declare -r destdir="/"
@@ -53,7 +57,7 @@ install_kata() {
# Removing previous kata installation
sudo rm -rf "${katadir}"
pushd ${kata_tarball_dir}
pushd "${kata_tarball_dir}"
sudo tar -xvf "${kata_tarball}" -C "${destdir}"
popd
@@ -64,17 +68,26 @@ install_kata() {
check_containerd_config_for_kata
restart_containerd_service
install_checkmetrics
}
check_containerd_config_for_kata() {
function install_checkmetrics() {
# Ensure we have the latest checkmetrics
pushd "${checkmetrics_dir}"
make
sudo make install
popd
}
function check_containerd_config_for_kata() {
# check containerd config
declare -r line1="default_runtime_name = \"kata\""
declare -r line2="runtime_type = \"io.containerd.kata.v2\""
declare -r num_lines_containerd=2
declare -r containerd_path="/etc/containerd/config.toml"
local count_matches=$(grep -ic "$line1\|$line2" ${containerd_path})
local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}")
if [ $count_matches = $num_lines_containerd ]; then
if [ "${count_matches}" = "${num_lines_containerd}" ]; then
info "containerd ok"
else
info "overwriting containerd configuration w/ a valid one"
@@ -82,21 +95,62 @@ check_containerd_config_for_kata() {
fi
}
function check_metrics() {
local cm_base_file="${checkmetrics_config_dir}/checkmetrics-json-${KATA_HYPERVISOR}-kata-metric8.toml"
checkmetrics --debug --percentage --basefile "${cm_base_file}" --metricsdir "${results_dir}"
cm_result=$?
if [ "${cm_result}" != 0 ]; then
die "run-metrics-ci: checkmetrics FAILED (${cm_result})"
fi
}
function make_tarball_results() {
compress_metrics_results_dir "${metrics_dir}/results" "${GITHUB_WORKSPACE}/results-${KATA_HYPERVISOR}.tar.gz"
}
function run_test_launchtimes() {
hypervisor="${1}"
info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor"
info "Running Launch Time test using ${hypervisor} hypervisor"
create_symbolic_links "${hypervisor}"
create_symbolic_links
bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20
}
function run_test_memory_usage() {
info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor"
create_symbolic_links
bash tests/metrics/density/memory_usage.sh 20 5
check_metrics
}
function run_test_memory_usage_inside_container() {
info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor"
# ToDo: remove the exit once the metrics workflow is stable
exit 0
create_symbolic_links
bash tests/metrics/density/memory_usage_inside_container.sh 5
}
function run_test_blogbench() {
info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor"
# ToDo: remove the exit once the metrics workflow is stable
exit 0
create_symbolic_links
bash tests/metrics/storage/blogbench.sh
}
function main() {
action="${1:-}"
case "${action}" in
install-kata) install_kata ;;
run-test-launchtimes-qemu) run_test_launchtimes "qemu" ;;
run-test-launchtimes-clh) run_test_launchtimes "clh" ;;
make-tarball-results) make_tarball_results ;;
run-test-launchtimes) run_test_launchtimes ;;
run-test-memory-usage) run_test_memory_usage ;;
run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;;
run-test-blogbench) run_test_blogbench ;;
*) >&2 die "Invalid argument" ;;
esac
}
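# Typical sequence when driving these stages by hand (a sketch; the entry
# point's filename and the hypervisor value are assumptions, and install-kata
# expects a kata-static.tar.xz under ./kata-artifacts by default):
#
#   export KATA_HYPERVISOR=qemu
#   bash gha-run.sh install-kata
#   bash gha-run.sh run-test-launchtimes
#   bash gha-run.sh run-test-memory-usage
#   bash gha-run.sh make-tarball-results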

View File

@@ -47,14 +47,14 @@ quay.io/libpod"
#
# cmds=(“cmd1” “cmd2”)
# check_cmds "${cmds[@]}"
check_cmds()
function check_cmds()
{
local cmd req_cmds=( "$@" )
for cmd in "${req_cmds[@]}"; do
if ! command -v "$cmd" > /dev/null 2>&1; then
die "command $cmd not available"
fi
echo "command: $cmd: yes"
info "command: $cmd: yes"
done
}
@@ -68,19 +68,20 @@ check_cmds()
#
# images=(“img1” “img2”)
# check_imgs "${images[@]}"
check_images()
function check_images()
{
local img req_images=( "$@" )
for img in "${req_images[@]}"; do
echo "ctr pull'ing: $img"
info "ctr pull'ing: $img"
if ! sudo "${CTR_EXE}" image pull "$img"; then
die "Failed to pull image $img"
fi
echo "ctr pull'd: $img"
info "ctr pull'd: $img"
done
}
generate_build_dockerfile() {
function generate_build_dockerfile()
{
local dockerfile="$1"
local image="$2"
local map_key="$3"
@@ -99,14 +100,14 @@ generate_build_dockerfile() {
# This function performs a build on the image names
# passed in, to ensure that we have the latest changes from
# the dockerfiles
build_dockerfile_image()
function build_dockerfile_image()
{
local image="$1"
local dockerfile_path="$2"
local dockerfile_dir=${2%/*}
if [ -f "$dockerfile_path" ]; then
echo "docker building $image"
info "docker building $image"
if ! sudo "${DOCKER_EXE}" build --build-arg http_proxy="${http_proxy}" --build-arg https_proxy="${https_proxy}" --label "$image" --tag "${image}" -f "$dockerfile_path" "$dockerfile_dir"; then
die "Failed to docker build image $image"
fi
@@ -119,7 +120,7 @@ build_dockerfile_image()
# This function removes the ctr image, builds a new one using a dockerfile
# and imports the image from docker to ctr
check_ctr_images()
function check_ctr_images()
{
local ctr_image="$1"
local dockerfile_path="$2"
@@ -138,7 +139,7 @@ check_ctr_images()
# A one time (per uber test cycle) init that tries to get the
# system to a 'known state' as much as possible
metrics_onetime_init()
function metrics_onetime_init()
{
# The onetime init must be called once, and only once
if [ ! -z "$onetime_init_done" ]; then
@@ -155,14 +156,14 @@ metrics_onetime_init()
# Print a banner to the logs noting clearly which test
# we are about to run
test_banner()
function test_banner()
{
echo -e "\n===== starting test [$1] ====="
info -e "\n===== starting test [$1] ====="
}
# Initialization/verification environment. This function makes
# minimal steps for metrics/tests execution.
init_env()
function init_env()
{
test_banner "${TEST_NAME}"
@@ -183,7 +184,8 @@ init_env()
# This function checks if there are containers or
# shim/proxy/hypervisor processes up, if found, they are
# killed to start test with clean environment.
kill_processes_before_start() {
function kill_processes_before_start()
{
DOCKER_PROCS=$(sudo "${DOCKER_EXE}" ps -q)
[[ -n "${DOCKER_PROCS}" ]] && clean_env
@@ -195,26 +197,29 @@ kill_processes_before_start() {
# Generate a random name - generally used when creating containers, but can
# be used for any other appropriate purpose
random_name() {
function random_name()
{
mktemp -u kata-XXXXXX
}
show_system_ctr_state() {
echo "Showing system state:"
echo " --Check containers--"
function show_system_ctr_state()
{
info "Showing system state:"
info " --Check containers--"
sudo "${CTR_EXE}" c list
echo " --Check tasks--"
info " --Check tasks--"
sudo "${CTR_EXE}" task list
local processes="containerd-shim-kata-v2"
for p in ${processes}; do
echo " --pgrep ${p}--"
info " --pgrep ${p}--"
pgrep -a ${p}
done
}
common_init(){
function common_init()
{
if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then
extract_kata_env
else
@@ -225,17 +230,18 @@ common_init(){
fi
}
# Save the current KSM settings so we can restore them later
save_ksm_settings(){
echo "saving KSM settings"
function save_ksm_settings()
{
info "saving KSM settings"
ksm_stored_run=$(cat ${KSM_ENABLE_FILE})
ksm_stored_pages=$(cat ${KSM_ENABLE_FILE})
ksm_stored_sleep=$(cat ${KSM_ENABLE_FILE})
}
set_ksm_aggressive(){
echo "setting KSM to aggressive mode"
function set_ksm_aggressive()
{
info "setting KSM to aggressive mode"
# Flip the run off/on to ensure a restart/rescan
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
sudo bash -c "echo ${KSM_AGGRESIVE_PAGES} > ${KSM_PAGES_FILE}"
@@ -245,7 +251,7 @@ set_ksm_aggressive(){
if [ "${KATA_HYPERVISOR}" == "qemu" ]; then
# Disable virtio-fs and save whether it was enabled previously
set_virtio_out=$(sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-9p)
echo "${set_virtio_out}"
info "${set_virtio_out}"
grep -q "already" <<< "${set_virtio_out}" || was_virtio_fs=true;
fi
}
@@ -256,8 +262,9 @@ restore_virtio_fs(){
info "Not restoring virtio-fs since it wasn't enabled previously"
}
restore_ksm_settings(){
echo "restoring KSM settings"
function restore_ksm_settings()
{
info "restoring KSM settings"
# First turn off the run to ensure if we are then re-enabling
# that any changes take effect
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
@@ -267,15 +274,17 @@ restore_ksm_settings(){
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
}
disable_ksm(){
echo "disabling KSM"
function disable_ksm()
{
info "disabling KSM"
sudo bash -c "echo 0 > ${KSM_ENABLE_FILE}"
[ "${KATA_HYPERVISOR}" == "qemu" ] && restore_virtio_fs
}
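# For reference, the KSM_* variables used by the helpers above are assumed
# (their definitions are not part of this diff) to point at the standard
# kernel sysfs controls:
#   /sys/kernel/mm/ksm/run             - 0 stops KSM, 1 starts scanning
#   /sys/kernel/mm/ksm/pages_to_scan   - pages scanned per scanner wake-up
#   /sys/kernel/mm/ksm/sleep_millisecs - pause between scan batches
#   /sys/kernel/mm/ksm/pages_shared    - read-only count of shared (merged) pages
#   /sys/kernel/mm/ksm/full_scans      - read-only count of completed scans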
# See if KSM is enabled.
# If so, amend the test name to reflect that
check_for_ksm(){
function check_for_ksm()
{
if [ ! -f ${KSM_ENABLE_FILE} ]; then
return
fi
@@ -294,7 +303,8 @@ check_for_ksm(){
# a full scan has managed to do few new merges)
#
# arg1 - timeout in seconds
wait_ksm_settle(){
function wait_ksm_settle()
{
[[ "$RUNTIME" == "runc" ]] || [[ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]] && return
local t pcnt
local oldscan=-1 newscan
@@ -305,7 +315,7 @@ wait_ksm_settle(){
# Wait some time for KSM to kick in to avoid early dismissal
for ((t=0; t<5; t++)); do
pages=$(cat "${KSM_PAGES_SHARED}")
[[ "$pages" -ne 0 ]] && echo "Discovered KSM activity" && break
[[ "$pages" -ne 0 ]] && info "Discovered KSM activity" && break
sleep 1
done
@@ -315,13 +325,13 @@ wait_ksm_settle(){
newscan=$(cat /sys/kernel/mm/ksm/full_scans)
newpages=$(cat "${KSM_PAGES_SHARED}")
[[ "$newpages" -eq 0 ]] && echo "No need to wait for KSM to settle" && return
[[ "$newpages" -eq 0 ]] && info "No need to wait for KSM to settle" && return
if (( newscan != oldscan )); then
echo -e "\nnew full_scan ($oldscan to $newscan)"
info -e "\nnew full_scan ($oldscan to $newscan)"
# Do we have a previous scan to compare with
echo "check pages $oldpages to $newpages"
info "check pages $oldpages to $newpages"
if (( oldpages != -1 )); then
# avoid divide by zero problems
@@ -330,14 +340,14 @@ wait_ksm_settle(){
# abs()
pcnt=$(( $pcnt * -1 ))
echo "$oldpages to $newpages is ${pcnt}%"
info "$oldpages to $newpages is ${pcnt}%"
if (( $pcnt <= 5 )); then
echo "KSM stabilised at ${t}s"
info "KSM stabilised at ${t}s"
return
fi
else
echo "$oldpages KSM pages... waiting"
info "$oldpages KSM pages... waiting"
fi
fi
oldscan=$newscan
@@ -347,7 +357,7 @@ wait_ksm_settle(){
fi
sleep 1
done
echo "Timed out after ${1}s waiting for KSM to settle"
info "Timed out after ${1}s waiting for KSM to settle"
}
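# Worked example of the settle criterion above, with hypothetical numbers
# and assuming pcnt is the relative change in pages_shared between scans:
#   oldpages=2000, newpages=1940
#   pcnt = (1940 - 2000) * 100 / 2000 = -3, abs() -> 3
#   3 <= 5, so KSM is considered stabilised at this scan.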
common_init

View File

@@ -0,0 +1,124 @@
#!/bin/bash
#
# Copyright (c) 2018-2023 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Description of the test:
# This test runs 'blogbench' and extracts the 'scores' for reads
# and writes.
# Note - the scores are *not* normalised for the number of iterations run;
# they are total scores for all iterations (this is the blogbench default output).
set -e
# General env
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
source "${SCRIPT_PATH}/../lib/common.bash"
TEST_NAME="blogbench"
IMAGE="docker.io/library/local-blogbench:latest"
DOCKERFILE="${SCRIPT_PATH}/blogbench_dockerfile/Dockerfile"
# Number of iterations for blogbench to run. Note: the results are not
# scaled to the iteration count, so more iterations produce larger scores.
ITERATIONS="${ITERATIONS:-30}"
# Directory to run the test on
# This is run inside of the container
TESTDIR="${TESTDIR:-/tmp}"
CMD="blogbench -i ${ITERATIONS} -d ${TESTDIR}"
function main() {
# Check tools/commands dependencies
cmds=("awk" "docker")
init_env
check_cmds "${cmds[@]}"
check_ctr_images "${IMAGE}" "${DOCKERFILE}"
metrics_json_init
local output=$(sudo -E ${CTR_EXE} run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test ${CMD})
# Save configuration
metrics_json_start_array
local frequency=$(echo "${output}" | grep "Frequency" | cut -d "=" -f2 | cut -d ' ' -f2)
local iterations=$(echo "${output}" | grep -w "iterations" | cut -d ' ' -f3)
local spawning_writers=$(echo "${output}" | grep -w "writers" | cut -d ' ' -f2)
local spawning_rewriters=$(echo "${output}" | grep -w "rewriters" | cut -d ' ' -f2)
local spawning_commenters=$(echo "${output}" | grep -w "commenters" | cut -d ' ' -f2)
local spawning_readers=$(echo "${output}" | grep -w "readers" | cut -d ' ' -f2)
local json="$(cat << EOF
{
"Frequency" : ${frequency},
"Iterations" : ${iterations},
"Number of spawing writers" : ${spawing_writers},
"Number of spawing rewriters" : ${spawing_rewriters},
"Number of spawing commenters" : ${spawing_commenters},
"Number of spawing readers" : ${spawing_readers}
}
EOF
)"
metrics_json_add_array_element "${json}"
metrics_json_end_array "Config"
# Save results
metrics_json_start_array
local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}')
local reads=$(tail -1 <<< "${output}" | awk '{print $6}')
# Obtaining other Blogbench results
local -r data=$(echo "${output}" | tail -n +12 | head -n -3)
local nb_blogs=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $1} ' | tr '\t' ',' | sed '$ s/.$//')
local r_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $2} ' | tr '\t' ',' | sed '$ s/.$//')
local w_articles=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $3} ' | tr '\t' ',' | sed '$ s/.$//')
local r_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $4} ' | tr '\t' ',' | sed '$ s/.$//')
local w_pictures=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $5} ' | tr '\t' ',' | sed '$ s/.$//')
local r_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $6} ' | tr '\t' ',' | sed '$ s/.$//')
local w_comments=$(echo "${data}" | awk ' BEGIN {ORS="\t"} {print $7} ' | tr '\t' ',' | sed '$ s/.$//')
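# Each pipeline above flattens one column of blogbench's per-iteration table
# into a comma-separated list: awk joins the column with tabs, tr turns the
# tabs into commas, and the final sed drops the trailing comma, e.g. a column
# holding 110, 171 and 210 becomes "110,171,210".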
local json="$(cat << EOF
{
"write": {
"Result" : "${writes}",
"Units" : "items"
},
"read": {
"Result" : "${reads}",
"Units" : "items"
},
"Nb blogs": {
"Result" : "${nb_blogs}"
},
"R articles": {
"Result" : "${r_articles}"
},
"W articles": {
"Result" : "${w_articles}"
},
"R pictures": {
"Result" : "${r_pictures}"
},
"W pictures": {
"Result" : "${w_pictures}"
},
"R comments": {
"Result" : "${r_comments}"
},
"W comments": {
"Result" : "${w_comments}"
}
}
EOF
)"
metrics_json_add_array_element "${json}"
metrics_json_end_array "Results"
metrics_json_save
clean_env_ctr
}
main "$@"

Some files were not shown because too many files have changed in this diff.