diff --git a/.github/workflows/PR-wip-checks.yaml b/.github/workflows/PR-wip-checks.yaml index 97c35145a..98195b886 100644 --- a/.github/workflows/PR-wip-checks.yaml +++ b/.github/workflows/PR-wip-checks.yaml @@ -9,6 +9,10 @@ on: - labeled - unlabeled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: pr_wip_check: runs-on: ubuntu-latest diff --git a/.github/workflows/add-backport-label.yaml b/.github/workflows/add-backport-label.yaml index 3df518b54..790ff1721 100644 --- a/.github/workflows/add-backport-label.yaml +++ b/.github/workflows/add-backport-label.yaml @@ -10,6 +10,10 @@ on: - labeled - unlabeled +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check-issues: if: ${{ github.event.label.name != 'auto-backport' }} diff --git a/.github/workflows/add-issues-to-project.yaml b/.github/workflows/add-issues-to-project.yaml index 93c31e7a1..117e62600 100644 --- a/.github/workflows/add-issues-to-project.yaml +++ b/.github/workflows/add-issues-to-project.yaml @@ -11,6 +11,10 @@ on: - opened - reopened +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: add-new-issues-to-backlog: runs-on: ubuntu-latest diff --git a/.github/workflows/add-pr-sizing-label.yaml b/.github/workflows/add-pr-sizing-label.yaml index ffd9b06a9..313c9f285 100644 --- a/.github/workflows/add-pr-sizing-label.yaml +++ b/.github/workflows/add-pr-sizing-label.yaml @@ -12,6 +12,10 @@ on: - reopened - synchronize +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: add-pr-size-label: runs-on: ubuntu-latest diff --git a/.github/workflows/auto-backport.yaml b/.github/workflows/auto-backport.yaml index 6504dc488..e2be39022 100644 --- a/.github/workflows/auto-backport.yaml +++ 
b/.github/workflows/auto-backport.yaml @@ -2,6 +2,10 @@ on: pull_request_target: types: ["labeled", "closed"] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: backport: name: Backport PR diff --git a/.github/workflows/build-kata-static-tarball-amd64.yaml b/.github/workflows/build-kata-static-tarball-amd64.yaml index f0f606850..869d49bc6 100644 --- a/.github/workflows/build-kata-static-tarball-amd64.yaml +++ b/.github/workflows/build-kata-static-tarball-amd64.yaml @@ -99,7 +99,7 @@ jobs: path: kata-artifacts - name: merge-artifacts run: | - ./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts + ./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts versions.yaml - name: store-artifacts uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/build-kata-static-tarball-arm64.yaml b/.github/workflows/build-kata-static-tarball-arm64.yaml index 2ad97a0ba..cafc6e020 100644 --- a/.github/workflows/build-kata-static-tarball-arm64.yaml +++ b/.github/workflows/build-kata-static-tarball-arm64.yaml @@ -2,6 +2,10 @@ name: CI | Build kata-static tarball for arm64 on: workflow_call: inputs: + stage: + required: false + type: string + default: test tarball-suffix: required: false type: string @@ -29,6 +33,8 @@ jobs: - rootfs-initrd - shim-v2 - virtiofsd + stage: + - ${{ inputs.stage }} steps: - name: Adjust a permission for repo run: | @@ -83,7 +89,7 @@ jobs: path: kata-artifacts - name: merge-artifacts run: | - ./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts + ./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts versions.yaml - name: store-artifacts uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/build-kata-static-tarball-s390x.yaml b/.github/workflows/build-kata-static-tarball-s390x.yaml index cf2831033..0fe7e9200 100644 --- 
a/.github/workflows/build-kata-static-tarball-s390x.yaml +++ b/.github/workflows/build-kata-static-tarball-s390x.yaml @@ -2,6 +2,10 @@ name: CI | Build kata-static tarball for s390x on: workflow_call: inputs: + stage: + required: false + type: string + default: test tarball-suffix: required: false type: string @@ -25,6 +29,8 @@ jobs: - rootfs-initrd - shim-v2 - virtiofsd + stage: + - ${{ inputs.stage }} steps: - name: Adjust a permission for repo run: | @@ -80,7 +86,7 @@ jobs: path: kata-artifacts - name: merge-artifacts run: | - ./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts + ./tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh kata-artifacts versions.yaml - name: store-artifacts uses: actions/upload-artifact@v3 with: diff --git a/.github/workflows/cargo-deny-runner.yaml b/.github/workflows/cargo-deny-runner.yaml index 65237c7be..21d3d1f53 100644 --- a/.github/workflows/cargo-deny-runner.yaml +++ b/.github/workflows/cargo-deny-runner.yaml @@ -7,6 +7,11 @@ on: - reopened - synchronize paths-ignore: [ '**.md', '**.png', '**.jpg', '**.jpeg', '**.svg', '/docs/**' ] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: cargo-deny-runner: runs-on: ubuntu-latest diff --git a/.github/workflows/cc-payload-amd64.yaml b/.github/workflows/cc-payload-amd64.yaml index 56649657b..e3ad241af 100644 --- a/.github/workflows/cc-payload-amd64.yaml +++ b/.github/workflows/cc-payload-amd64.yaml @@ -23,7 +23,7 @@ jobs: - ovmf - qemu-snp-experimental - qemu-tdx-experimental - - cc-sev-rootfs-initrd + - rootfs-initrd-sev - cc-tdx-td-shim - tdvf include: @@ -34,7 +34,7 @@ jobs: - measured_rootfs: yes asset: cc-rootfs-image - measured_rootfs: yes - asset: cc-tdx-rootfs-image + asset: rootfs-image-tdx steps: - uses: actions/checkout@v3 - name: Build ${{ matrix.asset }} diff --git a/.github/workflows/ci-nightly.yaml b/.github/workflows/ci-nightly.yaml index 
9a47ce0e4..5c7676710 100644 --- a/.github/workflows/ci-nightly.yaml +++ b/.github/workflows/ci-nightly.yaml @@ -4,6 +4,10 @@ on: - cron: '0 0 * * *' workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: kata-containers-ci-on-push: uses: ./.github/workflows/ci.yaml diff --git a/.github/workflows/ci-on-push.yaml b/.github/workflows/ci-on-push.yaml index 6d4cc7fc0..99d483720 100644 --- a/.github/workflows/ci-on-push.yaml +++ b/.github/workflows/ci-on-push.yaml @@ -14,6 +14,11 @@ on: - labeled paths-ignore: - 'docs/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: kata-containers-ci-on-push: if: ${{ contains(github.event.pull_request.labels.*.name, 'ok-to-test') }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index faec7fca4..52a86b08d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -74,3 +74,24 @@ jobs: with: tarball-suffix: -${{ inputs.tag }} commit-hash: ${{ inputs.commit-hash }} + + run-cri-containerd-tests: + needs: build-kata-static-tarball-amd64 + uses: ./.github/workflows/run-cri-containerd-tests.yaml + with: + tarball-suffix: -${{ inputs.tag }} + commit-hash: ${{ inputs.commit-hash }} + + run-nydus-tests: + needs: build-kata-static-tarball-amd64 + uses: ./.github/workflows/run-nydus-tests.yaml + with: + tarball-suffix: -${{ inputs.tag }} + commit-hash: ${{ inputs.commit-hash }} + + run-vfio-tests: + needs: build-kata-static-tarball-amd64 + uses: ./.github/workflows/run-vfio-tests.yaml + with: + tarball-suffix: -${{ inputs.tag }} + commit-hash: ${{ inputs.commit-hash }} diff --git a/.github/workflows/commit-message-check.yaml b/.github/workflows/commit-message-check.yaml index 20be9f688..9a729be93 100644 --- a/.github/workflows/commit-message-check.yaml +++ b/.github/workflows/commit-message-check.yaml @@ -6,6 +6,10 @@ on: - 
reopened - synchronize +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + env: error_msg: |+ See the document below for help on formatting commits for the project. diff --git a/.github/workflows/darwin-tests.yaml b/.github/workflows/darwin-tests.yaml index bf8813776..02bbb0e72 100644 --- a/.github/workflows/darwin-tests.yaml +++ b/.github/workflows/darwin-tests.yaml @@ -6,6 +6,11 @@ on: - reopened - synchronize paths-ignore: [ '**.md', '**.png', '**.jpg', '**.jpeg', '**.svg', '/docs/**' ] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + name: Darwin tests jobs: test: diff --git a/.github/workflows/kata-runtime-classes-sync.yaml b/.github/workflows/kata-runtime-classes-sync.yaml new file mode 100644 index 000000000..9cb995df1 --- /dev/null +++ b/.github/workflows/kata-runtime-classes-sync.yaml @@ -0,0 +1,36 @@ +on: + pull_request: + types: + - opened + - edited + - reopened + - synchronize + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + kata-deploy-runtime-classes-check: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Ensure the split out runtime classes match the all-in-one file + run: | + pushd tools/packaging/kata-deploy/runtimeclasses/ + echo "::group::Combine runtime classes" + for runtimeClass in `find . 
-type f \( -name "*.yaml" -and -not -name "kata-runtimeClasses.yaml" \) | sort`; do + echo "Adding ${runtimeClass} to the resultingRuntimeClasses.yaml" + cat ${runtimeClass} >> resultingRuntimeClasses.yaml; + done + echo "::endgroup::" + echo "::group::Displaying the content of resultingRuntimeClasses.yaml" + cat resultingRuntimeClasses.yaml + echo "::endgroup::" + echo "" + echo "::group::Displaying the content of kata-runtimeClasses.yaml" + cat kata-runtimeClasses.yaml + echo "::endgroup::" + echo "" + diff resultingRuntimeClasses.yaml kata-runtimeClasses.yaml diff --git a/.github/workflows/payload-after-push.yaml b/.github/workflows/payload-after-push.yaml index 871d73388..46766c54b 100644 --- a/.github/workflows/payload-after-push.yaml +++ b/.github/workflows/payload-after-push.yaml @@ -5,6 +5,10 @@ on: - main - stable-* +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-assets-amd64: uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a50313fd0..d732a6723 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -4,6 +4,10 @@ on: tags: - '[0-9]+.[0-9]+.[0-9]+*' +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-and-push-assets-amd64: uses: ./.github/workflows/release-amd64.yaml @@ -117,6 +121,21 @@ jobs: GITHUB_TOKEN=${{ secrets.GIT_UPLOAD_TOKEN }} hub release edit -m "" -a "${tarball}" "${tag}" popd + upload-versions-yaml: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: upload versions.yaml + env: + GITHUB_TOKEN: ${{ secrets.GIT_UPLOAD_TOKEN }} + run: | + tag=$(echo $GITHUB_REF | cut -d/ -f3-) + pushd $GITHUB_WORKSPACE + versions_file="kata-containers-$tag-versions.yaml" + cp versions.yaml ${versions_file} + hub release edit -m "" -a 
"${versions_file}" "${tag}" + popd + upload-cargo-vendored-tarball: needs: upload-multi-arch-static-tarball runs-on: ubuntu-latest diff --git a/.github/workflows/require-pr-porting-labels.yaml b/.github/workflows/require-pr-porting-labels.yaml index 585e86bc4..b16e5c371 100644 --- a/.github/workflows/require-pr-porting-labels.yaml +++ b/.github/workflows/require-pr-porting-labels.yaml @@ -15,6 +15,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: check-pr-porting-labels: runs-on: ubuntu-latest diff --git a/.github/workflows/run-cri-containerd-tests.yaml b/.github/workflows/run-cri-containerd-tests.yaml new file mode 100644 index 000000000..4b439733b --- /dev/null +++ b/.github/workflows/run-cri-containerd-tests.yaml @@ -0,0 +1,42 @@ +name: CI | Run cri-containerd tests +on: + workflow_call: + inputs: + tarball-suffix: + required: false + type: string + commit-hash: + required: false + type: string + +jobs: + run-cri-containerd: + strategy: + fail-fast: true + matrix: + containerd_version: ['lts', 'active'] + vmm: ['clh', 'qemu'] + runs-on: garm-ubuntu-2204 + env: + CONTAINERD_VERSION: ${{ matrix.containerd_version }} + GOPATH: ${{ github.workspace }} + KATA_HYPERVISOR: ${{ matrix.vmm }} + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ inputs.commit-hash }} + + - name: Install dependencies + run: bash tests/integration/cri-containerd/gha-run.sh install-dependencies + + - name: get-kata-tarball + uses: actions/download-artifact@v3 + with: + name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} + path: kata-artifacts + + - name: Install kata + run: bash tests/integration/cri-containerd/gha-run.sh install-kata kata-artifacts + + - name: Run cri-containerd tests + run: bash tests/integration/cri-containerd/gha-run.sh run diff --git a/.github/workflows/run-k8s-tests-on-aks.yaml b/.github/workflows/run-k8s-tests-on-aks.yaml index 
d8658270a..130be1829 100644 --- a/.github/workflows/run-k8s-tests-on-aks.yaml +++ b/.github/workflows/run-k8s-tests-on-aks.yaml @@ -40,37 +40,43 @@ jobs: GH_PR_NUMBER: ${{ inputs.pr-number }} KATA_HOST_OS: ${{ matrix.host_os }} KATA_HYPERVISOR: ${{ matrix.vmm }} + USING_NFD: "false" steps: - uses: actions/checkout@v3 with: ref: ${{ inputs.commit-hash }} - name: Download Azure CLI - run: bash tests/integration/gha-run.sh install-azure-cli + run: bash tests/integration/kubernetes/gha-run.sh install-azure-cli - name: Log into the Azure account - run: bash tests/integration/gha-run.sh login-azure + run: bash tests/integration/kubernetes/gha-run.sh login-azure env: AZ_APPID: ${{ secrets.AZ_APPID }} AZ_PASSWORD: ${{ secrets.AZ_PASSWORD }} AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }} - name: Create AKS cluster - run: bash tests/integration/gha-run.sh create-cluster + timeout-minutes: 10 + run: bash tests/integration/kubernetes/gha-run.sh create-cluster - name: Install `bats` - run: bash tests/integration/gha-run.sh install-bats + run: bash tests/integration/kubernetes/gha-run.sh install-bats - name: Install `kubectl` - run: bash tests/integration/gha-run.sh install-kubectl + run: bash tests/integration/kubernetes/gha-run.sh install-kubectl - name: Download credentials for the Kubernetes CLI to use them - run: bash tests/integration/gha-run.sh get-cluster-credentials + run: bash tests/integration/kubernetes/gha-run.sh get-cluster-credentials + - name: Deploy Kata + timeout-minutes: 10 + run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-aks + - name: Run tests timeout-minutes: 60 - run: bash tests/integration/gha-run.sh run-tests-aks + run: bash tests/integration/kubernetes/gha-run.sh run-tests - name: Delete AKS cluster if: always() - run: bash tests/integration/gha-run.sh delete-cluster + run: bash tests/integration/kubernetes/gha-run.sh delete-cluster diff --git a/.github/workflows/run-k8s-tests-on-sev.yaml b/.github/workflows/run-k8s-tests-on-sev.yaml index 
3fc4ca835..a48425e1f 100644 --- a/.github/workflows/run-k8s-tests-on-sev.yaml +++ b/.github/workflows/run-k8s-tests-on-sev.yaml @@ -29,15 +29,20 @@ jobs: DOCKER_TAG: ${{ inputs.tag }} KATA_HYPERVISOR: ${{ matrix.vmm }} KUBECONFIG: /home/kata/.kube/config + USING_NFD: "false" steps: - uses: actions/checkout@v3 with: ref: ${{ inputs.commit-hash }} + - name: Deploy Kata + timeout-minutes: 10 + run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-sev + - name: Run tests timeout-minutes: 30 - run: bash tests/integration/gha-run.sh run-tests-sev + run: bash tests/integration/kubernetes/gha-run.sh run-tests - name: Delete kata-deploy if: always() - run: bash tests/integration/gha-run.sh cleanup-sev + run: bash tests/integration/kubernetes/gha-run.sh cleanup-sev diff --git a/.github/workflows/run-k8s-tests-on-snp.yaml b/.github/workflows/run-k8s-tests-on-snp.yaml index 8aa1763d2..7196a9a1b 100644 --- a/.github/workflows/run-k8s-tests-on-snp.yaml +++ b/.github/workflows/run-k8s-tests-on-snp.yaml @@ -29,15 +29,20 @@ jobs: DOCKER_TAG: ${{ inputs.tag }} KATA_HYPERVISOR: ${{ matrix.vmm }} KUBECONFIG: /home/kata/.kube/config + USING_NFD: "false" steps: - uses: actions/checkout@v3 with: ref: ${{ inputs.commit-hash }} + - name: Deploy Kata + timeout-minutes: 10 + run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-snp + - name: Run tests timeout-minutes: 30 - run: bash tests/integration/gha-run.sh run-tests-snp - + run: bash tests/integration/kubernetes/gha-run.sh run-tests + - name: Delete kata-deploy if: always() - run: bash tests/integration/gha-run.sh cleanup-snp + run: bash tests/integration/kubernetes/gha-run.sh cleanup-snp diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml index ccbc16db7..a3899177c 100644 --- a/.github/workflows/run-k8s-tests-on-tdx.yaml +++ b/.github/workflows/run-k8s-tests-on-tdx.yaml @@ -28,16 +28,20 @@ jobs: DOCKER_REPO: ${{ inputs.repo }} DOCKER_TAG: ${{ inputs.tag }} 
KATA_HYPERVISOR: ${{ matrix.vmm }} - KUBECONFIG: /etc/rancher/k3s/k3s.yaml + USING_NFD: "true" steps: - uses: actions/checkout@v3 with: ref: ${{ inputs.commit-hash }} + - name: Deploy Kata + timeout-minutes: 10 + run: bash tests/integration/kubernetes/gha-run.sh deploy-kata-tdx + - name: Run tests timeout-minutes: 30 - run: bash tests/integration/gha-run.sh run-tests-tdx - + run: bash tests/integration/kubernetes/gha-run.sh run-tests + - name: Delete kata-deploy if: always() - run: bash tests/integration/gha-run.sh cleanup-tdx + run: bash tests/integration/kubernetes/gha-run.sh cleanup-tdx diff --git a/.github/workflows/run-metrics.yaml b/.github/workflows/run-metrics.yaml index 92a5f8af9..f1ac9d61a 100644 --- a/.github/workflows/run-metrics.yaml +++ b/.github/workflows/run-metrics.yaml @@ -46,6 +46,9 @@ jobs: - name: run blogbench test run: bash tests/metrics/gha-run.sh run-test-blogbench + - name: run tensorflow test + run: bash tests/metrics/gha-run.sh run-test-tensorflow + - name: make metrics tarball ${{ matrix.vmm }} run: bash tests/metrics/gha-run.sh make-tarball-results diff --git a/.github/workflows/run-nydus-tests.yaml b/.github/workflows/run-nydus-tests.yaml new file mode 100644 index 000000000..647582c08 --- /dev/null +++ b/.github/workflows/run-nydus-tests.yaml @@ -0,0 +1,42 @@ +name: CI | Run nydus tests +on: + workflow_call: + inputs: + tarball-suffix: + required: false + type: string + commit-hash: + required: false + type: string + +jobs: + run-nydus: + strategy: + fail-fast: true + matrix: + containerd_version: ['lts', 'active'] + vmm: ['clh', 'qemu', 'dragonball'] + runs-on: garm-ubuntu-2204 + env: + CONTAINERD_VERSION: ${{ matrix.containerd_version }} + GOPATH: ${{ github.workspace }} + KATA_HYPERVISOR: ${{ matrix.vmm }} + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ inputs.commit-hash }} + + - name: Install dependencies + run: bash tests/integration/nydus/gha-run.sh install-dependencies + + - name: get-kata-tarball + uses: 
actions/download-artifact@v3 + with: + name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} + path: kata-artifacts + + - name: Install kata + run: bash tests/integration/nydus/gha-run.sh install-kata kata-artifacts + + - name: Run nydus tests + run: bash tests/integration/nydus/gha-run.sh run diff --git a/.github/workflows/run-vfio-tests.yaml b/.github/workflows/run-vfio-tests.yaml new file mode 100644 index 000000000..ba34d2088 --- /dev/null +++ b/.github/workflows/run-vfio-tests.yaml @@ -0,0 +1,37 @@ +name: CI | Run vfio tests +on: + workflow_call: + inputs: + tarball-suffix: + required: false + type: string + commit-hash: + required: false + type: string + +jobs: + run-vfio: + strategy: + fail-fast: false + matrix: + vmm: ['clh', 'qemu'] + runs-on: garm-ubuntu-2204 + env: + GOPATH: ${{ github.workspace }} + KATA_HYPERVISOR: ${{ matrix.vmm }} + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ inputs.commit-hash }} + + - name: Install dependencies + run: bash tests/functional/vfio/gha-run.sh install-dependencies + + - name: get-kata-tarball + uses: actions/download-artifact@v3 + with: + name: kata-static-tarball-amd64${{ inputs.tarball-suffix }} + path: kata-artifacts + + - name: Run vfio tests + run: bash tests/functional/vfio/gha-run.sh run diff --git a/.github/workflows/static-checks-dragonball.yaml b/.github/workflows/static-checks-dragonball.yaml index 61e3fe2c4..d47689e3a 100644 --- a/.github/workflows/static-checks-dragonball.yaml +++ b/.github/workflows/static-checks-dragonball.yaml @@ -7,10 +7,14 @@ on: - synchronize paths-ignore: [ '**.md', '**.png', '**.jpg', '**.jpeg', '**.svg', '/docs/**' ] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + name: Static checks dragonball jobs: test-dragonball: - runs-on: self-hosted + runs-on: dragonball env: RUST_BACKTRACE: "1" steps: diff --git a/.github/workflows/static-checks.yaml b/.github/workflows/static-checks.yaml 
index 616e9f5ab..bd9f76027 100644 --- a/.github/workflows/static-checks.yaml +++ b/.github/workflows/static-checks.yaml @@ -6,6 +6,10 @@ on: - reopened - synchronize +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + name: Static checks jobs: static-checks: diff --git a/Makefile b/Makefile index e70af93e4..0765ae2b6 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,10 @@ TOOLS += trace-forwarder STANDARD_TARGETS = build check clean install static-checks-build test vendor +# Variables for the build-and-publish-kata-debug target +KATA_DEBUG_REGISTRY ?= "" +KATA_DEBUG_TAG ?= "" + default: all include utils.mk @@ -44,6 +48,9 @@ static-checks: static-checks-build docs-url-alive-check: bash ci/docs-url-alive-check.sh +build-and-publish-kata-debug: + bash tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh ${KATA_DEBUG_REGISTRY} ${KATA_DEBUG_TAG} + .PHONY: \ all \ kata-tarball \ diff --git a/README.md b/README.md index 78a62179c..d34110056 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ The table below lists the remaining parts of the project: | [packaging](tools/packaging) | infrastructure | Scripts and metadata for producing packaged binaries
(components, hypervisors, kernel and rootfs). | | [kernel](https://www.kernel.org) | kernel | Linux kernel used by the hypervisor to boot the guest image. Patches are stored [here](tools/packaging/kernel). | | [osbuilder](tools/osbuilder) | infrastructure | Tool to create "mini O/S" rootfs and initrd images and kernel for the hypervisor. | +| [kata-debug](tools/packaging/kata-debug/README.md) | infrastructure | Utility tool to gather Kata Containers debug information from Kubernetes clusters. | | [`agent-ctl`](src/tools/agent-ctl) | utility | Tool that provides low-level access for testing the agent. | | [`kata-ctl`](src/tools/kata-ctl) | utility | Tool that provides advanced commands and debug facilities. | | [`log-parser-rs`](src/tools/log-parser-rs) | utility | Tool that aid in analyzing logs from the kata runtime. | diff --git a/VERSION b/VERSION index bb48c8b0a..ed590bd2a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.2.0-alpha3 +3.2.0-alpha4 diff --git a/docs/design/architecture/kubernetes.md b/docs/design/architecture/kubernetes.md index be8165d75..8d4d3144c 100644 --- a/docs/design/architecture/kubernetes.md +++ b/docs/design/architecture/kubernetes.md @@ -3,11 +3,11 @@ [Kubernetes](https://github.com/kubernetes/kubernetes/), or K8s, is a popular open source container orchestration engine. In Kubernetes, a set of containers sharing resources such as networking, storage, mount, PID, etc. is called a -[pod](https://kubernetes.io/docs/user-guide/pods/). +[pod](https://kubernetes.io/docs/concepts/workloads/pods/). A node can have multiple pods, but at a minimum, a node within a Kubernetes cluster only needs to run a container runtime and a container agent (called a -[Kubelet](https://kubernetes.io/docs/admin/kubelet/)). +[Kubelet](https://kubernetes.io/docs/concepts/overview/components/#kubelet)). Kata Containers represents a Kubelet pod as a VM. 
diff --git a/src/agent/Cargo.lock b/src/agent/Cargo.lock index c984fbf47..9222389e2 100644 --- a/src/agent/Cargo.lock +++ b/src/agent/Cargo.lock @@ -2081,6 +2081,7 @@ dependencies = [ "slog", "slog-scope", "slog-stdlog", + "slog-term", "tempfile", "test-utils", "thiserror", @@ -2100,6 +2101,7 @@ dependencies = [ name = "kata-sys-util" version = "0.1.0" dependencies = [ + "anyhow", "byteorder", "cgroups-rs", "chrono", diff --git a/src/agent/Cargo.toml b/src/agent/Cargo.toml index 358412a85..cf8f97209 100644 --- a/src/agent/Cargo.toml +++ b/src/agent/Cargo.toml @@ -44,6 +44,7 @@ ipnetwork = "0.17.0" logging = { path = "../libs/logging" } slog = "2.5.2" slog-scope = "4.1.2" +slog-term = "2.9.0" # Redirect ttrpc log calls slog-stdlog = "4.0.0" diff --git a/src/agent/Makefile b/src/agent/Makefile index 69423edda..ba065b4d0 100644 --- a/src/agent/Makefile +++ b/src/agent/Makefile @@ -26,7 +26,7 @@ export VERSION_COMMIT := $(if $(COMMIT),$(VERSION)-$(COMMIT),$(VERSION)) EXTRA_RUSTFEATURES := ##VAR SECCOMP=yes|no define if agent enables seccomp feature -SECCOMP := yes +SECCOMP ?= yes # Enable seccomp feature of rust build ifeq ($(SECCOMP),yes) diff --git a/src/agent/rustjail/src/mount.rs b/src/agent/rustjail/src/mount.rs index d9ba15041..b822736dc 100644 --- a/src/agent/rustjail/src/mount.rs +++ b/src/agent/rustjail/src/mount.rs @@ -1118,6 +1118,7 @@ mod tests { use std::fs::create_dir; use std::fs::create_dir_all; use std::fs::remove_dir_all; + use std::fs::remove_file; use std::io; use std::os::unix::fs; use std::os::unix::io::AsRawFd; @@ -1333,14 +1334,9 @@ mod tests { fn test_mknod_dev() { skip_if_not_root!(); - let tempdir = tempdir().unwrap(); - - let olddir = unistd::getcwd().unwrap(); - defer!(let _ = unistd::chdir(&olddir);); - let _ = unistd::chdir(tempdir.path()); - + let path = "/dev/fifo-test"; let dev = oci::LinuxDevice { - path: "/fifo".to_string(), + path: path.to_string(), r#type: "c".to_string(), major: 0, minor: 0, @@ -1348,13 +1344,16 @@ mod tests { 
uid: Some(unistd::getuid().as_raw()), gid: Some(unistd::getgid().as_raw()), }; - let path = Path::new("fifo"); - let ret = mknod_dev(&dev, path); + let ret = mknod_dev(&dev, Path::new(path)); assert!(ret.is_ok(), "Should pass. Got: {:?}", ret); let ret = stat::stat(path); assert!(ret.is_ok(), "Should pass. Got: {:?}", ret); + + // clear test device node + let ret = remove_file(path); + assert!(ret.is_ok(), "Should pass, Got: {:?}", ret); } #[test] diff --git a/src/agent/rustjail/src/process.rs b/src/agent/rustjail/src/process.rs index 0e7fe73ef..cdecae130 100644 --- a/src/agent/rustjail/src/process.rs +++ b/src/agent/rustjail/src/process.rs @@ -161,7 +161,7 @@ impl Process { pub fn notify_term_close(&mut self) { let notify = self.term_exit_notifier.clone(); - notify.notify_one(); + notify.notify_waiters(); } pub fn close_stdin(&mut self) { diff --git a/src/agent/src/linux_abi.rs b/src/agent/src/linux_abi.rs index de131faf0..b87da3ceb 100644 --- a/src/agent/src/linux_abi.rs +++ b/src/agent/src/linux_abi.rs @@ -33,7 +33,7 @@ pub fn create_pci_root_bus_path() -> String { // check if there is pci bus path for acpi acpi_sysfs_dir.push_str(&acpi_root_bus_path); - if let Ok(_) = fs::metadata(&acpi_sysfs_dir) { + if fs::metadata(&acpi_sysfs_dir).is_ok() { return acpi_root_bus_path; } diff --git a/src/agent/src/mount.rs b/src/agent/src/mount.rs index 5b0d95c19..d80aea4c6 100644 --- a/src/agent/src/mount.rs +++ b/src/agent/src/mount.rs @@ -36,6 +36,7 @@ use crate::Sandbox; use crate::{ccw, device::get_virtio_blk_ccw_device_name}; use anyhow::{anyhow, Context, Result}; use slog::Logger; + use tracing::instrument; pub const TYPE_ROOTFS: &str = "rootfs"; @@ -145,6 +146,11 @@ pub const STORAGE_HANDLER_LIST: &[&str] = &[ DRIVER_WATCHABLE_BIND_TYPE, ]; +#[instrument] +pub fn get_mounts() -> Result { + fs::read_to_string("/proc/mounts") +} + #[instrument] pub fn baremount( source: &Path, @@ -168,6 +174,31 @@ pub fn baremount( return Err(anyhow!("need mount FS type")); } + let 
destination_str = destination.to_string_lossy(); + let mounts = get_mounts().unwrap_or_else(|_| String::new()); + let already_mounted = mounts + .lines() + .map(|line| line.split_whitespace().collect::>()) + .filter(|parts| parts.len() >= 3) // ensure we have at least [source}, destination, and fs_type + .any(|parts| { + // Check if source, destination and fs_type match any entry in /proc/mounts + // minimal check is for destination an fstype since source can have different names like: + // udev /dev devtmpfs + // dev /dev devtmpfs + // depending on which entity is mounting the dev/fs/pseudo-fs + parts[1] == destination_str && parts[2] == fs_type + }); + + if already_mounted { + slog_info!( + logger, + "{:?} is already mounted at {:?}", + source, + destination + ); + return Ok(()); + } + info!( logger, "baremount source={:?}, dest={:?}, fs_type={:?}, options={:?}, flags={:?}", @@ -725,6 +756,14 @@ pub fn recursive_ownership_change( mask |= EXEC_MASK; mask |= MODE_SETGID; } + + // We do not want to change the permission of the underlying file + // using symlink. Hence we skip symlinks from recursive ownership + // and permission changes. 
+ if path.is_symlink() { + return Ok(()); + } + nix::unistd::chown(path, uid, gid)?; if gid.is_some() { @@ -1102,6 +1141,7 @@ fn parse_options(option_list: Vec) -> HashMap { mod tests { use super::*; use protocols::agent::FSGroup; + use slog::Drain; use std::fs::File; use std::fs::OpenOptions; use std::io::Write; @@ -1112,6 +1152,31 @@ mod tests { skip_if_not_root, skip_loop_by_user, skip_loop_if_not_root, skip_loop_if_root, }; + #[test] + fn test_already_baremounted() { + let plain = slog_term::PlainSyncDecorator::new(std::io::stdout()); + let logger = Logger::root(slog_term::FullFormat::new(plain).build().fuse(), o!()); + + let test_cases = [ + ("dev", "/dev", "devtmpfs"), + ("udev", "/dev", "devtmpfs"), + ("proc", "/proc", "proc"), + ("sysfs", "/sys", "sysfs"), + ]; + + for &(source, destination, fs_type) in &test_cases { + let source = Path::new(source); + let destination = Path::new(destination); + let flags = MsFlags::MS_RDONLY; + let options = "mode=755"; + println!( + "testing if already mounted baremount({:?} {:?} {:?})", + source, destination, fs_type + ); + assert!(baremount(source, destination, fs_type, flags, options, &logger).is_ok()); + } + } + #[test] fn test_mount() { #[derive(Debug)] diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 4e7429e49..439521fae 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -665,15 +665,16 @@ impl AgentService { let cid = req.container_id; let eid = req.exec_id; - let mut term_exit_notifier = Arc::new(tokio::sync::Notify::new()); + let term_exit_notifier; let reader = { let s = self.sandbox.clone(); let mut sandbox = s.lock().await; let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?; + term_exit_notifier = p.term_exit_notifier.clone(); + if p.term_master.is_some() { - term_exit_notifier = p.term_exit_notifier.clone(); p.get_reader(StreamType::TermMaster) } else if stdout { if p.parent_stdout.is_some() { @@ -693,9 +694,12 @@ impl AgentService { let reader = 
reader.ok_or_else(|| anyhow!("cannot get stream reader"))?; tokio::select! { - _ = term_exit_notifier.notified() => { - Err(anyhow!("eof")) - } + // Poll the futures in the order they appear from top to bottom + // it is very important to avoid data loss. If there is still + // data in the buffer and read_stream branch will return + // Poll::Ready so that the term_exit_notifier will never polled + // before all data were read. + biased; v = read_stream(reader, req.len as usize) => { let vector = v?; let mut resp = ReadStreamResponse::new(); @@ -703,6 +707,9 @@ impl AgentService { Ok(resp) } + _ = term_exit_notifier.notified() => { + Err(anyhow!("eof")) + } } } diff --git a/src/agent/src/sandbox.rs b/src/agent/src/sandbox.rs index 24b678747..b0caa0154 100644 --- a/src/agent/src/sandbox.rs +++ b/src/agent/src/sandbox.rs @@ -435,7 +435,7 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res } // max wait for all CPUs to online will use 50 * 100 = 5 seconds. -const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50; +const ONLINE_CPUMEM_WAIT_MILLIS: u64 = 50; const ONLINE_CPUMEM_MAX_RETRIES: i32 = 100; #[instrument] @@ -465,7 +465,7 @@ fn online_cpus(logger: &Logger, num: i32) -> Result { ); return Ok(num); } - thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS)); + thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WAIT_MILLIS)); } Err(anyhow!( diff --git a/src/agent/src/signal.rs b/src/agent/src/signal.rs index d67000b80..401ded953 100644 --- a/src/agent/src/signal.rs +++ b/src/agent/src/signal.rs @@ -57,7 +57,7 @@ async fn handle_sigchild(logger: Logger, sandbox: Arc>) -> Result continue; } - let mut p = process.unwrap(); + let p = process.unwrap(); let ret: i32 = match wait_status { WaitStatus::Exited(_, c) => c, diff --git a/src/dragonball/Cargo.lock b/src/dragonball/Cargo.lock index 0ed990c0b..b71455729 100644 --- a/src/dragonball/Cargo.lock +++ b/src/dragonball/Cargo.lock @@ -210,8 +210,6 @@ dependencies = [ [[package]] name 
= "dbs-address-space" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95e20d28a9cd13bf00d0ecd1bd073d242242b04f0acb663d7adfc659f8879322" dependencies = [ "arc-swap", "lazy_static", @@ -225,8 +223,6 @@ dependencies = [ [[package]] name = "dbs-allocator" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "543711b94b4bc1437d2ebb45f856452e96a45a67ab39f8dcf8c887c2a3701004" dependencies = [ "thiserror", ] @@ -234,8 +230,6 @@ dependencies = [ [[package]] name = "dbs-arch" version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194c844946cd1d13f7a9eb29b84afbc5354578eee2b06fea96226bc3872e7424" dependencies = [ "kvm-bindings", "kvm-ioctls", @@ -249,8 +243,6 @@ dependencies = [ [[package]] name = "dbs-boot" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5466a92f75aa928a9103dcb2088f6d1638ef9da8945fad7389a73864dfa0182c" dependencies = [ "dbs-arch", "kvm-bindings", @@ -265,8 +257,6 @@ dependencies = [ [[package]] name = "dbs-device" version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14ecea44b4bc861c0c2ccb51868bea781286dc70e40ae46b54d4511e690a654a" dependencies = [ "thiserror", ] @@ -274,8 +264,6 @@ dependencies = [ [[package]] name = "dbs-interrupt" version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eb2c5bb9f8f123ace33b1b2e8d53dd2d87331ee770ad1f82e56c3382c6bed6d" dependencies = [ "dbs-arch", "dbs-device", @@ -288,11 +276,10 @@ dependencies = [ [[package]] name = "dbs-legacy-devices" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4d089ac1c4d186c8133be59de09462e9793f7add10017c5b040318a3a7f431f" dependencies = [ "dbs-device", "dbs-utils", + "libc", "log", "serde", "vm-superio", @@ -302,8 +289,6 @@ dependencies = [ [[package]] name = "dbs-upcall" version = 
"0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea3a78128fd0be8b8b10257675c262b378dc5d00b1e18157736a6c27e45ce4fb" dependencies = [ "anyhow", "dbs-utils", @@ -316,8 +301,6 @@ dependencies = [ [[package]] name = "dbs-utils" version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb6ff873451b76e22789af7fbe1d0478c42c717f817e66908be7a3a2288068c" dependencies = [ "anyhow", "event-manager", @@ -332,8 +315,6 @@ dependencies = [ [[package]] name = "dbs-virtio-devices" version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d671cc3e5f98b84ef6b6bed007d28f72f16d3aea8eb38e2d42b00b2973c1d8" dependencies = [ "byteorder", "caps", @@ -349,9 +330,10 @@ dependencies = [ "log", "nix 0.24.3", "nydus-api", - "nydus-blobfs", "nydus-rafs", + "nydus-storage", "rlimit", + "sendfd", "serde", "serde_json", "thiserror", @@ -498,10 +480,25 @@ dependencies = [ ] [[package]] -name = "fuse-backend-rs" -version = "0.10.2" +name = "foreign-types" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af89cb80a7c5693bd63a2b1ee7ac31a307670977c18fda036b3aa94be8c47f" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "fuse-backend-rs" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc24820b14267bec37fa87f5c2a32b5f1c5405b8c60cc3aa77afd481bd2628a6" dependencies = [ "arc-swap", "bitflags", @@ -518,95 +515,6 @@ dependencies = [ "vmm-sys-util 0.10.0", ] -[[package]] -name = "futures" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" - -[[package]] -name = "futures-executor" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" - -[[package]] -name = "futures-macro" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" - -[[package]] -name = "futures-task" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" - -[[package]] -name = "futures-util" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - [[package]] name = "generic-array" version = "0.14.6" @@ -891,82 +799,45 @@ dependencies = [ [[package]] name = "nydus-api" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1899def1a22ed32b1d60de4e444f525c4023a208ee0d1136a65399cff82837ce" +checksum = "33a6ca41dd10813e3d29397550fbb0f15ad149381f312e04659d39e0adcf2002" dependencies = [ + "backtrace", "libc", "log", - "nydus-error", "serde", "serde_json", "toml", ] -[[package]] -name = "nydus-blobfs" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784cf6e1319da7a94734987dcc71d2940f74231256922431a505c832fc778dd3" -dependencies = [ - "fuse-backend-rs", - "libc", - "log", - "nydus-api", - "nydus-error", - "nydus-rafs", - "nydus-storage", - "serde", - "serde_json", - "vm-memory", -] - -[[package]] -name = "nydus-error" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae2ec1efd1589377dbefca6b1047294c71b2fbab164d93319f97b20faae92001" -dependencies = [ - "backtrace", - "httpdate", - "libc", - "log", - "serde", - "serde_json", -] - [[package]] name = "nydus-rafs" -version = "0.2.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0ace6945daa16842e72e9fe7647e2b8715856f50f07350cce82bd68db1ed02c" +checksum = "ed21e44a99472850d2afc4fb07427ed46d4e6a8b1cce28b42bd689319e45076d" dependencies = [ "anyhow", "arc-swap", "bitflags", - "blake3", "fuse-backend-rs", - "futures", "lazy_static", "libc", "log", - "lz4-sys", "nix 0.24.3", "nydus-api", - "nydus-error", "nydus-storage", "nydus-utils", "serde", "serde_json", - "spmc", "vm-memory", ] [[package]] name = "nydus-storage" 
-version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08bc5ea9054fca2ec8b19dcce25ea600679b7fbf035aad86cfe4a659002c88b" +checksum = "9591fbee1875895bf1f765656695d0be6887fe65372fbf4924b8b3959bd61375" dependencies = [ "arc-swap", "bitflags", @@ -978,7 +849,6 @@ dependencies = [ "log", "nix 0.24.3", "nydus-api", - "nydus-error", "nydus-utils", "serde", "serde_json", @@ -989,12 +859,13 @@ dependencies = [ [[package]] name = "nydus-utils" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1e681d7207a1ec500323d5ca39ebb7e381fc4f14db5ff0c532c18ff1226a81f" +checksum = "fe8b9269e3a370682f272a1b2cac4bdaf6d6657f3f6966560c4fedab36548362" dependencies = [ "blake3", "flate2", + "httpdate", "lazy_static", "libc", "libz-sys", @@ -1002,7 +873,8 @@ dependencies = [ "lz4", "lz4-sys", "nix 0.24.3", - "nydus-error", + "nydus-api", + "openssl", "serde", "serde_json", "sha2", @@ -1025,6 +897,54 @@ version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +[[package]] +name = "openssl" +version = "0.10.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.26", +] + +[[package]] +name = "openssl-src" +version = "111.26.0+1.1.1u" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efc62c9f12b22b8f5208c23a7200a442b2e5999f8bdf80233852122b5a4f6f37" +dependencies = [ 
+ "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + [[package]] name = "parking_lot" version = "0.12.1" @@ -1054,12 +974,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - [[package]] name = "pkg-config" version = "0.3.26" @@ -1068,18 +982,18 @@ checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" [[package]] name = "proc-macro2" -version = "1.0.51" +version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "5fe8a65d69dd0808184ebb5f836ab526bb259db23c657efa38711b1072ee47f0" dependencies = [ "proc-macro2", ] @@ -1166,6 +1080,15 @@ dependencies = [ "libc", ] +[[package]] +name = "sendfd" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "604b71b8fc267e13bb3023a2c901126c8f349393666a6d98ac1ae5729b701798" +dependencies = [ + "libc", +] + [[package]] name = "serde" version = "1.0.156" @@ -1183,7 +1106,7 @@ checksum = "d7e29c4601e36bcec74a223228dce795f4cd3616341a4af93520ca1a837c087d" dependencies = [ "proc-macro2", 
"quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1275,12 +1198,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "spmc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02a8428da277a8e3a15271d79943e80ccc2ef254e78813a166a08d65e4c3ece5" - [[package]] name = "subtle" version = "2.4.1" @@ -1298,6 +1215,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn" +version = "2.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "take_mut" version = "0.2.2" @@ -1350,7 +1278,7 @@ checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1434,7 +1362,7 @@ checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] @@ -1480,7 +1408,7 @@ checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 1.0.109", ] [[package]] diff --git a/src/dragonball/Cargo.toml b/src/dragonball/Cargo.toml index b0e03047d..08bd34a8f 100644 --- a/src/dragonball/Cargo.toml +++ b/src/dragonball/Cargo.toml @@ -12,16 +12,16 @@ edition = "2018" [dependencies] arc-swap = "1.5.0" bytes = "1.1.0" -dbs-address-space = "0.3.0" -dbs-allocator = "0.1.0" -dbs-arch = "0.2.0" -dbs-boot = "0.4.0" -dbs-device = "0.2.0" -dbs-interrupt = { version = "0.2.0", features = ["kvm-irq"] } -dbs-legacy-devices = "0.1.0" -dbs-upcall = { version = "0.3.0", optional = true } -dbs-utils = "0.2.0" -dbs-virtio-devices = { version = "0.3.1", optional = true, features = ["virtio-mmio"] } +dbs-address-space = { path = "./src/dbs_address_space" } +dbs-allocator = { path = "./src/dbs_allocator" } +dbs-arch = { 
path = "./src/dbs_arch" } +dbs-boot = { path = "./src/dbs_boot" } +dbs-device = { path = "./src/dbs_device" } +dbs-interrupt = { path = "./src/dbs_interrupt", features = ["kvm-irq"] } +dbs-legacy-devices = { path = "./src/dbs_legacy_devices" } +dbs-upcall = { path = "./src/dbs_upcall" , optional = true } +dbs-utils = { path = "./src/dbs_utils" } +dbs-virtio-devices = { path = "./src/dbs_virtio_devices", optional = true, features = ["virtio-mmio"] } kvm-bindings = "0.6.0" kvm-ioctls = "0.12.0" lazy_static = "1.2" diff --git a/src/dragonball/Makefile b/src/dragonball/Makefile index ab8e5b694..68ee3bd46 100644 --- a/src/dragonball/Makefile +++ b/src/dragonball/Makefile @@ -39,12 +39,15 @@ clean: test: ifdef SUPPORT_VIRTUALIZATION - cargo test --all-features --target $(TRIPLE) -- --nocapture + RUST_BACKTRACE=1 cargo test --all-features --target $(TRIPLE) -- --nocapture --test-threads=1 else @echo "INFO: skip testing dragonball, it need virtualization support." exit 0 endif +coverage: + RUST_BACKTRACE=1 cargo llvm-cov --all-features --target $(TRIPLE) -- --nocapture --test-threads=1 + endif # ifeq ($(ARCH), s390x) .DEFAULT_GOAL := default diff --git a/src/dragonball/README.md b/src/dragonball/README.md index 3fde0782e..767b9af47 100644 --- a/src/dragonball/README.md +++ b/src/dragonball/README.md @@ -16,10 +16,22 @@ and configuration process. 
# Documentation -Device: [Device Document](docs/device.md) -vCPU: [vCPU Document](docs/vcpu.md) -API: [API Document](docs/api.md) -`Upcall`: [`Upcall` Document](docs/upcall.md) +- Device: [Device Document](docs/device.md) +- vCPU: [vCPU Document](docs/vcpu.md) +- API: [API Document](docs/api.md) +- `Upcall`: [`Upcall` Document](docs/upcall.md) +- `dbs_acpi`: [`dbs_acpi` Document](src/dbs_acpi/README.md) +- `dbs_address_space`: [`dbs_address_space` Document](src/dbs_address_space/README.md) +- `dbs_allocator`: [`dbs_allocator` Document](src/dbs_allocator/README.md) +- `dbs_arch`: [`dbs_arch` Document](src/dbs_arch/README.md) +- `dbs_boot`: [`dbs_boot` Document](src/dbs_boot/README.md) +- `dbs_device`: [`dbs_device` Document](src/dbs_device/README.md) +- `dbs_interrupt`: [`dbs_interrput` Document](src/dbs_interrupt/README.md) +- `dbs_legacy_devices`: [`dbs_legacy_devices` Document](src/dbs_legacy_devices/README.md) +- `dbs_tdx`: [`dbs_tdx` Document](src/dbs_tdx/README.md) +- `dbs_upcall`: [`dbs_upcall` Document](src/dbs_upcall/README.md) +- `dbs_utils`: [`dbs_utils` Document](src/dbs_utils/README.md) +- `dbs_virtio_devices`: [`dbs_virtio_devices` Document](src/dbs_virtio_devices/README.md) Currently, the documents are still actively adding. You could see the [official documentation](docs/) page for more details. diff --git a/src/dragonball/src/dbs_acpi/Cargo.toml b/src/dragonball/src/dbs_acpi/Cargo.toml new file mode 100644 index 000000000..df5e7867a --- /dev/null +++ b/src/dragonball/src/dbs_acpi/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "dbs-acpi" +version = "0.1.0" +authors = ["Alibaba Dragonball Team"] +description = "acpi definitions for virtual machines." 
+license = "Apache-2.0" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "acpi", "vmm", "secure-sandbox"] +readme = "README.md" + +[dependencies] +vm-memory = "0.9.0" \ No newline at end of file diff --git a/src/dragonball/src/dbs_acpi/README.md b/src/dragonball/src/dbs_acpi/README.md new file mode 100644 index 000000000..cc2b49754 --- /dev/null +++ b/src/dragonball/src/dbs_acpi/README.md @@ -0,0 +1,11 @@ +# dbs-acpi + +`dbs-acpi` provides ACPI data structures for VMM to emulate ACPI behavior. + +## Acknowledgement + +Part of the code is derived from the [Cloud Hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor) project. + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/src/dragonball/src/dbs_acpi/src/lib.rs b/src/dragonball/src/dbs_acpi/src/lib.rs new file mode 100644 index 000000000..a3094e309 --- /dev/null +++ b/src/dragonball/src/dbs_acpi/src/lib.rs @@ -0,0 +1,29 @@ +// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +pub mod rsdp; +pub mod sdt; + +fn generate_checksum(data: &[u8]) -> u8 { + (255 - data.iter().fold(0u8, |acc, x| acc.wrapping_add(*x))).wrapping_add(1) +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_generate_checksum() { + let mut buf = [0x00; 8]; + let sum = generate_checksum(&buf); + assert_eq!(sum, 0); + buf[0] = 0xff; + let sum = generate_checksum(&buf); + assert_eq!(sum, 1); + buf[0] = 0xaa; + buf[1] = 0xcc; + buf[4] = generate_checksum(&buf); + let sum = buf.iter().fold(0u8, |s, v| s.wrapping_add(*v)); + assert_eq!(sum, 0); + } +} diff --git a/src/dragonball/src/dbs_acpi/src/rsdp.rs b/src/dragonball/src/dbs_acpi/src/rsdp.rs new file mode 100644 index 000000000..05c36f809 --- /dev/null +++ b/src/dragonball/src/dbs_acpi/src/rsdp.rs @@ 
-0,0 +1,60 @@ +// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +// RSDP (Root System Description Pointer) is a data structure used in the ACPI programming interface. +use vm_memory::ByteValued; + +#[repr(packed)] +#[derive(Clone, Copy, Default)] +pub struct Rsdp { + pub signature: [u8; 8], + pub checksum: u8, + pub oem_id: [u8; 6], + pub revision: u8, + _rsdt_addr: u32, + pub length: u32, + pub xsdt_addr: u64, + pub extended_checksum: u8, + _reserved: [u8; 3], +} + +// SAFETY: Rsdp only contains a series of integers +unsafe impl ByteValued for Rsdp {} + +impl Rsdp { + pub fn new(xsdt_addr: u64) -> Self { + let mut rsdp = Rsdp { + signature: *b"RSD PTR ", + checksum: 0, + oem_id: *b"ALICLD", + revision: 1, + _rsdt_addr: 0, + length: std::mem::size_of::() as u32, + xsdt_addr, + extended_checksum: 0, + _reserved: [0; 3], + }; + rsdp.checksum = super::generate_checksum(&rsdp.as_slice()[0..19]); + rsdp.extended_checksum = super::generate_checksum(rsdp.as_slice()); + rsdp + } + + pub fn len() -> usize { + std::mem::size_of::() + } +} +#[cfg(test)] +mod tests { + use super::Rsdp; + use vm_memory::bytes::ByteValued; + #[test] + fn test_rsdp() { + let rsdp = Rsdp::new(0xa0000); + let sum = rsdp + .as_slice() + .iter() + .fold(0u8, |acc, x| acc.wrapping_add(*x)); + assert_eq!(sum, 0); + } +} diff --git a/src/dragonball/src/dbs_acpi/src/sdt.rs b/src/dragonball/src/dbs_acpi/src/sdt.rs new file mode 100644 index 000000000..f6a79f576 --- /dev/null +++ b/src/dragonball/src/dbs_acpi/src/sdt.rs @@ -0,0 +1,137 @@ +// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2023 Alibaba Cloud +// +// SPDX-License-Identifier: Apache-2.0 +#[repr(packed)] +pub struct GenericAddress { + pub address_space_id: u8, + pub register_bit_width: u8, + pub register_bit_offset: u8, + pub access_size: u8, + pub address: u64, +} + +impl GenericAddress { + pub fn io_port_address(address: u16) -> Self { + GenericAddress { + 
address_space_id: 1, + register_bit_width: 8 * std::mem::size_of::() as u8, + register_bit_offset: 0, + access_size: std::mem::size_of::() as u8, + address: u64::from(address), + } + } + + pub fn mmio_address(address: u64) -> Self { + GenericAddress { + address_space_id: 0, + register_bit_width: 8 * std::mem::size_of::() as u8, + register_bit_offset: 0, + access_size: std::mem::size_of::() as u8, + address, + } + } +} + +pub struct Sdt { + data: Vec, +} + +#[allow(clippy::len_without_is_empty)] +impl Sdt { + pub fn new(signature: [u8; 4], length: u32, revision: u8) -> Self { + assert!(length >= 36); + const OEM_ID: [u8; 6] = *b"ALICLD"; + const OEM_TABLE: [u8; 8] = *b"RUND "; + const CREATOR_ID: [u8; 4] = *b"ALIC"; + let mut data = Vec::with_capacity(length as usize); + data.extend_from_slice(&signature); + data.extend_from_slice(&length.to_le_bytes()); + data.push(revision); + data.push(0); // checksum + data.extend_from_slice(&OEM_ID); // oem id u32 + data.extend_from_slice(&OEM_TABLE); // oem table + data.extend_from_slice(&1u32.to_le_bytes()); // oem revision u32 + data.extend_from_slice(&CREATOR_ID); // creator id u32 + data.extend_from_slice(&1u32.to_le_bytes()); // creator revison u32 + assert_eq!(data.len(), 36); + data.resize(length as usize, 0); + let mut sdt = Sdt { data }; + sdt.update_checksum(); + sdt + } + + pub fn update_checksum(&mut self) { + self.data[9] = 0; + let checksum = super::generate_checksum(self.data.as_slice()); + self.data[9] = checksum + } + + pub fn as_slice(&self) -> &[u8] { + self.data.as_slice() + } + + pub fn append(&mut self, value: T) { + let orig_length = self.data.len(); + let new_length = orig_length + std::mem::size_of::(); + self.data.resize(new_length, 0); + self.write_u32(4, new_length as u32); + self.write(orig_length, value); + } + + pub fn append_slice(&mut self, data: &[u8]) { + let orig_length = self.data.len(); + let new_length = orig_length + data.len(); + self.write_u32(4, new_length as u32); + 
self.data.extend_from_slice(data); + self.update_checksum(); + } + + /// Write a value at the given offset + pub fn write(&mut self, offset: usize, value: T) { + assert!((offset + (std::mem::size_of::() - 1)) < self.data.len()); + unsafe { + *(((self.data.as_mut_ptr() as usize) + offset) as *mut T) = value; + } + self.update_checksum(); + } + + pub fn write_u8(&mut self, offset: usize, val: u8) { + self.write(offset, val); + } + + pub fn write_u16(&mut self, offset: usize, val: u16) { + self.write(offset, val); + } + + pub fn write_u32(&mut self, offset: usize, val: u32) { + self.write(offset, val); + } + + pub fn write_u64(&mut self, offset: usize, val: u64) { + self.write(offset, val); + } + + pub fn len(&self) -> usize { + self.data.len() + } +} +#[cfg(test)] +mod tests { + use super::Sdt; + #[test] + fn test_sdt() { + let mut sdt = Sdt::new(*b"TEST", 40, 1); + let sum: u8 = sdt + .as_slice() + .iter() + .fold(0u8, |acc, x| acc.wrapping_add(*x)); + assert_eq!(sum, 0); + sdt.write_u32(36, 0x12345678); + let sum: u8 = sdt + .as_slice() + .iter() + .fold(0u8, |acc, x| acc.wrapping_add(*x)); + assert_eq!(sum, 0); + } +} diff --git a/src/dragonball/src/dbs_address_space/Cargo.toml b/src/dragonball/src/dbs_address_space/Cargo.toml new file mode 100644 index 000000000..f507fa4dc --- /dev/null +++ b/src/dragonball/src/dbs_address_space/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "dbs-address-space" +version = "0.3.0" +authors = ["Alibaba Dragonball Team"] +description = "address space manager for virtual machines." 
+license = "Apache-2.0" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "address", "vmm", "secure-sandbox"] +readme = "README.md" + +[dependencies] +arc-swap = ">=0.4.8" +libc = "0.2.39" +nix = "0.23.1" +lazy_static = "1" +thiserror = "1" +vmm-sys-util = "0.11.0" +vm-memory = { version = "0.9", features = ["backend-mmap", "backend-atomic"] } diff --git a/src/dragonball/src/dbs_address_space/LICENSE b/src/dragonball/src/dbs_address_space/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_address_space/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_address_space/README.md b/src/dragonball/src/dbs_address_space/README.md new file mode 100644 index 000000000..e3ea81d4c --- /dev/null +++ b/src/dragonball/src/dbs_address_space/README.md @@ -0,0 +1,80 @@ +# dbs-address-space + +## Design + +The `dbs-address-space` crate is an address space manager for virtual machines, which manages memory and MMIO resources resident in the guest physical address space. + +Main components are: +- `AddressSpaceRegion`: Struct to maintain configuration information about a guest address region. +```rust +#[derive(Debug, Clone)] +pub struct AddressSpaceRegion { + /// Type of address space regions. + pub ty: AddressSpaceRegionType, + /// Base address of the region in virtual machine's physical address space. + pub base: GuestAddress, + /// Size of the address space region. + pub size: GuestUsize, + /// Host NUMA node ids assigned to this region. + pub host_numa_node_id: Option, + + /// File/offset tuple to back the memory allocation. + file_offset: Option, + /// Mmap permission flags. + perm_flags: i32, + /// Hugepage madvise hint. + /// + /// It needs 'advise' or 'always' policy in host shmem config. + is_hugepage: bool, + /// Hotplug hint. 
+ is_hotplug: bool, + /// Anonymous memory hint. + /// + /// It should be true for regions with the MADV_DONTFORK flag enabled. + is_anon: bool, +} +``` +- `AddressSpaceBase`: Base implementation to manage guest physical address space, without support of region hotplug. +```rust +#[derive(Clone)] +pub struct AddressSpaceBase { + regions: Vec>, + layout: AddressSpaceLayout, +} +``` +- `AddressSpaceBase`: An address space implementation with region hotplug capability. +```rust +/// The `AddressSpace` is a wrapper over [AddressSpaceBase] to support hotplug of +/// address space regions. +#[derive(Clone)] +pub struct AddressSpace { + state: Arc>, +} +``` + +## Usage +```rust +// 1. create several memory regions +let reg = Arc::new( + AddressSpaceRegion::create_default_memory_region( + GuestAddress(0x100000), + 0x100000, + None, + "shmem", + "", + false, + false, + false, + ) + .unwrap() +); +let regions = vec![reg]; +// 2. create layout (depending on archs) +let layout = AddressSpaceLayout::new(GUEST_PHYS_END, GUEST_MEM_START, GUEST_MEM_END); +// 3. create address space from regions and layout +let address_space = AddressSpace::from_regions(regions, layout.clone()); +``` + +## License + +This project is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0. diff --git a/src/dragonball/src/dbs_address_space/src/address_space.rs b/src/dragonball/src/dbs_address_space/src/address_space.rs new file mode 100644 index 000000000..35bfab66d --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/address_space.rs @@ -0,0 +1,830 @@ +// Copyright (C) 2021 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Physical address space manager for virtual machines. 
+ +use std::sync::Arc; + +use arc_swap::ArcSwap; +use vm_memory::{GuestAddress, GuestMemoryMmap}; + +use crate::{AddressSpaceError, AddressSpaceLayout, AddressSpaceRegion, AddressSpaceRegionType}; + +/// Base implementation to manage guest physical address space, without support of region hotplug. +#[derive(Clone)] +pub struct AddressSpaceBase { + regions: Vec>, + layout: AddressSpaceLayout, +} + +impl AddressSpaceBase { + /// Create an instance of `AddressSpaceBase` from an `AddressSpaceRegion` array. + /// + /// To achieve better performance by using binary search algorithm, the `regions` vector + /// will gotten sorted by guest physical address. + /// + /// Note, panicking if some regions intersects with each other. + /// + /// # Arguments + /// * `regions` - prepared regions to managed by the address space instance. + /// * `layout` - prepared address space layout configuration. + pub fn from_regions( + mut regions: Vec>, + layout: AddressSpaceLayout, + ) -> Self { + regions.sort_unstable_by_key(|v| v.base); + for region in regions.iter() { + if !layout.is_region_valid(region) { + panic!( + "Invalid region {:?} for address space layout {:?}", + region, layout + ); + } + } + for idx in 1..regions.len() { + if regions[idx].intersect_with(®ions[idx - 1]) { + panic!("address space regions intersect with each other"); + } + } + AddressSpaceBase { regions, layout } + } + + /// Insert a new address space region into the address space. + /// + /// # Arguments + /// * `region` - the new region to be inserted. 
+ pub fn insert_region( + &mut self, + region: Arc, + ) -> Result<(), AddressSpaceError> { + if !self.layout.is_region_valid(®ion) { + return Err(AddressSpaceError::InvalidAddressRange( + region.start_addr().0, + region.len(), + )); + } + for idx in 0..self.regions.len() { + if self.regions[idx].intersect_with(®ion) { + return Err(AddressSpaceError::InvalidAddressRange( + region.start_addr().0, + region.len(), + )); + } + } + self.regions.push(region); + Ok(()) + } + + /// Enumerate all regions in the address space. + /// + /// # Arguments + /// * `cb` - the callback function to apply to each region. + pub fn walk_regions(&self, mut cb: F) -> Result<(), AddressSpaceError> + where + F: FnMut(&Arc) -> Result<(), AddressSpaceError>, + { + for reg in self.regions.iter() { + cb(reg)?; + } + + Ok(()) + } + + /// Get address space layout associated with the address space. + pub fn layout(&self) -> AddressSpaceLayout { + self.layout.clone() + } + + /// Get maximum of guest physical address in the address space. + pub fn last_addr(&self) -> GuestAddress { + let mut last_addr = GuestAddress(self.layout.mem_start); + for reg in self.regions.iter() { + if reg.ty != AddressSpaceRegionType::DAXMemory && reg.last_addr() > last_addr { + last_addr = reg.last_addr(); + } + } + last_addr + } + + /// Check whether the guest physical address `guest_addr` belongs to a DAX memory region. + /// + /// # Arguments + /// * `guest_addr` - the guest physical address to inquire + pub fn is_dax_region(&self, guest_addr: GuestAddress) -> bool { + for reg in self.regions.iter() { + // Safe because we have validate the region when creating the address space object. + if reg.region_type() == AddressSpaceRegionType::DAXMemory + && reg.start_addr() <= guest_addr + && reg.start_addr().0 + reg.len() > guest_addr.0 + { + return true; + } + } + false + } + + /// Get protection flags of memory region that guest physical address `guest_addr` belongs to. 
+ /// + /// # Arguments + /// * `guest_addr` - the guest physical address to inquire + pub fn prot_flags(&self, guest_addr: GuestAddress) -> Result { + for reg in self.regions.iter() { + if reg.start_addr() <= guest_addr && reg.start_addr().0 + reg.len() > guest_addr.0 { + return Ok(reg.prot_flags()); + } + } + + Err(AddressSpaceError::InvalidRegionType) + } + + /// Get optional NUMA node id associated with guest physical address `gpa`. + /// + /// # Arguments + /// * `gpa` - guest physical address to query. + pub fn numa_node_id(&self, gpa: u64) -> Option { + for reg in self.regions.iter() { + if gpa >= reg.base.0 && gpa < (reg.base.0 + reg.size) { + return reg.host_numa_node_id; + } + } + None + } +} + +/// An address space implementation with region hotplug capability. +/// +/// The `AddressSpace` is a wrapper over [AddressSpaceBase] to support hotplug of +/// address space regions. +#[derive(Clone)] +pub struct AddressSpace { + state: Arc>, +} + +impl AddressSpace { + /// Convert a [GuestMemoryMmap] object into `GuestMemoryAtomic`. + pub fn convert_into_vm_as( + gm: GuestMemoryMmap, + ) -> vm_memory::atomic::GuestMemoryAtomic { + vm_memory::atomic::GuestMemoryAtomic::from(Arc::new(gm)) + } + + /// Create an instance of `AddressSpace` from an `AddressSpaceRegion` array. + /// + /// To achieve better performance by using binary search algorithm, the `regions` vector + /// will gotten sorted by guest physical address. + /// + /// Note, panicking if some regions intersects with each other. + /// + /// # Arguments + /// * `regions` - prepared regions to managed by the address space instance. + /// * `layout` - prepared address space layout configuration. + pub fn from_regions(regions: Vec>, layout: AddressSpaceLayout) -> Self { + let base = AddressSpaceBase::from_regions(regions, layout); + + AddressSpace { + state: Arc::new(ArcSwap::new(Arc::new(base))), + } + } + + /// Insert a new address space region into the address space. 
+ /// + /// # Arguments + /// * `region` - the new region to be inserted. + pub fn insert_region( + &mut self, + region: Arc, + ) -> Result<(), AddressSpaceError> { + let curr = self.state.load().regions.clone(); + let layout = self.state.load().layout.clone(); + let mut base = AddressSpaceBase::from_regions(curr, layout); + base.insert_region(region)?; + let _old = self.state.swap(Arc::new(base)); + + Ok(()) + } + + /// Enumerate all regions in the address space. + /// + /// # Arguments + /// * `cb` - the callback function to apply to each region. + pub fn walk_regions(&self, cb: F) -> Result<(), AddressSpaceError> + where + F: FnMut(&Arc) -> Result<(), AddressSpaceError>, + { + self.state.load().walk_regions(cb) + } + + /// Get address space layout associated with the address space. + pub fn layout(&self) -> AddressSpaceLayout { + self.state.load().layout() + } + + /// Get maximum of guest physical address in the address space. + pub fn last_addr(&self) -> GuestAddress { + self.state.load().last_addr() + } + + /// Check whether the guest physical address `guest_addr` belongs to a DAX memory region. + /// + /// # Arguments + /// * `guest_addr` - the guest physical address to inquire + pub fn is_dax_region(&self, guest_addr: GuestAddress) -> bool { + self.state.load().is_dax_region(guest_addr) + } + + /// Get protection flags of memory region that guest physical address `guest_addr` belongs to. + /// + /// # Arguments + /// * `guest_addr` - the guest physical address to inquire + pub fn prot_flags(&self, guest_addr: GuestAddress) -> Result { + self.state.load().prot_flags(guest_addr) + } + + /// Get optional NUMA node id associated with guest physical address `gpa`. + /// + /// # Arguments + /// * `gpa` - guest physical address to query. 
+ pub fn numa_node_id(&self, gpa: u64) -> Option { + self.state.load().numa_node_id(gpa) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use vm_memory::GuestUsize; + use vmm_sys_util::tempfile::TempFile; + + // define macros for unit test + const GUEST_PHYS_END: u64 = (1 << 46) - 1; + const GUEST_MEM_START: u64 = 0; + const GUEST_MEM_END: u64 = GUEST_PHYS_END >> 1; + const GUEST_DEVICE_START: u64 = GUEST_MEM_END + 1; + + #[test] + fn test_address_space_base_from_regions() { + let mut file = TempFile::new().unwrap().into_file(); + let sample_buf = &[1, 2, 3, 4, 5]; + assert!(file.write_all(sample_buf).is_ok()); + file.set_len(0x10000).unwrap(); + + let reg = Arc::new( + AddressSpaceRegion::create_device_region(GuestAddress(GUEST_DEVICE_START), 0x1000) + .unwrap(), + ); + let regions = vec![reg]; + let layout = AddressSpaceLayout::new(GUEST_PHYS_END, GUEST_MEM_START, GUEST_MEM_END); + let address_space = AddressSpaceBase::from_regions(regions, layout.clone()); + assert_eq!(address_space.layout(), layout); + } + + #[test] + #[should_panic(expected = "Invalid region")] + fn test_address_space_base_from_regions_when_region_invalid() { + let reg = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x1000, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg]; + let layout = AddressSpaceLayout::new(0x2000, 0x200, 0x1800); + let _address_space = AddressSpaceBase::from_regions(regions, layout); + } + + #[test] + #[should_panic(expected = "address space regions intersect with each other")] + fn test_address_space_base_from_regions_when_region_intersected() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x200), + 0x200, + None, + None, + 0, + 0, + false, + )); + let 
regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let _address_space = AddressSpaceBase::from_regions(regions, layout); + } + + #[test] + fn test_address_space_base_insert_region() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x200, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1]; + let layout = AddressSpaceLayout::new(0x2000, 0x100, 0x1800); + let mut address_space = AddressSpaceBase::from_regions(regions, layout); + + // Normal case. + address_space.insert_region(reg2).unwrap(); + assert!(!address_space.regions[1].intersect_with(&address_space.regions[0])); + + // Error invalid address range case when region invaled. + let invalid_reg = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x0), + 0x100, + None, + None, + 0, + 0, + false, + )); + assert_eq!( + format!( + "{:?}", + address_space.insert_region(invalid_reg).err().unwrap() + ), + format!("InvalidAddressRange({:?}, {:?})", 0x0, 0x100) + ); + + // Error Error invalid address range case when region to be inserted will intersect + // exsisting regions. 
+ let intersected_reg = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x400), + 0x200, + None, + None, + 0, + 0, + false, + )); + assert_eq!( + format!( + "{:?}", + address_space.insert_region(intersected_reg).err().unwrap() + ), + format!("InvalidAddressRange({:?}, {:?})", 0x400, 0x200) + ); + } + + #[test] + fn test_address_space_base_walk_regions() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x200, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpaceBase::from_regions(regions, layout); + + // The argument of walk_regions is a function which takes a &Arc + // and returns result. This function will be applied to all regions. + fn do_not_have_hotplug(region: &Arc) -> Result<(), AddressSpaceError> { + if region.is_hotplug() { + Err(AddressSpaceError::InvalidRegionType) // The Error type is dictated to AddressSpaceError. 
+ } else { + Ok(()) + } + } + assert!(matches!( + address_space.walk_regions(do_not_have_hotplug).unwrap(), + () + )); + } + + #[test] + fn test_address_space_base_last_addr() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x200, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpaceBase::from_regions(regions, layout); + + assert_eq!(address_space.last_addr(), GuestAddress(0x500 - 1)); + } + + #[test] + fn test_address_space_base_is_dax_region() { + let page_size = 4096; + let address_space_region = vec![ + Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(page_size), + page_size as GuestUsize, + )), + Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(page_size * 2), + page_size as GuestUsize, + )), + Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DAXMemory, + GuestAddress(GUEST_DEVICE_START), + page_size as GuestUsize, + )), + ]; + let layout = AddressSpaceLayout::new(GUEST_PHYS_END, GUEST_MEM_START, GUEST_MEM_END); + let address_space = AddressSpaceBase::from_regions(address_space_region, layout); + + assert!(!address_space.is_dax_region(GuestAddress(page_size))); + assert!(!address_space.is_dax_region(GuestAddress(page_size * 2))); + assert!(address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START))); + assert!(address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START + 1))); + assert!(!address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START + page_size))); + assert!(address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START + page_size - 1))); + } + + #[test] + fn test_address_space_base_prot_flags() { + let reg1 = 
Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + Some(0), + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x300, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpaceBase::from_regions(regions, layout); + + // Normal case, reg1. + assert_eq!(address_space.prot_flags(GuestAddress(0x200)).unwrap(), 0); + // Normal case, reg2. + assert_eq!( + address_space.prot_flags(GuestAddress(0x500)).unwrap(), + libc::PROT_READ | libc::PROT_WRITE + ); + // Inquire gpa where no region is set. + assert!(matches!( + address_space.prot_flags(GuestAddress(0x600)), + Err(AddressSpaceError::InvalidRegionType) + )); + } + + #[test] + fn test_address_space_base_numa_node_id() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + Some(0), + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x300, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpaceBase::from_regions(regions, layout); + + // Normal case. + assert_eq!(address_space.numa_node_id(0x200).unwrap(), 0); + // Inquire region with None as its numa node id. + assert_eq!(address_space.numa_node_id(0x400), None); + // Inquire gpa where no region is set. + assert_eq!(address_space.numa_node_id(0x600), None); + } + + #[test] + fn test_address_space_convert_into_vm_as() { + // ! Further and detailed test is needed here. 
+ let gmm = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0x0), 0x400)]).unwrap(); + let _vm = AddressSpace::convert_into_vm_as(gmm); + } + + #[test] + fn test_address_space_insert_region() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x200, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1]; + let layout = AddressSpaceLayout::new(0x2000, 0x100, 0x1800); + let mut address_space = AddressSpace::from_regions(regions, layout); + + // Normal case. + assert!(matches!(address_space.insert_region(reg2).unwrap(), ())); + + // Error invalid address range case when region invaled. + let invalid_reg = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x0), + 0x100, + None, + None, + 0, + 0, + false, + )); + assert_eq!( + format!( + "{:?}", + address_space.insert_region(invalid_reg).err().unwrap() + ), + format!("InvalidAddressRange({:?}, {:?})", 0x0, 0x100) + ); + + // Error Error invalid address range case when region to be inserted will intersect + // exsisting regions. 
+ let intersected_reg = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x400), + 0x200, + None, + None, + 0, + 0, + false, + )); + assert_eq!( + format!( + "{:?}", + address_space.insert_region(intersected_reg).err().unwrap() + ), + format!("InvalidAddressRange({:?}, {:?})", 0x400, 0x200) + ); + } + + #[test] + fn test_address_space_walk_regions() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x200, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpace::from_regions(regions, layout); + + fn access_all_hotplug_flag( + region: &Arc, + ) -> Result<(), AddressSpaceError> { + region.is_hotplug(); + Ok(()) + } + + assert!(matches!( + address_space.walk_regions(access_all_hotplug_flag).unwrap(), + () + )); + } + + #[test] + fn test_address_space_layout() { + let reg = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x1000, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpace::from_regions(regions, layout.clone()); + + assert_eq!(layout, address_space.layout()); + } + + #[test] + fn test_address_space_last_addr() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + None, + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x200, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 
0x1800); + let address_space = AddressSpace::from_regions(regions, layout); + + assert_eq!(address_space.last_addr(), GuestAddress(0x500 - 1)); + } + + #[test] + fn test_address_space_is_dax_region() { + let page_size = 4096; + let address_space_region = vec![ + Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(page_size), + page_size as GuestUsize, + )), + Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(page_size * 2), + page_size as GuestUsize, + )), + Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DAXMemory, + GuestAddress(GUEST_DEVICE_START), + page_size as GuestUsize, + )), + ]; + let layout = AddressSpaceLayout::new(GUEST_PHYS_END, GUEST_MEM_START, GUEST_MEM_END); + let address_space = AddressSpace::from_regions(address_space_region, layout); + + assert!(!address_space.is_dax_region(GuestAddress(page_size))); + assert!(!address_space.is_dax_region(GuestAddress(page_size * 2))); + assert!(address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START))); + assert!(address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START + 1))); + assert!(!address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START + page_size))); + assert!(address_space.is_dax_region(GuestAddress(GUEST_DEVICE_START + page_size - 1))); + } + + #[test] + fn test_address_space_prot_flags() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + Some(0), + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x300, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpace::from_regions(regions, layout); + + // Normal case, reg1. + assert_eq!(address_space.prot_flags(GuestAddress(0x200)).unwrap(), 0); + // Normal case, reg2. 
+ assert_eq!( + address_space.prot_flags(GuestAddress(0x500)).unwrap(), + libc::PROT_READ | libc::PROT_WRITE + ); + // Inquire gpa where no region is set. + assert!(matches!( + address_space.prot_flags(GuestAddress(0x600)), + Err(AddressSpaceError::InvalidRegionType) + )); + } + + #[test] + fn test_address_space_numa_node_id() { + let reg1 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x100), + 0x200, + Some(0), + None, + 0, + 0, + false, + )); + let reg2 = Arc::new(AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x300), + 0x300, + None, + None, + 0, + 0, + false, + )); + let regions = vec![reg1, reg2]; + let layout = AddressSpaceLayout::new(0x2000, 0x0, 0x1800); + let address_space = AddressSpace::from_regions(regions, layout); + + // Normal case. + assert_eq!(address_space.numa_node_id(0x200).unwrap(), 0); + // Inquire region with None as its numa node id. + assert_eq!(address_space.numa_node_id(0x400), None); + // Inquire gpa where no region is set. + assert_eq!(address_space.numa_node_id(0x600), None); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/layout.rs b/src/dragonball/src/dbs_address_space/src/layout.rs new file mode 100644 index 000000000..cd6c6bfb0 --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/layout.rs @@ -0,0 +1,154 @@ +// Copyright (C) 2021 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use lazy_static::lazy_static; + +use crate::{AddressSpaceRegion, AddressSpaceRegionType}; + +// Max retry times for reading /proc +const PROC_READ_RETRY: u64 = 5; + +lazy_static! { + /// Upper bound of host memory. 
+ pub static ref USABLE_END: u64 = { + for _ in 0..PROC_READ_RETRY { + if let Ok(buf) = std::fs::read("/proc/meminfo") { + let content = String::from_utf8_lossy(&buf); + for line in content.lines() { + if line.starts_with("MemTotal:") { + if let Some(end) = line.find(" kB") { + if let Ok(size) = line[9..end].trim().parse::() { + return (size << 10) - 1; + } + } + } + } + } + } + panic!("Exceed max retry times. Cannot get total mem size from /proc/meminfo"); + }; +} + +/// Address space layout configuration. +/// +/// The layout configuration must guarantee that `mem_start` <= `mem_end` <= `phys_end`. +/// Non-memory region should be arranged into the range [mem_end, phys_end). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct AddressSpaceLayout { + /// end of guest physical address + pub phys_end: u64, + /// start of guest memory address + pub mem_start: u64, + /// end of guest memory address + pub mem_end: u64, + /// end of usable memory address + pub usable_end: u64, +} + +impl AddressSpaceLayout { + /// Create a new instance of `AddressSpaceLayout`. + pub fn new(phys_end: u64, mem_start: u64, mem_end: u64) -> Self { + AddressSpaceLayout { + phys_end, + mem_start, + mem_end, + usable_end: *USABLE_END, + } + } + + /// Check whether an region is valid with the constraints of the layout. 
+ pub fn is_region_valid(&self, region: &AddressSpaceRegion) -> bool { + let region_end = match region.base.0.checked_add(region.size) { + None => return false, + Some(v) => v, + }; + + match region.ty { + AddressSpaceRegionType::DefaultMemory => { + if region.base.0 < self.mem_start || region_end > self.mem_end { + return false; + } + } + AddressSpaceRegionType::DeviceMemory | AddressSpaceRegionType::DAXMemory => { + if region.base.0 < self.mem_end || region_end > self.phys_end { + return false; + } + } + } + + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use vm_memory::GuestAddress; + + #[test] + fn test_is_region_valid() { + let layout = AddressSpaceLayout::new(0x1_0000_0000, 0x1000_0000, 0x2000_0000); + + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x0), + 0x1_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x2000_0000), + 0x1_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x1_0000), + 0x2000_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(u64::MAX), + 0x1_0000_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x1000_0000), + 0x1_0000, + ); + assert!(layout.is_region_valid(®ion)); + + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DeviceMemory, + GuestAddress(0x1000_0000), + 0x1_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DeviceMemory, + GuestAddress(0x1_0000_0000), + 0x1_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DeviceMemory, + 
GuestAddress(0x1_0000), + 0x1_0000_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DeviceMemory, + GuestAddress(u64::MAX), + 0x1_0000_0000, + ); + assert!(!layout.is_region_valid(®ion)); + let region = AddressSpaceRegion::new( + AddressSpaceRegionType::DeviceMemory, + GuestAddress(0x8000_0000), + 0x1_0000, + ); + assert!(layout.is_region_valid(®ion)); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/lib.rs b/src/dragonball/src/dbs_address_space/src/lib.rs new file mode 100644 index 000000000..7e38cbbdd --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/lib.rs @@ -0,0 +1,87 @@ +// Copyright (C) 2021 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(missing_docs)] + +//! Traits and Structs to manage guest physical address space for virtual machines. +//! +//! The [vm-memory](https://crates.io/crates/vm-memory) implements mechanisms to manage and access +//! guest memory resident in guest physical address space. In addition to guest memory, there may +//! be other type of devices resident in the same guest physical address space. +//! +//! The `dbs-address-space` crate provides traits and structs to manage the guest physical address +//! space for virtual machines, and mechanisms to coordinate all the devices resident in the +//! guest physical address space. + +use vm_memory::GuestUsize; + +mod address_space; +pub use self::address_space::{AddressSpace, AddressSpaceBase}; + +mod layout; +pub use layout::{AddressSpaceLayout, USABLE_END}; + +mod memory; +pub use memory::{GuestMemoryHybrid, GuestMemoryManager, GuestRegionHybrid, GuestRegionRaw}; + +mod numa; +pub use self::numa::{NumaIdTable, NumaNode, NumaNodeInfo, MPOL_MF_MOVE, MPOL_PREFERRED}; + +mod region; +pub use region::{AddressSpaceRegion, AddressSpaceRegionType}; + +/// Errors associated with virtual machine address space management. 
+#[derive(Debug, thiserror::Error)] +pub enum AddressSpaceError { + /// Invalid address space region type. + #[error("invalid address space region type")] + InvalidRegionType, + + /// Invalid address range. + #[error("invalid address space region (0x{0:x}, 0x{1:x})")] + InvalidAddressRange(u64, GuestUsize), + + /// Invalid guest memory source type. + #[error("invalid memory source type {0}")] + InvalidMemorySourceType(String), + + /// Failed to create memfd to map anonymous memory. + #[error("can not create memfd to map anonymous memory")] + CreateMemFd(#[source] nix::Error), + + /// Failed to open memory file. + #[error("can not open memory file")] + OpenFile(#[source] std::io::Error), + + /// Failed to create directory. + #[error("can not create directory")] + CreateDir(#[source] std::io::Error), + + /// Failed to set size for memory file. + #[error("can not set size for memory file")] + SetFileSize(#[source] std::io::Error), + + /// Failed to unlink memory file. + #[error("can not unlink memory file")] + UnlinkFile(#[source] nix::Error), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_error_code() { + let e = AddressSpaceError::InvalidRegionType; + + assert_eq!(format!("{e}"), "invalid address space region type"); + assert_eq!(format!("{e:?}"), "InvalidRegionType"); + assert_eq!( + format!( + "{}", + AddressSpaceError::InvalidMemorySourceType("test".to_string()) + ), + "invalid memory source type test" + ); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/memory/hybrid.rs b/src/dragonball/src/dbs_address_space/src/memory/hybrid.rs new file mode 100644 index 000000000..87a09749e --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/memory/hybrid.rs @@ -0,0 +1,1105 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::io::{Read, Write}; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use vm_memory::bitmap::{Bitmap, BS}; +use vm_memory::guest_memory::GuestMemoryIterator; +use vm_memory::mmap::{Error, NewBitmap}; +use vm_memory::{ + guest_memory, AtomicAccess, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, + GuestRegionMmap, GuestUsize, MemoryRegionAddress, VolatileSlice, +}; + +use crate::GuestRegionRaw; + +/// An adapter for different concrete implementations of `GuestMemoryRegion`. +#[derive(Debug)] +pub enum GuestRegionHybrid { + /// Region of type `GuestRegionMmap`. + Mmap(GuestRegionMmap), + /// Region of type `GuestRegionRaw`. + Raw(GuestRegionRaw), +} + +impl GuestRegionHybrid { + /// Create a `GuestRegionHybrid` object from `GuestRegionMmap` object. + pub fn from_mmap_region(region: GuestRegionMmap) -> Self { + GuestRegionHybrid::Mmap(region) + } + + /// Create a `GuestRegionHybrid` object from `GuestRegionRaw` object. 
+ pub fn from_raw_region(region: GuestRegionRaw) -> Self { + GuestRegionHybrid::Raw(region) + } +} + +impl Bytes for GuestRegionHybrid { + type E = guest_memory::Error; + + fn write(&self, buf: &[u8], addr: MemoryRegionAddress) -> guest_memory::Result { + match self { + GuestRegionHybrid::Mmap(region) => region.write(buf, addr), + GuestRegionHybrid::Raw(region) => region.write(buf, addr), + } + } + + fn read(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> guest_memory::Result { + match self { + GuestRegionHybrid::Mmap(region) => region.read(buf, addr), + GuestRegionHybrid::Raw(region) => region.read(buf, addr), + } + } + + fn write_slice(&self, buf: &[u8], addr: MemoryRegionAddress) -> guest_memory::Result<()> { + match self { + GuestRegionHybrid::Mmap(region) => region.write_slice(buf, addr), + GuestRegionHybrid::Raw(region) => region.write_slice(buf, addr), + } + } + + fn read_slice(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> guest_memory::Result<()> { + match self { + GuestRegionHybrid::Mmap(region) => region.read_slice(buf, addr), + GuestRegionHybrid::Raw(region) => region.read_slice(buf, addr), + } + } + + fn read_from( + &self, + addr: MemoryRegionAddress, + src: &mut F, + count: usize, + ) -> guest_memory::Result + where + F: Read, + { + match self { + GuestRegionHybrid::Mmap(region) => region.read_from(addr, src, count), + GuestRegionHybrid::Raw(region) => region.read_from(addr, src, count), + } + } + + fn read_exact_from( + &self, + addr: MemoryRegionAddress, + src: &mut F, + count: usize, + ) -> guest_memory::Result<()> + where + F: Read, + { + match self { + GuestRegionHybrid::Mmap(region) => region.read_exact_from(addr, src, count), + GuestRegionHybrid::Raw(region) => region.read_exact_from(addr, src, count), + } + } + + fn write_to( + &self, + addr: MemoryRegionAddress, + dst: &mut F, + count: usize, + ) -> guest_memory::Result + where + F: Write, + { + match self { + GuestRegionHybrid::Mmap(region) => region.write_to(addr, dst, count), + 
GuestRegionHybrid::Raw(region) => region.write_to(addr, dst, count), + } + } + + fn write_all_to( + &self, + addr: MemoryRegionAddress, + dst: &mut F, + count: usize, + ) -> guest_memory::Result<()> + where + F: Write, + { + match self { + GuestRegionHybrid::Mmap(region) => region.write_all_to(addr, dst, count), + GuestRegionHybrid::Raw(region) => region.write_all_to(addr, dst, count), + } + } + + fn store( + &self, + val: T, + addr: MemoryRegionAddress, + order: Ordering, + ) -> guest_memory::Result<()> { + match self { + GuestRegionHybrid::Mmap(region) => region.store(val, addr, order), + GuestRegionHybrid::Raw(region) => region.store(val, addr, order), + } + } + + fn load( + &self, + addr: MemoryRegionAddress, + order: Ordering, + ) -> guest_memory::Result { + match self { + GuestRegionHybrid::Mmap(region) => region.load(addr, order), + GuestRegionHybrid::Raw(region) => region.load(addr, order), + } + } +} + +impl GuestMemoryRegion for GuestRegionHybrid { + type B = B; + + fn len(&self) -> GuestUsize { + match self { + GuestRegionHybrid::Mmap(region) => region.len(), + GuestRegionHybrid::Raw(region) => region.len(), + } + } + + fn start_addr(&self) -> GuestAddress { + match self { + GuestRegionHybrid::Mmap(region) => region.start_addr(), + GuestRegionHybrid::Raw(region) => region.start_addr(), + } + } + + fn bitmap(&self) -> &Self::B { + match self { + GuestRegionHybrid::Mmap(region) => region.bitmap(), + GuestRegionHybrid::Raw(region) => region.bitmap(), + } + } + + fn get_host_address(&self, addr: MemoryRegionAddress) -> guest_memory::Result<*mut u8> { + match self { + GuestRegionHybrid::Mmap(region) => region.get_host_address(addr), + GuestRegionHybrid::Raw(region) => region.get_host_address(addr), + } + } + + fn file_offset(&self) -> Option<&FileOffset> { + match self { + GuestRegionHybrid::Mmap(region) => region.file_offset(), + GuestRegionHybrid::Raw(region) => region.file_offset(), + } + } + + unsafe fn as_slice(&self) -> Option<&[u8]> { + match self { + 
GuestRegionHybrid::Mmap(region) => region.as_slice(), + GuestRegionHybrid::Raw(region) => region.as_slice(), + } + } + + unsafe fn as_mut_slice(&self) -> Option<&mut [u8]> { + match self { + GuestRegionHybrid::Mmap(region) => region.as_mut_slice(), + GuestRegionHybrid::Raw(region) => region.as_mut_slice(), + } + } + + fn get_slice( + &self, + offset: MemoryRegionAddress, + count: usize, + ) -> guest_memory::Result>> { + match self { + GuestRegionHybrid::Mmap(region) => region.get_slice(offset, count), + GuestRegionHybrid::Raw(region) => region.get_slice(offset, count), + } + } + + #[cfg(target_os = "linux")] + fn is_hugetlbfs(&self) -> Option { + match self { + GuestRegionHybrid::Mmap(region) => region.is_hugetlbfs(), + GuestRegionHybrid::Raw(region) => region.is_hugetlbfs(), + } + } +} + +/// [`GuestMemory`](trait.GuestMemory.html) implementation that manage hybrid types of guest memory +/// regions. +/// +/// Represents the entire physical memory of the guest by tracking all its memory regions. +/// Each region is an instance of `GuestRegionHybrid`. +#[derive(Clone, Debug, Default)] +pub struct GuestMemoryHybrid { + pub(crate) regions: Vec>>, +} + +impl GuestMemoryHybrid { + /// Creates an empty `GuestMemoryHybrid` instance. + pub fn new() -> Self { + Self::default() + } +} + +impl GuestMemoryHybrid { + /// Creates a new `GuestMemoryHybrid` from a vector of regions. + /// + /// # Arguments + /// + /// * `regions` - The vector of regions. + /// The regions shouldn't overlap and they should be sorted + /// by the starting address. + pub fn from_regions(mut regions: Vec>) -> Result { + Self::from_arc_regions(regions.drain(..).map(Arc::new).collect()) + } + + /// Creates a new `GuestMemoryHybrid` from a vector of Arc regions. + /// + /// Similar to the constructor `from_regions()` as it returns a + /// `GuestMemoryHybrid`. 
The need for this constructor is to provide a way for + /// consumer of this API to create a new `GuestMemoryHybrid` based on existing + /// regions coming from an existing `GuestMemoryHybrid` instance. + /// + /// # Arguments + /// + /// * `regions` - The vector of `Arc` regions. + /// The regions shouldn't overlap and they should be sorted + /// by the starting address. + pub fn from_arc_regions(regions: Vec>>) -> Result { + if regions.is_empty() { + return Err(Error::NoMemoryRegion); + } + + for window in regions.windows(2) { + let prev = &window[0]; + let next = &window[1]; + + if prev.start_addr() > next.start_addr() { + return Err(Error::UnsortedMemoryRegions); + } + + if prev.last_addr() >= next.start_addr() { + return Err(Error::MemoryRegionOverlap); + } + } + + Ok(Self { regions }) + } + + /// Insert a region into the `GuestMemoryHybrid` object and return a new `GuestMemoryHybrid`. + /// + /// # Arguments + /// * `region`: the memory region to insert into the guest memory object. + pub fn insert_region( + &self, + region: Arc>, + ) -> Result, Error> { + let mut regions = self.regions.clone(); + regions.push(region); + regions.sort_by_key(|x| x.start_addr()); + + Self::from_arc_regions(regions) + } + + /// Remove a region into the `GuestMemoryHybrid` object and return a new `GuestMemoryHybrid` + /// on success, together with the removed region. 
+ /// + /// # Arguments + /// * `base`: base address of the region to be removed + /// * `size`: size of the region to be removed + pub fn remove_region( + &self, + base: GuestAddress, + size: GuestUsize, + ) -> Result<(GuestMemoryHybrid, Arc>), Error> { + if let Ok(region_index) = self.regions.binary_search_by_key(&base, |x| x.start_addr()) { + if self.regions.get(region_index).unwrap().len() as GuestUsize == size { + let mut regions = self.regions.clone(); + let region = regions.remove(region_index); + return Ok((Self { regions }, region)); + } + } + + Err(Error::InvalidGuestRegion) + } +} + +/// An iterator over the elements of `GuestMemoryHybrid`. +/// +/// This struct is created by `GuestMemory::iter()`. See its documentation for more. +pub struct Iter<'a, B>(std::slice::Iter<'a, Arc>>); + +impl<'a, B> Iterator for Iter<'a, B> { + type Item = &'a GuestRegionHybrid; + + fn next(&mut self) -> Option { + self.0.next().map(AsRef::as_ref) + } +} + +impl<'a, B: 'a> GuestMemoryIterator<'a, GuestRegionHybrid> for GuestMemoryHybrid { + type Iter = Iter<'a, B>; +} + +impl GuestMemory for GuestMemoryHybrid { + type R = GuestRegionHybrid; + + type I = Self; + + fn num_regions(&self) -> usize { + self.regions.len() + } + + fn find_region(&self, addr: GuestAddress) -> Option<&GuestRegionHybrid> { + let index = match self.regions.binary_search_by_key(&addr, |x| x.start_addr()) { + Ok(x) => Some(x), + // Within the closest region with starting address < addr + Err(x) if (x > 0 && addr <= self.regions[x - 1].last_addr()) => Some(x - 1), + _ => None, + }; + index.map(|x| self.regions[x].as_ref()) + } + + fn iter(&self) -> Iter { + Iter(self.regions.iter()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Seek; + use vm_memory::{GuestMemoryError, MmapRegion}; + use vmm_sys_util::tempfile::TempFile; + + #[test] + fn test_region_new() { + let start_addr = GuestAddress(0x0); + + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x400).unwrap(), 
start_addr).unwrap(); + let guest_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + + assert_eq!(guest_region.start_addr(), start_addr); + assert_eq!(guest_region.len(), 0x400); + + let mut buf = [0u8; 1024]; + let raw_region = + unsafe { GuestRegionRaw::<()>::new(start_addr, &mut buf as *mut _, 0x800) }; + let guest_region = GuestRegionHybrid::from_raw_region(raw_region); + + assert_eq!(guest_region.start_addr(), start_addr); + assert_eq!(guest_region.len(), 0x800); + } + + #[test] + fn test_write_and_read_on_mmap_region() { + let start_addr = GuestAddress(0x0); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + let buf_to_write = [0xF0u8; 0x400]; + let write_addr = MemoryRegionAddress(0x400); + + // Normal case. + let number_of_bytes_write = guest_region.write(&buf_to_write, write_addr).unwrap(); + assert_eq!(number_of_bytes_write, 0x400); + let mut buf_read = [0u8; 0x400]; + let number_of_bytes_read = guest_region.read(&mut buf_read, write_addr).unwrap(); + assert_eq!(number_of_bytes_read, 0x400); + assert_eq!(buf_read, [0xF0u8; 0x400]); + + // Error invalid backend address case in write(). + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .write(&buf_to_write, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in read(). 
+ assert!(matches!( + guest_region + .read(&mut buf_read, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_write_and_read_on_raw_region() { + let start_addr = GuestAddress(0x0); + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_region = GuestRegionHybrid::from_raw_region(raw_region); + let buf_to_write = [0xF0u8; 0x400]; + let write_addr = MemoryRegionAddress(0x400); + + // Normal case. + let number_of_bytes_write = guest_region.write(&buf_to_write, write_addr).unwrap(); + assert_eq!(number_of_bytes_write, 0x400); + let mut buf_read = [0u8; 0x400]; + let number_of_bytes_read = guest_region.read(&mut buf_read, write_addr).unwrap(); + assert_eq!(number_of_bytes_read, 0x400); + assert_eq!(buf_read, [0xF0u8; 0x400]); + + // Error invalid backend address case in write(). + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .write(&buf_to_write, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in read(). + assert!(matches!( + guest_region + .read(&mut buf_read, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_write_slice_and_read_slice_on_mmap_region() { + let start_addr = GuestAddress(0x0); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + let buf_to_write = [0xF0u8; 0x400]; + let write_addr = MemoryRegionAddress(0x400); + + // Normal case. + guest_region.write_slice(&buf_to_write, write_addr).unwrap(); + let mut buf_read = [0x0u8; 0x400]; + guest_region.read_slice(&mut buf_read, write_addr).unwrap(); + assert_eq!(buf_read, [0xF0u8; 0x400]); + + // Error invalid backend address case in write_slice(). 
+ let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .write_slice(&buf_to_write, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error partial buffer case in write_slice(). + let insufficient_addr = MemoryRegionAddress(0x600); + assert_eq!( + format!( + "{:?}", + guest_region + .write_slice(&buf_to_write, insufficient_addr) + .err() + .unwrap() + ), + format!( + "PartialBuffer {{ expected: {:?}, completed: {:?} }}", + buf_to_write.len(), + guest_region.len() as usize - 0x600_usize + ) + ); + + // Error invalid backend address case in write_slice(). + let invalid_addr = MemoryRegionAddress(0x900); + let mut buf_read = [0x0u8; 0x400]; + assert!(matches!( + guest_region + .read_slice(&mut buf_read, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error partial buffer case in write_slice(). + let insufficient_addr = MemoryRegionAddress(0x600); + let mut buf_read = [0x0u8; 0x400]; + assert_eq!( + format!( + "{:?}", + guest_region + .read_slice(&mut buf_read, insufficient_addr) + .err() + .unwrap() + ), + format!( + "PartialBuffer {{ expected: {:?}, completed: {:?} }}", + buf_to_write.len(), + guest_region.len() as usize - 0x600_usize + ) + ); + assert_eq!( + { + let mut buf = [0x0u8; 0x400]; + for cell in buf.iter_mut().take(0x200) { + *cell = 0xF0; + } + buf + }, + buf_read + ); + } + + #[test] + fn test_write_and_read_slice_on_raw_region() { + let start_addr = GuestAddress(0x0); + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_region = GuestRegionHybrid::from_raw_region(raw_region); + let buf_to_write = [0xF0u8; 0x400]; + let write_addr = MemoryRegionAddress(0x400); + + // Normal case. 
+ guest_region.write_slice(&buf_to_write, write_addr).unwrap(); + let mut buf_read = [0x0u8; 0x400]; + guest_region.read_slice(&mut buf_read, write_addr).unwrap(); + assert_eq!(buf_read, [0xF0u8; 0x400]); + + // Error invalid backend address case in write_slice(). + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .write_slice(&buf_to_write, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error partial buffer case in write_slice(). + let insufficient_addr = MemoryRegionAddress(0x600); + assert_eq!( + format!( + "{:?}", + guest_region + .write_slice(&buf_to_write, insufficient_addr) + .err() + .unwrap() + ), + format!( + "PartialBuffer {{ expected: {:?}, completed: {:?} }}", + buf_to_write.len(), + guest_region.len() as usize - 0x600_usize + ) + ); + + // Error invalid backend address case in write_slice(). + let invalid_addr = MemoryRegionAddress(0x900); + let mut buf_read = [0x0u8; 0x400]; + assert!(matches!( + guest_region + .read_slice(&mut buf_read, invalid_addr) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error partial buffer case in write_slice(). 
+ let insufficient_addr = MemoryRegionAddress(0x600); + let mut buf_read = [0x0u8; 0x400]; + assert_eq!( + format!( + "{:?}", + guest_region + .read_slice(&mut buf_read, insufficient_addr) + .err() + .unwrap() + ), + format!( + "PartialBuffer {{ expected: {:?}, completed: {:?} }}", + buf_to_write.len(), + guest_region.len() as usize - 0x600_usize + ) + ); + assert_eq!( + { + let mut buf = [0x0u8; 0x400]; + for cell in buf.iter_mut().take(0x200) { + *cell = 0xF0; + } + buf + }, + buf_read + ); + } + + #[test] + fn test_read_from_and_write_to_on_mmap_region() { + let start_addr = GuestAddress(0x0); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + let write_addr = MemoryRegionAddress(0x400); + let original_content = b"hello world"; + let size_of_file = original_content.len(); + + // Normal case. + let mut file_to_write_mmap_region = TempFile::new().unwrap().into_file(); + file_to_write_mmap_region + .set_len(size_of_file as u64) + .unwrap(); + file_to_write_mmap_region + .write_all(original_content) + .unwrap(); + // Rewind file pointer after write operation. + file_to_write_mmap_region.rewind().unwrap(); + guest_region + .read_from(write_addr, &mut file_to_write_mmap_region, size_of_file) + .unwrap(); + let mut file_read_from_mmap_region = TempFile::new().unwrap().into_file(); + file_read_from_mmap_region + .set_len(size_of_file as u64) + .unwrap(); + guest_region + .write_all_to(write_addr, &mut file_read_from_mmap_region, size_of_file) + .unwrap(); + // Rewind file pointer after write operation. 
+ file_read_from_mmap_region.rewind().unwrap(); + let mut content = String::new(); + file_read_from_mmap_region + .read_to_string(&mut content) + .unwrap(); + assert_eq!(content.as_bytes(), original_content); + assert_eq!( + file_read_from_mmap_region.metadata().unwrap().len(), + size_of_file as u64 + ); + + // Error invalid backend address case in read_from() on mmap region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .read_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in write_to() on mmap region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .write_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_read_from_and_write_to_on_raw_region() { + let start_addr = GuestAddress(0x0); + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_region = GuestRegionHybrid::from_raw_region(raw_region); + let write_addr = MemoryRegionAddress(0x400); + let original_content = b"hello world"; + let size_of_file = original_content.len(); + + // Normal case. + let mut file_to_write_mmap_region = TempFile::new().unwrap().into_file(); + file_to_write_mmap_region + .set_len(size_of_file as u64) + .unwrap(); + file_to_write_mmap_region + .write_all(original_content) + .unwrap(); + // Rewind file pointer after write operation. 
+ file_to_write_mmap_region.rewind().unwrap(); + guest_region + .read_from(write_addr, &mut file_to_write_mmap_region, size_of_file) + .unwrap(); + let mut file_read_from_mmap_region = TempFile::new().unwrap().into_file(); + file_read_from_mmap_region + .set_len(size_of_file as u64) + .unwrap(); + guest_region + .write_all_to(write_addr, &mut file_read_from_mmap_region, size_of_file) + .unwrap(); + // Rewind file pointer after write operation. + file_read_from_mmap_region.rewind().unwrap(); + let mut content = String::new(); + file_read_from_mmap_region + .read_to_string(&mut content) + .unwrap(); + assert_eq!(content.as_bytes(), original_content); + assert_eq!( + file_read_from_mmap_region.metadata().unwrap().len(), + size_of_file as u64 + ); + + // Error invalid backend address case in read_from() on raw region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .read_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in write_to() on raw region. 
+ let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region + .write_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_write_all_to_and_read_exact_from() { + let start_addr = GuestAddress(0x0); + let write_addr = MemoryRegionAddress(0x400); + let original_content = b"hello world"; + let size_of_file = original_content.len(); + // Preset a GuestRegionHybrid from a mmap region + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_mmap_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + // Preset a GuestRegionHybrid from a raw region + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_raw_region = GuestRegionHybrid::from_raw_region(raw_region); + + // Normal case on mmap region. + let mut file_to_write_mmap_region = TempFile::new().unwrap().into_file(); + file_to_write_mmap_region + .set_len(size_of_file as u64) + .unwrap(); + file_to_write_mmap_region + .write_all(original_content) + .unwrap(); + file_to_write_mmap_region.rewind().unwrap(); + guest_mmap_region + .read_exact_from(write_addr, &mut file_to_write_mmap_region, size_of_file) + .unwrap(); + let mut file_read_from_mmap_region = TempFile::new().unwrap().into_file(); + file_read_from_mmap_region + .set_len(size_of_file as u64) + .unwrap(); + guest_mmap_region + .write_all_to(write_addr, &mut file_read_from_mmap_region, size_of_file) + .unwrap(); + file_read_from_mmap_region.rewind().unwrap(); + let mut content = String::new(); + file_read_from_mmap_region + .read_to_string(&mut content) + .unwrap(); + assert_eq!(content.as_bytes(), original_content); + assert_eq!( + file_read_from_mmap_region.metadata().unwrap().len(), + size_of_file as u64 + ); + + // Normal case on raw region. 
+ let mut file_to_write_raw_region = TempFile::new().unwrap().into_file(); + file_to_write_raw_region + .set_len(size_of_file as u64) + .unwrap(); + file_to_write_raw_region + .write_all(original_content) + .unwrap(); + file_to_write_raw_region.rewind().unwrap(); + guest_raw_region + .read_exact_from(write_addr, &mut file_to_write_raw_region, size_of_file) + .unwrap(); + let mut file_read_from_raw_region = TempFile::new().unwrap().into_file(); + file_read_from_raw_region + .set_len(size_of_file as u64) + .unwrap(); + guest_raw_region + .write_all_to(write_addr, &mut file_read_from_raw_region, size_of_file) + .unwrap(); + file_read_from_raw_region.rewind().unwrap(); + let mut content = String::new(); + file_read_from_raw_region + .read_to_string(&mut content) + .unwrap(); + assert_eq!(content.as_bytes(), original_content); + assert_eq!( + file_read_from_raw_region.metadata().unwrap().len(), + size_of_file as u64 + ); + + // Error invalid backend address case in read_exact_from() on mmap region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_mmap_region + .read_exact_from(invalid_addr, &mut file_to_write_mmap_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in write_all_to() on mmap region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_mmap_region + .write_all_to(invalid_addr, &mut file_read_from_mmap_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in read_exact_from() on raw region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_raw_region + .read_exact_from(invalid_addr, &mut file_to_write_raw_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in write_all_to() on raw region. 
+ let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_raw_region + .write_all_to(invalid_addr, &mut file_read_from_raw_region, size_of_file) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_store_and_load() { + let test_val = 0xFF; + let start_addr = GuestAddress(0x0); + let write_addr = MemoryRegionAddress(0x400); + // Preset a GuestRegionHybrid from a mmap region + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_mmap_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + // Preset a GuestRegionHybrid from a raw region + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_raw_region = GuestRegionHybrid::from_raw_region(raw_region); + + // Normal case. + guest_mmap_region + .store(test_val, write_addr, Ordering::Relaxed) + .unwrap(); + let val_read_from_mmap_region: u64 = guest_mmap_region + .load(write_addr, Ordering::Relaxed) + .unwrap(); + assert_eq!(val_read_from_mmap_region, test_val); + guest_raw_region + .store(test_val, write_addr, Ordering::Relaxed) + .unwrap(); + let val_read_from_raw_region: u64 = guest_raw_region + .load(write_addr, Ordering::Relaxed) + .unwrap(); + assert_eq!(val_read_from_raw_region, test_val); + + // Error invalid backend address case in store() on mmap region. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_mmap_region + .store(test_val, invalid_addr, Ordering::Relaxed) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in store() on raw region. 
+ let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_raw_region + .store(test_val, invalid_addr, Ordering::Relaxed) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in laod() on mmap region. + assert!(matches!( + guest_mmap_region + .load::(invalid_addr, Ordering::Relaxed) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + + // Error invalid backend address case in laod() on raw region. + assert!(matches!( + guest_raw_region + .load::(invalid_addr, Ordering::Relaxed) + .err() + .unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_bitmap() { + // TODO: #185 Need futher and detailed test on bitmap object. + let start_addr = GuestAddress(0x0); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_mmap_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_raw_region = GuestRegionHybrid::from_raw_region(raw_region); + + assert_eq!(guest_mmap_region.bitmap(), guest_raw_region.bitmap()); + } + + #[test] + fn test_get_host_address_on_mmap_region() { + let start_addr = GuestAddress(0x0); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x800).unwrap(), start_addr).unwrap(); + let guest_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + + // Normal case. + let addr_1 = guest_region + .get_host_address(MemoryRegionAddress(0x0)) + .unwrap(); + let addr_2 = guest_region + .get_host_address(MemoryRegionAddress(0x400)) + .unwrap(); + assert_eq!(addr_1 as u64 + 0x400, addr_2 as u64); + + // Error invalid backend address case. 
+ let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region.get_host_address(invalid_addr).err().unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + #[test] + fn test_get_host_address_on_raw_region() { + let start_addr = GuestAddress(0x0); + let mut buf_of_raw_region = [0u8; 0x800]; + let raw_region = unsafe { + GuestRegionRaw::<()>::new(start_addr, &mut buf_of_raw_region as *mut _, 0x800) + }; + let guest_region = GuestRegionHybrid::from_raw_region(raw_region); + + // Normal case. + let addr_1 = guest_region + .get_host_address(MemoryRegionAddress(0x0)) + .unwrap(); + let addr_2 = guest_region + .get_host_address(MemoryRegionAddress(0x400)) + .unwrap(); + assert_eq!(addr_1 as u64 + 0x400, addr_2 as u64); + + // Error invalid backend address case. + let invalid_addr = MemoryRegionAddress(0x900); + assert!(matches!( + guest_region.get_host_address(invalid_addr).err().unwrap(), + GuestMemoryError::InvalidBackendAddress + )); + } + + // TODO: #186 The following function are not yet implemented: + // - 'fn file_offset()' + // - 'unsafe fn as_slice()' + // - 'unsafe fn as_mut_slice()' + // Tests of these functions will be needed when they are implemented. + + #[test] + fn test_guest_memory_mmap_get_slice() { + //Preset a GuestRegionHybrid from a mmap region + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x400).unwrap(), GuestAddress(0)).unwrap(); + let guest_mmap_region = GuestRegionHybrid::from_mmap_region(mmap_reg); + + // Normal case. + let slice_addr = MemoryRegionAddress(0x100); + let slice_size = 0x200; + let slice = guest_mmap_region.get_slice(slice_addr, slice_size).unwrap(); + assert_eq!(slice.len(), slice_size); + + // Empty slice. + let slice_addr = MemoryRegionAddress(0x200); + let slice_size = 0x0; + let slice = guest_mmap_region.get_slice(slice_addr, slice_size).unwrap(); + assert!(slice.is_empty()); + + // Error case when slice_size is beyond the boundary. 
+ let slice_addr = MemoryRegionAddress(0x300); + let slice_size = 0x200; + assert!(guest_mmap_region.get_slice(slice_addr, slice_size).is_err()); + } + + #[test] + fn test_from_regions_on_guest_memory_hybrid() { + // Normal case. + let mut regions = Vec::>::new(); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x100).unwrap(), GuestAddress(0x100)) + .unwrap(); + regions.push(GuestRegionHybrid::Mmap(mmap_reg)); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x100).unwrap(), GuestAddress(0x200)) + .unwrap(); + regions.push(GuestRegionHybrid::Mmap(mmap_reg)); + let guest_region = GuestMemoryHybrid::<()>::from_regions(regions).unwrap(); + assert_eq!(guest_region.regions[0].start_addr(), GuestAddress(0x100)); + assert_eq!(guest_region.regions[1].start_addr(), GuestAddress(0x200)); + + // Error unsorted region case. + let mut regions = Vec::>::new(); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x400).unwrap(), GuestAddress(0x200)) + .unwrap(); + regions.push(GuestRegionHybrid::Mmap(mmap_reg)); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x400).unwrap(), GuestAddress(0x100)) + .unwrap(); + regions.push(GuestRegionHybrid::Mmap(mmap_reg)); + let guest_region = GuestMemoryHybrid::<()>::from_regions(regions); + assert!(matches!( + guest_region.err().unwrap(), + Error::UnsortedMemoryRegions + )); + + // Error no memory region case. 
+ let regions = Vec::>::new(); + let guest_region = GuestMemoryHybrid::<()>::from_regions(regions); + assert!(matches!(guest_region.err().unwrap(), Error::NoMemoryRegion)); + } + + #[test] + fn test_iterator_on_guest_region_hybrid() { + let mut regions = Vec::>::new(); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x100).unwrap(), GuestAddress(0x100)) + .unwrap(); + regions.push(GuestRegionHybrid::Mmap(mmap_reg)); + let mmap_reg = + GuestRegionMmap::new(MmapRegion::<()>::new(0x100).unwrap(), GuestAddress(0x200)) + .unwrap(); + regions.push(GuestRegionHybrid::Mmap(mmap_reg)); + let guest_region = GuestMemoryHybrid::<()>::from_regions(regions).unwrap(); + let mut region = guest_region.iter(); + + assert_eq!(region.next().unwrap().start_addr(), GuestAddress(0x100)); + assert_eq!(region.next().unwrap().start_addr(), GuestAddress(0x200)); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/memory/mod.rs b/src/dragonball/src/dbs_address_space/src/memory/mod.rs new file mode 100644 index 000000000..371acda9d --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/memory/mod.rs @@ -0,0 +1,193 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Structs to manage guest memory for virtual machines. +//! +//! The `vm-memory` crate only provides traits and structs to access normal guest memory, +//! it doesn't support special guest memory like virtio-fs/virtio-pmem DAX window etc. +//! So this crate provides `GuestMemoryManager` over `vm-memory` to provide uniform abstraction +//! for all guest memory. +//! +//! It also provides interfaces to coordinate guest memory hotplug events. 
+ +use std::str::FromStr; +use std::sync::Arc; +use vm_memory::{GuestAddressSpace, GuestMemoryAtomic, GuestMemoryLoadGuard, GuestMemoryMmap}; + +mod raw_region; +pub use raw_region::GuestRegionRaw; + +mod hybrid; +pub use hybrid::{GuestMemoryHybrid, GuestRegionHybrid}; + +/// Type of source to allocate memory for virtual machines. +#[derive(Debug, Eq, PartialEq)] +pub enum MemorySourceType { + /// File on HugeTlbFs. + FileOnHugeTlbFs, + /// mmap() without flag `MAP_HUGETLB`. + MmapAnonymous, + /// mmap() with flag `MAP_HUGETLB`. + MmapAnonymousHugeTlbFs, + /// memfd() without flag `MFD_HUGETLB`. + MemFdShared, + /// memfd() with flag `MFD_HUGETLB`. + MemFdOnHugeTlbFs, +} + +impl MemorySourceType { + /// Check whether the memory source is huge page. + pub fn is_hugepage(&self) -> bool { + *self == Self::FileOnHugeTlbFs + || *self == Self::MmapAnonymousHugeTlbFs + || *self == Self::MemFdOnHugeTlbFs + } + + /// Check whether the memory source is anonymous memory. + pub fn is_mmap_anonymous(&self) -> bool { + *self == Self::MmapAnonymous || *self == Self::MmapAnonymousHugeTlbFs + } +} + +impl FromStr for MemorySourceType { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "hugetlbfs" => Ok(MemorySourceType::FileOnHugeTlbFs), + "memfd" => Ok(MemorySourceType::MemFdShared), + "shmem" => Ok(MemorySourceType::MemFdShared), + "hugememfd" => Ok(MemorySourceType::MemFdOnHugeTlbFs), + "hugeshmem" => Ok(MemorySourceType::MemFdOnHugeTlbFs), + "anon" => Ok(MemorySourceType::MmapAnonymous), + "mmap" => Ok(MemorySourceType::MmapAnonymous), + "hugeanon" => Ok(MemorySourceType::MmapAnonymousHugeTlbFs), + "hugemmap" => Ok(MemorySourceType::MmapAnonymousHugeTlbFs), + _ => Err(format!("unknown memory source type {s}")), + } + } +} + +#[derive(Debug, Default)] +struct GuestMemoryHotplugManager {} + +/// The `GuestMemoryManager` manages all guest memory for virtual machines. +/// +/// The `GuestMemoryManager` fulfills several different responsibilities. 
+/// - First, it manages different types of guest memory, such as normal guest memory, virtio-fs +/// DAX window and virtio-pmem DAX window etc. Different clients may want to access different +/// types of memory. So the manager maintains two GuestMemory objects, one contains all guest +/// memory, the other contains only normal guest memory. +/// - Second, it coordinates memory/DAX window hotplug events, so clients may register hooks +/// to receive hotplug notifications. +#[allow(unused)] +#[derive(Debug, Clone)] +pub struct GuestMemoryManager { + default: GuestMemoryAtomic, + /// GuestMemory object hosts all guest memory. + hybrid: GuestMemoryAtomic, + /// GuestMemory object for vIOMMU. + iommu: GuestMemoryAtomic, + /// GuestMemory object hosts normal guest memory. + normal: GuestMemoryAtomic, + hotplug: Arc, +} + +impl GuestMemoryManager { + /// Create a new instance of `GuestMemoryManager`. + pub fn new() -> Self { + Self::default() + } + + /// Get a reference to the normal `GuestMemory` object. + pub fn get_normal_guest_memory(&self) -> &GuestMemoryAtomic { + &self.normal + } + + /// Try to downcast the `GuestAddressSpace` object to a `GuestMemoryManager` object. + pub fn to_manager(_m: &AS) -> Option<&Self> { + None + } +} + +impl Default for GuestMemoryManager { + fn default() -> Self { + let hybrid = GuestMemoryAtomic::new(GuestMemoryHybrid::new()); + let iommu = GuestMemoryAtomic::new(GuestMemoryHybrid::new()); + let normal = GuestMemoryAtomic::new(GuestMemoryMmap::new()); + // By default, it provides to the `GuestMemoryHybrid` object containing all guest memory. 
+ let default = hybrid.clone(); + + GuestMemoryManager { + default, + hybrid, + iommu, + normal, + hotplug: Arc::new(GuestMemoryHotplugManager::default()), + } + } +} + +impl GuestAddressSpace for GuestMemoryManager { + type M = GuestMemoryHybrid; + type T = GuestMemoryLoadGuard; + + fn memory(&self) -> Self::T { + self.default.memory() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_source_type() { + assert_eq!( + MemorySourceType::from_str("hugetlbfs").unwrap(), + MemorySourceType::FileOnHugeTlbFs + ); + assert_eq!( + MemorySourceType::from_str("memfd").unwrap(), + MemorySourceType::MemFdShared + ); + assert_eq!( + MemorySourceType::from_str("shmem").unwrap(), + MemorySourceType::MemFdShared + ); + assert_eq!( + MemorySourceType::from_str("hugememfd").unwrap(), + MemorySourceType::MemFdOnHugeTlbFs + ); + assert_eq!( + MemorySourceType::from_str("hugeshmem").unwrap(), + MemorySourceType::MemFdOnHugeTlbFs + ); + assert_eq!( + MemorySourceType::from_str("anon").unwrap(), + MemorySourceType::MmapAnonymous + ); + assert_eq!( + MemorySourceType::from_str("mmap").unwrap(), + MemorySourceType::MmapAnonymous + ); + assert_eq!( + MemorySourceType::from_str("hugeanon").unwrap(), + MemorySourceType::MmapAnonymousHugeTlbFs + ); + assert_eq!( + MemorySourceType::from_str("hugemmap").unwrap(), + MemorySourceType::MmapAnonymousHugeTlbFs + ); + assert!(MemorySourceType::from_str("test").is_err()); + } + + #[ignore] + #[test] + fn test_to_manager() { + let manager = GuestMemoryManager::new(); + let mgr = GuestMemoryManager::to_manager(&manager).unwrap(); + + assert_eq!(&manager as *const _, mgr as *const _); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/memory/raw_region.rs b/src/dragonball/src/dbs_address_space/src/memory/raw_region.rs new file mode 100644 index 000000000..5af21ca3e --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/memory/raw_region.rs @@ -0,0 +1,990 @@ +// Copyright (C) 2022 Alibaba Cloud. 
All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::io::{Read, Write}; +use std::sync::atomic::Ordering; + +use vm_memory::bitmap::{Bitmap, BS}; +use vm_memory::mmap::NewBitmap; +use vm_memory::volatile_memory::compute_offset; +use vm_memory::{ + guest_memory, volatile_memory, Address, AtomicAccess, Bytes, FileOffset, GuestAddress, + GuestMemoryRegion, GuestUsize, MemoryRegionAddress, VolatileSlice, +}; + +/// Guest memory region for virtio-fs DAX window. +#[derive(Debug)] +pub struct GuestRegionRaw { + guest_base: GuestAddress, + addr: *mut u8, + size: usize, + bitmap: B, +} + +impl GuestRegionRaw { + /// Create a `GuestRegionRaw` object from raw pointer. + /// + /// # Safety + /// Caller needs to ensure `addr` and `size` are valid with static lifetime. + pub unsafe fn new(guest_base: GuestAddress, addr: *mut u8, size: usize) -> Self { + let bitmap = B::with_len(size); + + GuestRegionRaw { + guest_base, + addr, + size, + bitmap, + } + } +} + +impl Bytes for GuestRegionRaw { + type E = guest_memory::Error; + + fn write(&self, buf: &[u8], addr: MemoryRegionAddress) -> guest_memory::Result { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .write(buf, maddr) + .map_err(Into::into) + } + + fn read(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> guest_memory::Result { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .read(buf, maddr) + .map_err(Into::into) + } + + fn write_slice(&self, buf: &[u8], addr: MemoryRegionAddress) -> guest_memory::Result<()> { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .write_slice(buf, maddr) + .map_err(Into::into) + } + + fn read_slice(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> guest_memory::Result<()> { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .read_slice(buf, maddr) + .map_err(Into::into) + } + + fn read_from( + &self, + addr: MemoryRegionAddress, + src: &mut 
F, + count: usize, + ) -> guest_memory::Result + where + F: Read, + { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .read_from::(maddr, src, count) + .map_err(Into::into) + } + + fn read_exact_from( + &self, + addr: MemoryRegionAddress, + src: &mut F, + count: usize, + ) -> guest_memory::Result<()> + where + F: Read, + { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .read_exact_from::(maddr, src, count) + .map_err(Into::into) + } + + fn write_to( + &self, + addr: MemoryRegionAddress, + dst: &mut F, + count: usize, + ) -> guest_memory::Result + where + F: Write, + { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .write_to::(maddr, dst, count) + .map_err(Into::into) + } + + fn write_all_to( + &self, + addr: MemoryRegionAddress, + dst: &mut F, + count: usize, + ) -> guest_memory::Result<()> + where + F: Write, + { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice() + .unwrap() + .write_all_to::(maddr, dst, count) + .map_err(Into::into) + } + + fn store( + &self, + val: T, + addr: MemoryRegionAddress, + order: Ordering, + ) -> guest_memory::Result<()> { + self.as_volatile_slice().and_then(|s| { + s.store(val, addr.raw_value() as usize, order) + .map_err(Into::into) + }) + } + + fn load( + &self, + addr: MemoryRegionAddress, + order: Ordering, + ) -> guest_memory::Result { + self.as_volatile_slice() + .and_then(|s| s.load(addr.raw_value() as usize, order).map_err(Into::into)) + } +} + +impl GuestMemoryRegion for GuestRegionRaw { + type B = B; + + fn len(&self) -> GuestUsize { + self.size as GuestUsize + } + + fn start_addr(&self) -> GuestAddress { + self.guest_base + } + + fn bitmap(&self) -> &Self::B { + &self.bitmap + } + + fn get_host_address(&self, addr: MemoryRegionAddress) -> guest_memory::Result<*mut u8> { + // Not sure why wrapping_offset is not unsafe. Anyway this + // is safe because we've just range-checked addr using check_address. 
+ self.check_address(addr) + .ok_or(guest_memory::Error::InvalidBackendAddress) + .map(|addr| self.addr.wrapping_offset(addr.raw_value() as isize)) + } + + fn file_offset(&self) -> Option<&FileOffset> { + None + } + + unsafe fn as_slice(&self) -> Option<&[u8]> { + // This is safe because we mapped the area at addr ourselves, so this slice will not + // overflow. However, it is possible to alias. + Some(std::slice::from_raw_parts(self.addr, self.size)) + } + + unsafe fn as_mut_slice(&self) -> Option<&mut [u8]> { + // This is safe because we mapped the area at addr ourselves, so this slice will not + // overflow. However, it is possible to alias. + Some(std::slice::from_raw_parts_mut(self.addr, self.size)) + } + + fn get_slice( + &self, + offset: MemoryRegionAddress, + count: usize, + ) -> guest_memory::Result>> { + let offset = offset.raw_value() as usize; + let end = compute_offset(offset, count)?; + if end > self.size { + return Err(volatile_memory::Error::OutOfBounds { addr: end }.into()); + } + + // Safe because we checked that offset + count was within our range and we only ever hand + // out volatile accessors. 
+ Ok(unsafe { + VolatileSlice::with_bitmap( + (self.addr as usize + offset) as *mut _, + count, + self.bitmap.slice_at(offset), + ) + }) + } + + #[cfg(target_os = "linux")] + fn is_hugetlbfs(&self) -> Option { + None + } +} + +#[cfg(test)] +mod tests { + extern crate vmm_sys_util; + + use super::*; + use crate::{GuestMemoryHybrid, GuestRegionHybrid}; + use std::sync::Arc; + use vm_memory::{GuestAddressSpace, GuestMemory, VolatileMemory}; + + /* + use crate::bitmap::tests::test_guest_memory_and_region; + use crate::bitmap::AtomicBitmap; + use crate::GuestAddressSpace; + + use std::fs::File; + use std::mem; + use std::path::Path; + use vmm_sys_util::tempfile::TempFile; + + type GuestMemoryMmap = super::GuestMemoryMmap<()>; + type GuestRegionMmap = super::GuestRegionMmap<()>; + type MmapRegion = super::MmapRegion<()>; + */ + + #[test] + fn test_region_raw_new() { + let mut buf = [0u8; 1024]; + let m = + unsafe { GuestRegionRaw::<()>::new(GuestAddress(0x10_0000), &mut buf as *mut _, 1024) }; + + assert_eq!(m.start_addr(), GuestAddress(0x10_0000)); + assert_eq!(m.len(), 1024); + } + + /* + fn check_guest_memory_mmap( + maybe_guest_mem: Result, + expected_regions_summary: &[(GuestAddress, usize)], + ) { + assert!(maybe_guest_mem.is_ok()); + + let guest_mem = maybe_guest_mem.unwrap(); + assert_eq!(guest_mem.num_regions(), expected_regions_summary.len()); + let maybe_last_mem_reg = expected_regions_summary.last(); + if let Some((region_addr, region_size)) = maybe_last_mem_reg { + let mut last_addr = region_addr.unchecked_add(*region_size as u64); + if last_addr.raw_value() != 0 { + last_addr = last_addr.unchecked_sub(1); + } + assert_eq!(guest_mem.last_addr(), last_addr); + } + for ((region_addr, region_size), mmap) in expected_regions_summary + .iter() + .zip(guest_mem.regions.iter()) + { + assert_eq!(region_addr, &mmap.guest_base); + assert_eq!(region_size, &mmap.mapping.size()); + + assert!(guest_mem.find_region(*region_addr).is_some()); + } + } + + fn 
new_guest_memory_mmap( + regions_summary: &[(GuestAddress, usize)], + ) -> Result { + GuestMemoryMmap::from_ranges(regions_summary) + } + + fn new_guest_memory_mmap_from_regions( + regions_summary: &[(GuestAddress, usize)], + ) -> Result { + GuestMemoryMmap::from_regions( + regions_summary + .iter() + .map(|(region_addr, region_size)| { + GuestRegionMmap::new(MmapRegion::new(*region_size).unwrap(), *region_addr) + .unwrap() + }) + .collect(), + ) + } + + fn new_guest_memory_mmap_from_arc_regions( + regions_summary: &[(GuestAddress, usize)], + ) -> Result { + GuestMemoryMmap::from_arc_regions( + regions_summary + .iter() + .map(|(region_addr, region_size)| { + Arc::new( + GuestRegionMmap::new(MmapRegion::new(*region_size).unwrap(), *region_addr) + .unwrap(), + ) + }) + .collect(), + ) + } + + fn new_guest_memory_mmap_with_files( + regions_summary: &[(GuestAddress, usize)], + ) -> Result { + let regions: Vec<(GuestAddress, usize, Option)> = regions_summary + .iter() + .map(|(region_addr, region_size)| { + let f = TempFile::new().unwrap().into_file(); + f.set_len(*region_size as u64).unwrap(); + + (*region_addr, *region_size, Some(FileOffset::new(f, 0))) + }) + .collect(); + + GuestMemoryMmap::from_ranges_with_files(®ions) + } + */ + + #[test] + fn slice_addr() { + let mut buf = [0u8; 1024]; + let m = + unsafe { GuestRegionRaw::<()>::new(GuestAddress(0x10_0000), &mut buf as *mut _, 1024) }; + + let s = m.get_slice(MemoryRegionAddress(2), 3).unwrap(); + assert_eq!(s.as_ptr(), &mut buf[2] as *mut _); + } + + /* + #[test] + fn test_address_in_range() { + let f1 = TempFile::new().unwrap().into_file(); + f1.set_len(0x400).unwrap(); + let f2 = TempFile::new().unwrap().into_file(); + f2.set_len(0x400).unwrap(); + + let start_addr1 = GuestAddress(0x0); + let start_addr2 = GuestAddress(0x800); + let guest_mem = + GuestMemoryMmap::from_ranges(&[(start_addr1, 0x400), (start_addr2, 0x400)]).unwrap(); + let guest_mem_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[ + 
(start_addr1, 0x400, Some(FileOffset::new(f1, 0))), + (start_addr2, 0x400, Some(FileOffset::new(f2, 0))), + ]) + .unwrap(); + + let guest_mem_list = vec![guest_mem, guest_mem_backed_by_file]; + for guest_mem in guest_mem_list.iter() { + assert!(guest_mem.address_in_range(GuestAddress(0x200))); + assert!(!guest_mem.address_in_range(GuestAddress(0x600))); + assert!(guest_mem.address_in_range(GuestAddress(0xa00))); + assert!(!guest_mem.address_in_range(GuestAddress(0xc00))); + } + } + + #[test] + fn test_check_address() { + let f1 = TempFile::new().unwrap().into_file(); + f1.set_len(0x400).unwrap(); + let f2 = TempFile::new().unwrap().into_file(); + f2.set_len(0x400).unwrap(); + + let start_addr1 = GuestAddress(0x0); + let start_addr2 = GuestAddress(0x800); + let guest_mem = + GuestMemoryMmap::from_ranges(&[(start_addr1, 0x400), (start_addr2, 0x400)]).unwrap(); + let guest_mem_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[ + (start_addr1, 0x400, Some(FileOffset::new(f1, 0))), + (start_addr2, 0x400, Some(FileOffset::new(f2, 0))), + ]) + .unwrap(); + + let guest_mem_list = vec![guest_mem, guest_mem_backed_by_file]; + for guest_mem in guest_mem_list.iter() { + assert_eq!( + guest_mem.check_address(GuestAddress(0x200)), + Some(GuestAddress(0x200)) + ); + assert_eq!(guest_mem.check_address(GuestAddress(0x600)), None); + assert_eq!( + guest_mem.check_address(GuestAddress(0xa00)), + Some(GuestAddress(0xa00)) + ); + assert_eq!(guest_mem.check_address(GuestAddress(0xc00)), None); + } + } + + #[test] + fn test_to_region_addr() { + let f1 = TempFile::new().unwrap().into_file(); + f1.set_len(0x400).unwrap(); + let f2 = TempFile::new().unwrap().into_file(); + f2.set_len(0x400).unwrap(); + + let start_addr1 = GuestAddress(0x0); + let start_addr2 = GuestAddress(0x800); + let guest_mem = + GuestMemoryMmap::from_ranges(&[(start_addr1, 0x400), (start_addr2, 0x400)]).unwrap(); + let guest_mem_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[ + (start_addr1, 0x400, 
Some(FileOffset::new(f1, 0))), + (start_addr2, 0x400, Some(FileOffset::new(f2, 0))), + ]) + .unwrap(); + + let guest_mem_list = vec![guest_mem, guest_mem_backed_by_file]; + for guest_mem in guest_mem_list.iter() { + assert!(guest_mem.to_region_addr(GuestAddress(0x600)).is_none()); + let (r0, addr0) = guest_mem.to_region_addr(GuestAddress(0x800)).unwrap(); + let (r1, addr1) = guest_mem.to_region_addr(GuestAddress(0xa00)).unwrap(); + assert!(r0.as_ptr() == r1.as_ptr()); + assert_eq!(addr0, MemoryRegionAddress(0)); + assert_eq!(addr1, MemoryRegionAddress(0x200)); + } + } + + #[test] + fn test_get_host_address() { + let f1 = TempFile::new().unwrap().into_file(); + f1.set_len(0x400).unwrap(); + let f2 = TempFile::new().unwrap().into_file(); + f2.set_len(0x400).unwrap(); + + let start_addr1 = GuestAddress(0x0); + let start_addr2 = GuestAddress(0x800); + let guest_mem = + GuestMemoryMmap::from_ranges(&[(start_addr1, 0x400), (start_addr2, 0x400)]).unwrap(); + let guest_mem_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[ + (start_addr1, 0x400, Some(FileOffset::new(f1, 0))), + (start_addr2, 0x400, Some(FileOffset::new(f2, 0))), + ]) + .unwrap(); + + let guest_mem_list = vec![guest_mem, guest_mem_backed_by_file]; + for guest_mem in guest_mem_list.iter() { + assert!(guest_mem.get_host_address(GuestAddress(0x600)).is_err()); + let ptr0 = guest_mem.get_host_address(GuestAddress(0x800)).unwrap(); + let ptr1 = guest_mem.get_host_address(GuestAddress(0xa00)).unwrap(); + assert_eq!( + ptr0, + guest_mem.find_region(GuestAddress(0x800)).unwrap().as_ptr() + ); + assert_eq!(unsafe { ptr0.offset(0x200) }, ptr1); + } + } + + #[test] + fn test_deref() { + let f = TempFile::new().unwrap().into_file(); + f.set_len(0x400).unwrap(); + + let start_addr = GuestAddress(0x0); + let guest_mem = GuestMemoryMmap::from_ranges(&[(start_addr, 0x400)]).unwrap(); + let guest_mem_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[( + start_addr, + 0x400, + Some(FileOffset::new(f, 0)), + 
)]) + .unwrap(); + + let guest_mem_list = vec![guest_mem, guest_mem_backed_by_file]; + for guest_mem in guest_mem_list.iter() { + let sample_buf = &[1, 2, 3, 4, 5]; + + assert_eq!(guest_mem.write(sample_buf, start_addr).unwrap(), 5); + let slice = guest_mem + .find_region(GuestAddress(0)) + .unwrap() + .as_volatile_slice() + .unwrap(); + + let buf = &mut [0, 0, 0, 0, 0]; + assert_eq!(slice.read(buf, 0).unwrap(), 5); + assert_eq!(buf, sample_buf); + } + } + + #[test] + fn test_read_u64() { + let f1 = TempFile::new().unwrap().into_file(); + f1.set_len(0x1000).unwrap(); + let f2 = TempFile::new().unwrap().into_file(); + f2.set_len(0x1000).unwrap(); + + let start_addr1 = GuestAddress(0x0); + let start_addr2 = GuestAddress(0x1000); + let bad_addr = GuestAddress(0x2001); + let bad_addr2 = GuestAddress(0x1ffc); + let max_addr = GuestAddress(0x2000); + + let gm = + GuestMemoryMmap::from_ranges(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]).unwrap(); + let gm_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[ + (start_addr1, 0x1000, Some(FileOffset::new(f1, 0))), + (start_addr2, 0x1000, Some(FileOffset::new(f2, 0))), + ]) + .unwrap(); + + let gm_list = vec![gm, gm_backed_by_file]; + for gm in gm_list.iter() { + let val1: u64 = 0xaa55_aa55_aa55_aa55; + let val2: u64 = 0x55aa_55aa_55aa_55aa; + assert_eq!( + format!("{:?}", gm.write_obj(val1, bad_addr).err().unwrap()), + format!("InvalidGuestAddress({:?})", bad_addr,) + ); + assert_eq!( + format!("{:?}", gm.write_obj(val1, bad_addr2).err().unwrap()), + format!( + "PartialBuffer {{ expected: {:?}, completed: {:?} }}", + mem::size_of::(), + max_addr.checked_offset_from(bad_addr2).unwrap() + ) + ); + + gm.write_obj(val1, GuestAddress(0x500)).unwrap(); + gm.write_obj(val2, GuestAddress(0x1000 + 32)).unwrap(); + let num1: u64 = gm.read_obj(GuestAddress(0x500)).unwrap(); + let num2: u64 = gm.read_obj(GuestAddress(0x1000 + 32)).unwrap(); + assert_eq!(val1, num1); + assert_eq!(val2, num2); + } + } + + #[test] + fn 
write_and_read() { + let f = TempFile::new().unwrap().into_file(); + f.set_len(0x400).unwrap(); + + let mut start_addr = GuestAddress(0x1000); + let gm = GuestMemoryMmap::from_ranges(&[(start_addr, 0x400)]).unwrap(); + let gm_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[( + start_addr, + 0x400, + Some(FileOffset::new(f, 0)), + )]) + .unwrap(); + + let gm_list = vec![gm, gm_backed_by_file]; + for gm in gm_list.iter() { + let sample_buf = &[1, 2, 3, 4, 5]; + + assert_eq!(gm.write(sample_buf, start_addr).unwrap(), 5); + + let buf = &mut [0u8; 5]; + assert_eq!(gm.read(buf, start_addr).unwrap(), 5); + assert_eq!(buf, sample_buf); + + start_addr = GuestAddress(0x13ff); + assert_eq!(gm.write(sample_buf, start_addr).unwrap(), 1); + assert_eq!(gm.read(buf, start_addr).unwrap(), 1); + assert_eq!(buf[0], sample_buf[0]); + start_addr = GuestAddress(0x1000); + } + } + + #[test] + fn read_to_and_write_from_mem() { + let f = TempFile::new().unwrap().into_file(); + f.set_len(0x400).unwrap(); + + let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0x1000), 0x400)]).unwrap(); + let gm_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[( + GuestAddress(0x1000), + 0x400, + Some(FileOffset::new(f, 0)), + )]) + .unwrap(); + + let gm_list = vec![gm, gm_backed_by_file]; + for gm in gm_list.iter() { + let addr = GuestAddress(0x1010); + let mut file = if cfg!(unix) { + File::open(Path::new("/dev/zero")).unwrap() + } else { + File::open(Path::new("c:\\Windows\\system32\\ntoskrnl.exe")).unwrap() + }; + gm.write_obj(!0u32, addr).unwrap(); + gm.read_exact_from(addr, &mut file, mem::size_of::()) + .unwrap(); + let value: u32 = gm.read_obj(addr).unwrap(); + if cfg!(unix) { + assert_eq!(value, 0); + } else { + assert_eq!(value, 0x0090_5a4d); + } + + let mut sink = Vec::new(); + gm.write_all_to(addr, &mut sink, mem::size_of::()) + .unwrap(); + if cfg!(unix) { + assert_eq!(sink, vec![0; mem::size_of::()]); + } else { + assert_eq!(sink, vec![0x4d, 0x5a, 0x90, 0x00]); + }; + } 
+ } + + #[test] + fn create_vec_with_regions() { + let region_size = 0x400; + let regions = vec![ + (GuestAddress(0x0), region_size), + (GuestAddress(0x1000), region_size), + ]; + let mut iterated_regions = Vec::new(); + let gm = GuestMemoryMmap::from_ranges(®ions).unwrap(); + + for region in gm.iter() { + assert_eq!(region.len(), region_size as GuestUsize); + } + + for region in gm.iter() { + iterated_regions.push((region.start_addr(), region.len() as usize)); + } + assert_eq!(regions, iterated_regions); + + assert!(regions + .iter() + .map(|x| (x.0, x.1)) + .eq(iterated_regions.iter().copied())); + + assert_eq!(gm.regions[0].guest_base, regions[0].0); + assert_eq!(gm.regions[1].guest_base, regions[1].0); + } + + #[test] + fn test_memory() { + let region_size = 0x400; + let regions = vec![ + (GuestAddress(0x0), region_size), + (GuestAddress(0x1000), region_size), + ]; + let mut iterated_regions = Vec::new(); + let gm = Arc::new(GuestMemoryMmap::from_ranges(®ions).unwrap()); + let mem = gm.memory(); + + for region in mem.iter() { + assert_eq!(region.len(), region_size as GuestUsize); + } + + for region in mem.iter() { + iterated_regions.push((region.start_addr(), region.len() as usize)); + } + assert_eq!(regions, iterated_regions); + + assert!(regions + .iter() + .map(|x| (x.0, x.1)) + .eq(iterated_regions.iter().copied())); + + assert_eq!(gm.regions[0].guest_base, regions[0].0); + assert_eq!(gm.regions[1].guest_base, regions[1].0); + } + + #[test] + fn test_access_cross_boundary() { + let f1 = TempFile::new().unwrap().into_file(); + f1.set_len(0x1000).unwrap(); + let f2 = TempFile::new().unwrap().into_file(); + f2.set_len(0x1000).unwrap(); + + let start_addr1 = GuestAddress(0x0); + let start_addr2 = GuestAddress(0x1000); + let gm = + GuestMemoryMmap::from_ranges(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]).unwrap(); + let gm_backed_by_file = GuestMemoryMmap::from_ranges_with_files(&[ + (start_addr1, 0x1000, Some(FileOffset::new(f1, 0))), + (start_addr2, 
0x1000, Some(FileOffset::new(f2, 0))), + ]) + .unwrap(); + + let gm_list = vec![gm, gm_backed_by_file]; + for gm in gm_list.iter() { + let sample_buf = &[1, 2, 3, 4, 5]; + assert_eq!(gm.write(sample_buf, GuestAddress(0xffc)).unwrap(), 5); + let buf = &mut [0u8; 5]; + assert_eq!(gm.read(buf, GuestAddress(0xffc)).unwrap(), 5); + assert_eq!(buf, sample_buf); + } + } + + #[test] + fn test_retrieve_fd_backing_memory_region() { + let f = TempFile::new().unwrap().into_file(); + f.set_len(0x400).unwrap(); + + let start_addr = GuestAddress(0x0); + let gm = GuestMemoryMmap::from_ranges(&[(start_addr, 0x400)]).unwrap(); + assert!(gm.find_region(start_addr).is_some()); + let region = gm.find_region(start_addr).unwrap(); + assert!(region.file_offset().is_none()); + + let gm = GuestMemoryMmap::from_ranges_with_files(&[( + start_addr, + 0x400, + Some(FileOffset::new(f, 0)), + )]) + .unwrap(); + assert!(gm.find_region(start_addr).is_some()); + let region = gm.find_region(start_addr).unwrap(); + assert!(region.file_offset().is_some()); + } + + // Windows needs a dedicated test where it will retrieve the allocation + // granularity to determine a proper offset (other than 0) that can be + // used for the backing file. Refer to Microsoft docs here: + // https://docs.microsoft.com/en-us/windows/desktop/api/memoryapi/nf-memoryapi-mapviewoffile + #[test] + #[cfg(unix)] + fn test_retrieve_offset_from_fd_backing_memory_region() { + let f = TempFile::new().unwrap().into_file(); + f.set_len(0x1400).unwrap(); + // Needs to be aligned on 4k, otherwise mmap will fail. 
+ let offset = 0x1000; + + let start_addr = GuestAddress(0x0); + let gm = GuestMemoryMmap::from_ranges(&[(start_addr, 0x400)]).unwrap(); + assert!(gm.find_region(start_addr).is_some()); + let region = gm.find_region(start_addr).unwrap(); + assert!(region.file_offset().is_none()); + + let gm = GuestMemoryMmap::from_ranges_with_files(&[( + start_addr, + 0x400, + Some(FileOffset::new(f, offset)), + )]) + .unwrap(); + assert!(gm.find_region(start_addr).is_some()); + let region = gm.find_region(start_addr).unwrap(); + assert!(region.file_offset().is_some()); + assert_eq!(region.file_offset().unwrap().start(), offset); + } + */ + + #[test] + fn test_mmap_insert_region() { + let start_addr1 = GuestAddress(0); + let start_addr2 = GuestAddress(0x10_0000); + + let guest_mem = GuestMemoryHybrid::<()>::new(); + let mut raw_buf = [0u8; 0x1000]; + let raw_ptr = &mut raw_buf as *mut u8; + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr1, raw_ptr, 0x1000) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr2, raw_ptr, 0x1000) }; + let gm = &guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let mem_orig = gm.memory(); + assert_eq!(mem_orig.num_regions(), 2); + + let reg = unsafe { GuestRegionRaw::new(GuestAddress(0x8000), raw_ptr, 0x1000) }; + let mmap = Arc::new(GuestRegionHybrid::from_raw_region(reg)); + let gm = gm.insert_region(mmap).unwrap(); + let reg = unsafe { GuestRegionRaw::new(GuestAddress(0x4000), raw_ptr, 0x1000) }; + let mmap = Arc::new(GuestRegionHybrid::from_raw_region(reg)); + let gm = gm.insert_region(mmap).unwrap(); + let reg = unsafe { GuestRegionRaw::new(GuestAddress(0xc000), raw_ptr, 0x1000) }; + let mmap = Arc::new(GuestRegionHybrid::from_raw_region(reg)); + let gm = gm.insert_region(mmap).unwrap(); + let reg = unsafe { GuestRegionRaw::new(GuestAddress(0xc000), raw_ptr, 0x1000) }; + let mmap = 
Arc::new(GuestRegionHybrid::from_raw_region(reg)); + gm.insert_region(mmap).unwrap_err(); + + assert_eq!(mem_orig.num_regions(), 2); + assert_eq!(gm.num_regions(), 5); + + assert_eq!(gm.regions[0].start_addr(), GuestAddress(0x0000)); + assert_eq!(gm.regions[1].start_addr(), GuestAddress(0x4000)); + assert_eq!(gm.regions[2].start_addr(), GuestAddress(0x8000)); + assert_eq!(gm.regions[3].start_addr(), GuestAddress(0xc000)); + assert_eq!(gm.regions[4].start_addr(), GuestAddress(0x10_0000)); + } + + #[test] + fn test_mmap_remove_region() { + let start_addr1 = GuestAddress(0); + let start_addr2 = GuestAddress(0x10_0000); + + let guest_mem = GuestMemoryHybrid::<()>::new(); + let mut raw_buf = [0u8; 0x1000]; + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr1, &mut raw_buf as *mut _, 0x1000) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr2, &mut raw_buf as *mut _, 0x1000) }; + let gm = &guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let mem_orig = gm.memory(); + assert_eq!(mem_orig.num_regions(), 2); + + gm.remove_region(GuestAddress(0), 128).unwrap_err(); + gm.remove_region(GuestAddress(0x4000), 128).unwrap_err(); + let (gm, region) = gm.remove_region(GuestAddress(0x10_0000), 0x1000).unwrap(); + + assert_eq!(mem_orig.num_regions(), 2); + assert_eq!(gm.num_regions(), 1); + + assert_eq!(gm.regions[0].start_addr(), GuestAddress(0x0000)); + assert_eq!(region.start_addr(), GuestAddress(0x10_0000)); + } + + #[test] + fn test_guest_memory_mmap_get_slice() { + let start_addr1 = GuestAddress(0); + let mut raw_buf = [0u8; 0x400]; + let region = + unsafe { GuestRegionRaw::<()>::new(start_addr1, &mut raw_buf as *mut _, 0x400) }; + + // Normal case. 
+ let slice_addr = MemoryRegionAddress(0x100); + let slice_size = 0x200; + let slice = region.get_slice(slice_addr, slice_size).unwrap(); + assert_eq!(slice.len(), slice_size); + + // Empty slice. + let slice_addr = MemoryRegionAddress(0x200); + let slice_size = 0x0; + let slice = region.get_slice(slice_addr, slice_size).unwrap(); + assert!(slice.is_empty()); + + // Error case when slice_size is beyond the boundary. + let slice_addr = MemoryRegionAddress(0x300); + let slice_size = 0x200; + assert!(region.get_slice(slice_addr, slice_size).is_err()); + } + + #[test] + fn test_guest_memory_mmap_as_volatile_slice() { + let start_addr1 = GuestAddress(0); + let mut raw_buf = [0u8; 0x400]; + let region = + unsafe { GuestRegionRaw::<()>::new(start_addr1, &mut raw_buf as *mut _, 0x400) }; + let region_size = 0x400; + + // Test slice length. + let slice = region.as_volatile_slice().unwrap(); + assert_eq!(slice.len(), region_size); + + // Test slice data. + let v = 0x1234_5678u32; + let r = slice.get_ref::(0x200).unwrap(); + r.store(v); + assert_eq!(r.load(), v); + } + + #[test] + fn test_guest_memory_get_slice() { + let start_addr1 = GuestAddress(0); + let start_addr2 = GuestAddress(0x800); + + let guest_mem = GuestMemoryHybrid::<()>::new(); + let mut raw_buf = [0u8; 0x400]; + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr1, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr2, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + + // Normal cases. 
+ let slice_size = 0x200; + let slice = guest_mem + .get_slice(GuestAddress(0x100), slice_size) + .unwrap(); + assert_eq!(slice.len(), slice_size); + + let slice_size = 0x400; + let slice = guest_mem + .get_slice(GuestAddress(0x800), slice_size) + .unwrap(); + assert_eq!(slice.len(), slice_size); + + // Empty slice. + assert!(guest_mem + .get_slice(GuestAddress(0x900), 0) + .unwrap() + .is_empty()); + + // Error cases, wrong size or base address. + assert!(guest_mem.get_slice(GuestAddress(0), 0x500).is_err()); + assert!(guest_mem.get_slice(GuestAddress(0x600), 0x100).is_err()); + assert!(guest_mem.get_slice(GuestAddress(0xc00), 0x100).is_err()); + } + + #[test] + fn test_checked_offset() { + let start_addr1 = GuestAddress(0); + let start_addr2 = GuestAddress(0x800); + let start_addr3 = GuestAddress(0xc00); + + let guest_mem = GuestMemoryHybrid::<()>::new(); + let mut raw_buf = [0u8; 0x400]; + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr1, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr2, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr3, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + + assert_eq!( + guest_mem.checked_offset(start_addr1, 0x200), + Some(GuestAddress(0x200)) + ); + assert_eq!( + guest_mem.checked_offset(start_addr1, 0xa00), + Some(GuestAddress(0xa00)) + ); + assert_eq!( + guest_mem.checked_offset(start_addr2, 0x7ff), + Some(GuestAddress(0xfff)) + ); + assert_eq!(guest_mem.checked_offset(start_addr2, 0xc00), None); + assert_eq!(guest_mem.checked_offset(start_addr1, std::usize::MAX), None); + + assert_eq!(guest_mem.checked_offset(start_addr1, 
0x400), None); + assert_eq!( + guest_mem.checked_offset(start_addr1, 0x400 - 1), + Some(GuestAddress(0x400 - 1)) + ); + } + + #[test] + fn test_check_range() { + let start_addr1 = GuestAddress(0); + let start_addr2 = GuestAddress(0x800); + let start_addr3 = GuestAddress(0xc00); + + let guest_mem = GuestMemoryHybrid::<()>::new(); + let mut raw_buf = [0u8; 0x400]; + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr1, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr2, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + let reg = unsafe { GuestRegionRaw::<()>::new(start_addr3, &mut raw_buf as *mut _, 0x400) }; + let guest_mem = guest_mem + .insert_region(Arc::new(GuestRegionHybrid::from_raw_region(reg))) + .unwrap(); + + assert!(guest_mem.check_range(start_addr1, 0x0)); + assert!(guest_mem.check_range(start_addr1, 0x200)); + assert!(guest_mem.check_range(start_addr1, 0x400)); + assert!(!guest_mem.check_range(start_addr1, 0xa00)); + assert!(guest_mem.check_range(start_addr2, 0x7ff)); + assert!(guest_mem.check_range(start_addr2, 0x800)); + assert!(!guest_mem.check_range(start_addr2, 0x801)); + assert!(!guest_mem.check_range(start_addr2, 0xc00)); + assert!(!guest_mem.check_range(start_addr1, usize::MAX)); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/numa.rs b/src/dragonball/src/dbs_address_space/src/numa.rs new file mode 100644 index 000000000..71f2d748a --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/numa.rs @@ -0,0 +1,85 @@ +// Copyright (C) 2021 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Types for NUMA information. + +use vm_memory::{GuestAddress, GuestUsize}; + +/// Strategy of mbind() and don't lead to OOM. 
+pub const MPOL_PREFERRED: u32 = 1; + +/// Strategy of mbind() +pub const MPOL_MF_MOVE: u32 = 2; + +/// Type for recording numa ids of different devices +pub struct NumaIdTable { + /// vectors of numa id for each memory region + pub memory: Vec, + /// vectors of numa id for each cpu + pub cpu: Vec, +} + +/// Record numa node memory information. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub struct NumaNodeInfo { + /// Base address of the region in guest physical address space. + pub base: GuestAddress, + /// Size of the address region. + pub size: GuestUsize, +} + +/// Record all region's info of a numa node. +#[derive(Debug, Default, Clone, PartialEq, Eq)] +pub struct NumaNode { + region_infos: Vec, + vcpu_ids: Vec, +} + +impl NumaNode { + /// get reference of region_infos in numa node. + pub fn region_infos(&self) -> &Vec { + &self.region_infos + } + + /// get vcpu ids belonging to a numa node. + pub fn vcpu_ids(&self) -> &Vec { + &self.vcpu_ids + } + + /// add a new numa region info into this numa node. 
+ pub fn add_info(&mut self, info: &NumaNodeInfo) { + self.region_infos.push(*info); + } + + /// add a group of vcpu ids belong to this numa node + pub fn add_vcpu_ids(&mut self, vcpu_ids: &[u32]) { + self.vcpu_ids.extend(vcpu_ids) + } + + /// create a new numa node struct + pub fn new() -> NumaNode { + NumaNode { + region_infos: Vec::new(), + vcpu_ids: Vec::new(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_create_numa_node() { + let mut numa_node = NumaNode::new(); + let info = NumaNodeInfo { + base: GuestAddress(0), + size: 1024, + }; + numa_node.add_info(&info); + assert_eq!(*numa_node.region_infos(), vec![info]); + let vcpu_ids = vec![0, 1, 2, 3]; + numa_node.add_vcpu_ids(&vcpu_ids); + assert_eq!(*numa_node.vcpu_ids(), vcpu_ids); + } +} diff --git a/src/dragonball/src/dbs_address_space/src/region.rs b/src/dragonball/src/dbs_address_space/src/region.rs new file mode 100644 index 000000000..a0a832404 --- /dev/null +++ b/src/dragonball/src/dbs_address_space/src/region.rs @@ -0,0 +1,564 @@ +// Copyright (C) 2021 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::ffi::CString; +use std::fs::{File, OpenOptions}; +use std::os::unix::io::FromRawFd; +use std::path::Path; +use std::str::FromStr; + +use nix::sys::memfd; +use vm_memory::{Address, FileOffset, GuestAddress, GuestUsize}; + +use crate::memory::MemorySourceType; +use crate::memory::MemorySourceType::MemFdShared; +use crate::AddressSpaceError; + +/// Type of address space regions. +/// +/// On physical machines, physical memory may have different properties, such as +/// volatile vs non-volatile, read-only vs read-write, non-executable vs executable etc. +/// On virtual machines, the concept of memory property may be extended to support better +/// cooperation between the hypervisor and the guest kernel. 
Here address space region type means +/// what the region will be used for by the guest OS, and different permissions and policies may +/// be applied to different address space regions. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum AddressSpaceRegionType { + /// Normal memory accessible by CPUs and IO devices. + DefaultMemory, + /// MMIO address region for Devices. + DeviceMemory, + /// DAX address region for virtio-fs/virtio-pmem. + DAXMemory, +} + +/// Struct to maintain configuration information about a guest address region. +#[derive(Debug, Clone)] +pub struct AddressSpaceRegion { + /// Type of address space regions. + pub ty: AddressSpaceRegionType, + /// Base address of the region in virtual machine's physical address space. + pub base: GuestAddress, + /// Size of the address space region. + pub size: GuestUsize, + /// Host NUMA node ids assigned to this region. + pub host_numa_node_id: Option, + + /// File/offset tuple to back the memory allocation. + file_offset: Option, + /// Mmap permission flags. + perm_flags: i32, + /// Mmap protection flags. + prot_flags: i32, + /// Hugepage madvise hint. + /// + /// It needs 'advise' or 'always' policy in host shmem config. + is_hugepage: bool, + /// Hotplug hint. + is_hotplug: bool, + /// Anonymous memory hint. + /// + /// It should be true for regions with the MADV_DONTFORK flag enabled. + is_anon: bool, +} + +#[allow(clippy::too_many_arguments)] +impl AddressSpaceRegion { + /// Create an address space region with default configuration. + pub fn new(ty: AddressSpaceRegionType, base: GuestAddress, size: GuestUsize) -> Self { + AddressSpaceRegion { + ty, + base, + size, + host_numa_node_id: None, + file_offset: None, + perm_flags: libc::MAP_SHARED, + prot_flags: libc::PROT_READ | libc::PROT_WRITE, + is_hugepage: false, + is_hotplug: false, + is_anon: false, + } + } + + /// Create an address space region with all configurable information. 
+ /// + /// # Arguments + /// * `ty` - Type of the address region + /// * `base` - Base address in VM to map content + /// * `size` - Length of content to map + /// * `numa_node_id` - Optional NUMA node id to allocate memory from + /// * `file_offset` - Optional file descriptor and offset to map content from + /// * `perm_flags` - mmap permission flags + /// * `prot_flags` - mmap protection flags + /// * `is_hotplug` - Whether it's a region for hotplug. + pub fn build( + ty: AddressSpaceRegionType, + base: GuestAddress, + size: GuestUsize, + host_numa_node_id: Option, + file_offset: Option, + perm_flags: i32, + prot_flags: i32, + is_hotplug: bool, + ) -> Self { + let mut region = Self::new(ty, base, size); + + region.set_host_numa_node_id(host_numa_node_id); + region.set_file_offset(file_offset); + region.set_perm_flags(perm_flags); + region.set_prot_flags(prot_flags); + if is_hotplug { + region.set_hotplug(); + } + + region + } + + /// Create an address space region to map memory into the virtual machine. + /// + /// # Arguments + /// * `base` - Base address in VM to map content + /// * `size` - Length of content to map + /// * `numa_node_id` - Optional NUMA node id to allocate memory from + /// * `mem_type` - Memory mapping from, 'shmem' or 'hugetlbfs' + /// * `mem_file_path` - Memory file path + /// * `mem_prealloc` - Whether to enable pre-allocation of guest memory + /// * `is_hotplug` - Whether it's a region for hotplug. + pub fn create_default_memory_region( + base: GuestAddress, + size: GuestUsize, + numa_node_id: Option, + mem_type: &str, + mem_file_path: &str, + mem_prealloc: bool, + is_hotplug: bool, + ) -> Result { + Self::create_memory_region( + base, + size, + numa_node_id, + mem_type, + mem_file_path, + mem_prealloc, + libc::PROT_READ | libc::PROT_WRITE, + is_hotplug, + ) + } + + /// Create an address space region to map memory from memfd/hugetlbfs into the virtual machine. 
+ /// + /// # Arguments + /// * `base` - Base address in VM to map content + /// * `size` - Length of content to map + /// * `numa_node_id` - Optional NUMA node id to allocate memory from + /// * `mem_type` - Memory mapping from, 'shmem' or 'hugetlbfs' + /// * `mem_file_path` - Memory file path + /// * `mem_prealloc` - Whether to enable pre-allocation of guest memory + /// * `is_hotplug` - Whether it's a region for hotplug. + /// * `prot_flags` - mmap protection flags + pub fn create_memory_region( + base: GuestAddress, + size: GuestUsize, + numa_node_id: Option, + mem_type: &str, + mem_file_path: &str, + mem_prealloc: bool, + prot_flags: i32, + is_hotplug: bool, + ) -> Result { + let perm_flags = if mem_prealloc { + libc::MAP_SHARED | libc::MAP_POPULATE + } else { + libc::MAP_SHARED + }; + let source_type = MemorySourceType::from_str(mem_type) + .map_err(|_e| AddressSpaceError::InvalidMemorySourceType(mem_type.to_string()))?; + let mut reg = match source_type { + MemorySourceType::MemFdShared | MemorySourceType::MemFdOnHugeTlbFs => { + let fn_str = if source_type == MemFdShared { + CString::new("shmem").expect("CString::new('shmem') failed") + } else { + CString::new("hugeshmem").expect("CString::new('hugeshmem') failed") + }; + let filename = fn_str.as_c_str(); + let fd = memfd::memfd_create(filename, memfd::MemFdCreateFlag::empty()) + .map_err(AddressSpaceError::CreateMemFd)?; + // Safe because we have just created the fd. 
+ let file: File = unsafe { File::from_raw_fd(fd) }; + file.set_len(size).map_err(AddressSpaceError::SetFileSize)?; + Self::build( + AddressSpaceRegionType::DefaultMemory, + base, + size, + numa_node_id, + Some(FileOffset::new(file, 0)), + perm_flags, + prot_flags, + is_hotplug, + ) + } + MemorySourceType::MmapAnonymous | MemorySourceType::MmapAnonymousHugeTlbFs => { + let mut perm_flags = libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; + if mem_prealloc { + perm_flags |= libc::MAP_POPULATE + } + Self::build( + AddressSpaceRegionType::DefaultMemory, + base, + size, + numa_node_id, + None, + perm_flags, + prot_flags, + is_hotplug, + ) + } + MemorySourceType::FileOnHugeTlbFs => { + let path = Path::new(mem_file_path); + if let Some(parent_dir) = path.parent() { + // Ensure that the parent directory is existed for the mem file path. + std::fs::create_dir_all(parent_dir).map_err(AddressSpaceError::CreateDir)?; + } + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(mem_file_path) + .map_err(AddressSpaceError::OpenFile)?; + nix::unistd::unlink(mem_file_path).map_err(AddressSpaceError::UnlinkFile)?; + file.set_len(size).map_err(AddressSpaceError::SetFileSize)?; + let file_offset = FileOffset::new(file, 0); + Self::build( + AddressSpaceRegionType::DefaultMemory, + base, + size, + numa_node_id, + Some(file_offset), + perm_flags, + prot_flags, + is_hotplug, + ) + } + }; + + if source_type.is_hugepage() { + reg.set_hugepage(); + } + if source_type.is_mmap_anonymous() { + reg.set_anonpage(); + } + + Ok(reg) + } + + /// Create an address region for device MMIO. + /// + /// # Arguments + /// * `base` - Base address in VM to map content + /// * `size` - Length of content to map + pub fn create_device_region( + base: GuestAddress, + size: GuestUsize, + ) -> Result { + Ok(Self::build( + AddressSpaceRegionType::DeviceMemory, + base, + size, + None, + None, + 0, + 0, + false, + )) + } + + /// Get type of the address space region. 
+ pub fn region_type(&self) -> AddressSpaceRegionType { + self.ty + } + + /// Get size of region. + pub fn len(&self) -> GuestUsize { + self.size + } + + /// Get the inclusive start physical address of the region. + pub fn start_addr(&self) -> GuestAddress { + self.base + } + + /// Get the inclusive end physical address of the region. + pub fn last_addr(&self) -> GuestAddress { + debug_assert!(self.size > 0 && self.base.checked_add(self.size).is_some()); + GuestAddress(self.base.raw_value() + self.size - 1) + } + + /// Get mmap permission flags of the address space region. + pub fn perm_flags(&self) -> i32 { + self.perm_flags + } + + /// Set mmap permission flags for the address space region. + pub fn set_perm_flags(&mut self, perm_flags: i32) { + self.perm_flags = perm_flags; + } + + /// Get mmap protection flags of the address space region. + pub fn prot_flags(&self) -> i32 { + self.prot_flags + } + + /// Set mmap protection flags for the address space region. + pub fn set_prot_flags(&mut self, prot_flags: i32) { + self.prot_flags = prot_flags; + } + + /// Get host_numa_node_id flags + pub fn host_numa_node_id(&self) -> Option { + self.host_numa_node_id + } + + /// Set associated NUMA node ID to allocate memory from for this region. + pub fn set_host_numa_node_id(&mut self, host_numa_node_id: Option) { + self.host_numa_node_id = host_numa_node_id; + } + + /// Check whether the address space region is backed by a memory file. + pub fn has_file(&self) -> bool { + self.file_offset.is_some() + } + + /// Get optional file associated with the region. + pub fn file_offset(&self) -> Option<&FileOffset> { + self.file_offset.as_ref() + } + + /// Set associated file/offset pair for the region. + pub fn set_file_offset(&mut self, file_offset: Option) { + self.file_offset = file_offset; + } + + /// Set the hotplug hint. + pub fn set_hotplug(&mut self) { + self.is_hotplug = true + } + + /// Get the hotplug hint. 
+ pub fn is_hotplug(&self) -> bool { + self.is_hotplug + } + + /// Set hugepage hint for `madvise()`, only takes effect when the memory type is `shmem`. + pub fn set_hugepage(&mut self) { + self.is_hugepage = true + } + + /// Get the hugepage hint. + pub fn is_hugepage(&self) -> bool { + self.is_hugepage + } + + /// Set the anonymous memory hint. + pub fn set_anonpage(&mut self) { + self.is_anon = true + } + + /// Get the anonymous memory hint. + pub fn is_anonpage(&self) -> bool { + self.is_anon + } + + /// Check whether the address space region is valid. + pub fn is_valid(&self) -> bool { + self.size > 0 && self.base.checked_add(self.size).is_some() + } + + /// Check whether the address space region intersects with another one. + pub fn intersect_with(&self, other: &AddressSpaceRegion) -> bool { + // Treat invalid address region as intersecting always + let end1 = match self.base.checked_add(self.size) { + Some(addr) => addr, + None => return true, + }; + let end2 = match other.base.checked_add(other.size) { + Some(addr) => addr, + None => return true, + }; + + !(end1 <= other.base || self.base >= end2) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + use vmm_sys_util::tempfile::TempFile; + + #[test] + fn test_address_space_region_valid() { + let reg1 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0xFFFFFFFFFFFFF000), + 0x2000, + ); + assert!(!reg1.is_valid()); + let reg1 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0xFFFFFFFFFFFFF000), + 0x1000, + ); + assert!(!reg1.is_valid()); + let reg1 = AddressSpaceRegion::new( + AddressSpaceRegionType::DeviceMemory, + GuestAddress(0xFFFFFFFFFFFFE000), + 0x1000, + ); + assert!(reg1.is_valid()); + assert_eq!(reg1.start_addr(), GuestAddress(0xFFFFFFFFFFFFE000)); + assert_eq!(reg1.len(), 0x1000); + assert!(!reg1.has_file()); + assert!(reg1.file_offset().is_none()); + assert_eq!(reg1.perm_flags(), libc::MAP_SHARED); + 
assert_eq!(reg1.prot_flags(), libc::PROT_READ | libc::PROT_WRITE); + assert_eq!(reg1.region_type(), AddressSpaceRegionType::DeviceMemory); + + let tmp_file = TempFile::new().unwrap(); + let mut f = tmp_file.into_file(); + let sample_buf = &[1, 2, 3, 4, 5]; + assert!(f.write_all(sample_buf).is_ok()); + let reg2 = AddressSpaceRegion::build( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x1000), + 0x1000, + None, + Some(FileOffset::new(f, 0x0)), + 0x5a, + 0x5a, + false, + ); + assert_eq!(reg2.region_type(), AddressSpaceRegionType::DefaultMemory); + assert!(reg2.is_valid()); + assert_eq!(reg2.start_addr(), GuestAddress(0x1000)); + assert_eq!(reg2.len(), 0x1000); + assert!(reg2.has_file()); + assert!(reg2.file_offset().is_some()); + assert_eq!(reg2.perm_flags(), 0x5a); + assert_eq!(reg2.prot_flags(), 0x5a); + } + + #[test] + fn test_address_space_region_intersect() { + let reg1 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x1000), + 0x1000, + ); + let reg2 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x2000), + 0x1000, + ); + let reg3 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x1000), + 0x1001, + ); + let reg4 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0x1100), + 0x100, + ); + let reg5 = AddressSpaceRegion::new( + AddressSpaceRegionType::DefaultMemory, + GuestAddress(0xFFFFFFFFFFFFF000), + 0x2000, + ); + + assert!(!reg1.intersect_with(®2)); + assert!(!reg2.intersect_with(®1)); + + // intersect with self + assert!(reg1.intersect_with(®1)); + + // intersect with others + assert!(reg3.intersect_with(®2)); + assert!(reg2.intersect_with(®3)); + assert!(reg1.intersect_with(®4)); + assert!(reg4.intersect_with(®1)); + assert!(reg1.intersect_with(®5)); + assert!(reg5.intersect_with(®1)); + } + + #[test] + fn test_create_device_region() { + let reg = AddressSpaceRegion::create_device_region(GuestAddress(0x10000), 
0x1000).unwrap(); + assert_eq!(reg.region_type(), AddressSpaceRegionType::DeviceMemory); + assert_eq!(reg.start_addr(), GuestAddress(0x10000)); + assert_eq!(reg.len(), 0x1000); + } + + #[test] + fn test_create_default_memory_region() { + AddressSpaceRegion::create_default_memory_region( + GuestAddress(0x100000), + 0x100000, + None, + "invalid", + "invalid", + false, + false, + ) + .unwrap_err(); + + let reg = AddressSpaceRegion::create_default_memory_region( + GuestAddress(0x100000), + 0x100000, + None, + "shmem", + "", + false, + false, + ) + .unwrap(); + assert_eq!(reg.region_type(), AddressSpaceRegionType::DefaultMemory); + assert_eq!(reg.start_addr(), GuestAddress(0x100000)); + assert_eq!(reg.last_addr(), GuestAddress(0x1fffff)); + assert_eq!(reg.len(), 0x100000); + assert!(reg.file_offset().is_some()); + + let reg = AddressSpaceRegion::create_default_memory_region( + GuestAddress(0x100000), + 0x100000, + None, + "hugeshmem", + "", + true, + false, + ) + .unwrap(); + assert_eq!(reg.region_type(), AddressSpaceRegionType::DefaultMemory); + assert_eq!(reg.start_addr(), GuestAddress(0x100000)); + assert_eq!(reg.last_addr(), GuestAddress(0x1fffff)); + assert_eq!(reg.len(), 0x100000); + assert!(reg.file_offset().is_some()); + + let reg = AddressSpaceRegion::create_default_memory_region( + GuestAddress(0x100000), + 0x100000, + None, + "mmap", + "", + true, + false, + ) + .unwrap(); + assert_eq!(reg.region_type(), AddressSpaceRegionType::DefaultMemory); + assert_eq!(reg.start_addr(), GuestAddress(0x100000)); + assert_eq!(reg.last_addr(), GuestAddress(0x1fffff)); + assert_eq!(reg.len(), 0x100000); + assert!(reg.file_offset().is_none()); + + // TODO: test hugetlbfs + } +} diff --git a/src/dragonball/src/dbs_allocator/Cargo.toml b/src/dragonball/src/dbs_allocator/Cargo.toml new file mode 100644 index 000000000..c3c0f3c10 --- /dev/null +++ b/src/dragonball/src/dbs_allocator/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "dbs-allocator" +version = "0.1.1" +authors = ["Liu 
Jiang "] +description = "a resource allocator for virtual machine manager" +license = "Apache-2.0" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball"] +readme = "README.md" + +[dependencies] +thiserror = "1.0" diff --git a/src/dragonball/src/dbs_allocator/LICENSE b/src/dragonball/src/dbs_allocator/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_allocator/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_allocator/README.md b/src/dragonball/src/dbs_allocator/README.md new file mode 100644 index 000000000..2e4b07a8c --- /dev/null +++ b/src/dragonball/src/dbs_allocator/README.md @@ -0,0 +1,106 @@ +# dbs-allocator + +## Design + +The resource manager in the `Dragonball Sandbox` needs to manage and allocate different kinds of resource for the +sandbox (virtual machine), such as memory-mapped I/O address space, port I/O address space, legacy IRQ numbers, +MSI/MSI-X vectors, device instance id, etc. The `dbs-allocator` crate is designed to help the resource manager +to track and allocate these types of resources. + +Main components are: +- *Constraints*: struct to declare constraints for resource allocation. +```rust +#[derive(Copy, Clone, Debug)] +pub struct Constraint { + /// Size of resource to allocate. + pub size: u64, + /// Lower boundary for resource allocation. + pub min: u64, + /// Upper boundary for resource allocation. + pub max: u64, + /// Alignment for allocated resource. + pub align: u64, + /// Policy for resource allocation. + pub policy: AllocPolicy, +} +``` +- `IntervalTree`: An interval tree implementation specialized for VMM resource management. 
+```rust +pub struct IntervalTree { + pub(crate) root: Option>, +} +​ +pub fn allocate(&mut self, constraint: &Constraint) -> Option +pub fn free(&mut self, key: &Range) -> Option +pub fn insert(&mut self, key: Range, data: Option) -> Self +pub fn update(&mut self, key: &Range, data: T) -> Option +pub fn delete(&mut self, key: &Range) -> Option +pub fn get(&self, key: &Range) -> Option> +``` + +## Usage +The concept of Interval Tree may seem complicated, but using dbs-allocator to do resource allocation and release is simple and straightforward. +You can following these steps to allocate your VMM resource. +```rust +// 1. To start with, we should create an interval tree for some specific resouces and give maximum address/id range as root node. The range here could be address range, id range, etc. +​ +let mut resources_pool = IntervalTree::new(); +resources_pool.insert(Range::new(MIN_RANGE, MAX_RANGE), None); +​ +// 2. Next, create a constraint with the size for your resource, you could also assign the maximum, minimum and alignment for the constraint. Then we could use the constraint to allocate the resource in the range we previously decided. Interval Tree will give you the appropriate range. +let mut constraint = Constraint::new(SIZE); +let mut resources_range = self.resources_pool.allocate(&constraint); +​ +// 3. Then we could use the resource range to let other crates like vm-pci / vm-device to create and maintain the device +let mut device = Device::create(resources_range, ..) 
+``` + +## Example +We will show examples for allocating an unused PCI device ID from the PCI device ID pool and allocating memory address using dbs-allocator +```rust +use dbs_allocator::{Constraint, IntervalTree, Range}; +​ +// Init a dbs-allocator IntervalTree +let mut pci_device_pool = IntervalTree::new(); +​ +// Init PCI device id pool with the range 0 to 255 +pci_device_pool.insert(Range::new(0x0u8, 0xffu8), None); +​ +// Construct a constraint with size 1 and alignment 1 to ask for an ID. +let mut constraint = Constraint::new(1u64).align(1u64); +​ +// Get an ID from the pci_device_pool +let mut id = pci_device_pool.allocate(&constraint).map(|e| e.min as u8); +​ +// Pass the ID generated from dbs-allocator to vm-pci specified functions to create pci devices +let mut pci_device = PciDevice::new(id as u8, ..); + +``` + +```rust +use dbs_allocator::{Constraint, IntervalTree, Range}; +​ +// Init a dbs-allocator IntervalTree +let mut mem_pool = IntervalTree::new(); +​ +// Init memory address from GUEST_MEM_START to GUEST_MEM_END +mem_pool.insert(Range::new(GUEST_MEM_START, GUEST_MEM_END), None); +​ +// Construct a constraint with size, maximum addr and minimum address of memory region to ask for an memory allocation range. +let constraint = Constraint::new(region.len()) + .min(region.start_addr().raw_value()) + .max(region.last_addr().raw_value()); +​ +// Get the memory allocation range from the pci_device_pool +let mem_range = mem_pool.allocate(&constraint).unwrap(); +​ +// Update the mem_range in IntervalTree with memory region info +mem_pool.update(&mem_range, region); +​ +// After allocation, we can use the memory range to do mapping and other memory related work. +... +``` + +## License + +This project is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0. 
\ No newline at end of file diff --git a/src/dragonball/src/dbs_allocator/src/interval_tree.rs b/src/dragonball/src/dbs_allocator/src/interval_tree.rs new file mode 100644 index 000000000..c2a13c5c8 --- /dev/null +++ b/src/dragonball/src/dbs_allocator/src/interval_tree.rs @@ -0,0 +1,1297 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! An interval tree implementation specialized for VMM resource management. +//! +//! It's not designed as a generic interval tree, but specialized for VMM resource management. +//! In addition to the normal get()/insert()/delete()/update() tree operations, it also implements +//! allocate()/free() for resource allocation. +//! +//! # Examples +//! ```rust +//! extern crate dbs_allocator; +//! use dbs_allocator::{Constraint, IntervalTree, NodeState, Range}; +//! +//! // Create an interval tree and add available resources. +//! let mut tree = IntervalTree::::new(); +//! tree.insert(Range::new(0x100u32, 0x100u32), None); +//! tree.insert(Range::new(0x200u16, 0x2ffu16), None); +//! +//! // Allocate a range with constraints. +//! let mut constraint = Constraint::new(8u64); +//! constraint.min = 0x211; +//! constraint.max = 0x21f; +//! constraint.align = 0x8; +//! +//! let key = tree.allocate(&constraint); +//! assert_eq!(key, Some(Range::new(0x218u64, 0x21fu64))); +//! let val = tree.get(&Range::new(0x218u64, 0x21fu64)); +//! assert_eq!(val, Some(NodeState::Allocated)); +//! +//! // Associate data with the allocated range and mark the range as occupied. +//! // Note: caller needs to protect from concurrent access between allocate() and the first call +//! // to update() to mark range as occupied. +//! let old = tree.update(&Range::new(0x218u32, 0x21fu32), 2); +//! assert_eq!(old, None); +//! let old = tree.update(&Range::new(0x218u32, 0x21fu32), 3); +//! assert_eq!(old, Some(2)); +//! let val = tree.get(&Range::new(0x218u32, 0x21fu32)); +//! 
assert_eq!(val, Some(NodeState::Valued(&3))); +//! +//! // Free allocated resource. +//! let old = tree.free(key.as_ref().unwrap()); +//! assert_eq!(old, Some(3)); +//! ``` + +use std::cmp::{max, min, Ordering}; + +use crate::{AllocPolicy, Constraint}; + +/// Represent a closed range `[min, max]`. +#[allow(missing_docs)] +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Range { + pub min: u64, + pub max: u64, +} + +impl std::fmt::Debug for Range { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "[ {:016x}, {:016x} ]", self.min, self.max) + } +} + +impl Range { + /// Create a instance of [`Range`] with given `min` and `max`. + /// + /// ## Panic + /// - if min is bigger than max + /// - if min == 0 && max == u64:MAX + pub fn new(min: T, max: T) -> Self + where + u64: From, + { + let umin = u64::from(min); + let umax = u64::from(max); + if umin > umax || (umin == 0 && umax == u64::MAX) { + panic!("interval_tree: Range({}, {}) is invalid", umin, umax); + } + Range { + min: umin, + max: umax, + } + } + + /// Create a instance of [`Range`] with given base and size. + /// + /// ## Panic + /// - if base + size wraps around + /// - if base == 0 && size == u64::MAX + pub fn with_size(base: T, size: T) -> Self + where + u64: From, + { + let umin = u64::from(base); + let umax = u64::from(size).checked_add(umin).unwrap(); + if umin > umax || (umin == 0 && umax == std::u64::MAX) { + panic!("interval_tree: Range({}, {}) is invalid", umin, umax); + } + Range { + min: umin, + max: umax, + } + } + + /// Create a instance of [`Range`] containing only the point `value`. + pub fn new_point(value: T) -> Self + where + u64: From, + { + let val = u64::from(value); + Range { min: val, max: val } + } + + /// Get size of the range. + pub fn len(&self) -> u64 { + self.max - self.min + 1 + } + + /// Check whether the range is empty. + pub fn is_empty(&self) -> bool { + false + } + + /// Check whether two Range objects intersect with each other. 
+ pub fn intersect(&self, other: &Range) -> bool { + max(self.min, other.min) <= min(self.max, other.max) + } + + /// Check whether another [Range] object is fully covered by this range. + pub fn contain(&self, other: &Range) -> bool { + self.min <= other.min && self.max >= other.max + } + + /// Create a new instance of [Range] with `min` aligned to `align`. + /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::Range; + /// + /// let a = Range::new(2u32, 6u32); + /// assert_eq!(a.align_to(0), Some(Range::new(2u32, 6u32))); + /// assert_eq!(a.align_to(1), Some(Range::new(2u16, 6u16))); + /// assert_eq!(a.align_to(2), Some(Range::new(2u64, 6u64))); + /// assert_eq!(a.align_to(4), Some(Range::new(4u8, 6u8))); + /// assert_eq!(a.align_to(8), None); + /// assert_eq!(a.align_to(3), None); + /// let b = Range::new(2u8, 2u8); + /// assert_eq!(b.align_to(2), Some(Range::new(2u8, 2u8))); + /// ``` + pub fn align_to(&self, align: u64) -> Option { + match align { + 0 | 1 => Some(*self), + _ => { + if align & (align - 1) != 0 { + return None; + } + if let Some(min) = self.min.checked_add(align - 1).map(|v| v & !(align - 1)) { + if min <= self.max { + return Some(Range::new(min, self.max)); + } + } + None + } + } + } +} + +impl PartialOrd for Range { + fn partial_cmp(&self, other: &Self) -> Option { + match self.min.cmp(&other.min) { + Ordering::Equal => Some(self.max.cmp(&other.max)), + res => Some(res), + } + } +} + +impl Ord for Range { + fn cmp(&self, other: &Self) -> Ordering { + match self.min.cmp(&other.min) { + Ordering::Equal => self.max.cmp(&other.max), + res => res, + } + } +} + +/// State of interval tree node. 
+/// +/// Valid state transitions: +/// - None -> Free: [IntervalTree::insert()] +/// - None -> Valued: [IntervalTree::insert()] +/// - Free -> Allocated: [IntervalTree::allocate()] +/// - Allocated -> Valued(T): [IntervalTree::update()] +/// - Valued -> Valued(T): [IntervalTree::update()] +/// - Allocated -> Free: [IntervalTree::free()] +/// - Valued(T) -> Free: [IntervalTree::free()] +/// - * -> None: [IntervalTree::delete()] +#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Ord)] +pub enum NodeState { + /// Node is free + Free, + /// Node is allocated but without associated data + Allocated, + /// Node is allocated with associated data. + Valued(T), +} + +impl NodeState { + fn take(&mut self) -> Self { + std::mem::replace(self, NodeState::::Free) + } + + fn replace(&mut self, value: NodeState) -> Self { + std::mem::replace(self, value) + } + + fn as_ref(&self) -> NodeState<&T> { + match self { + NodeState::::Valued(ref x) => NodeState::<&T>::Valued(x), + NodeState::::Allocated => NodeState::<&T>::Allocated, + NodeState::::Free => NodeState::<&T>::Free, + } + } + + fn as_mut(&mut self) -> NodeState<&mut T> { + match self { + NodeState::::Valued(ref mut x) => NodeState::<&mut T>::Valued(x), + NodeState::::Allocated => NodeState::<&mut T>::Allocated, + NodeState::::Free => NodeState::<&mut T>::Free, + } + } + + fn is_free(&self) -> bool { + matches!(self, NodeState::::Free) + } +} + +impl From> for Option { + fn from(n: NodeState) -> Option { + match n { + NodeState::::Free | NodeState::::Allocated => None, + NodeState::::Valued(data) => Some(data), + } + } +} + +/// Internal tree node to implement interval tree. +#[derive(Debug, PartialEq, Eq)] +struct InnerNode { + /// Interval handled by this node. + key: Range, + /// Optional contained data, None if the node is free. + data: NodeState, + /// Optional left child of current node. + left: Option>, + /// Optional right child of current node. + right: Option>, + /// Cached height of the node. 
+ height: u32, + /// Cached maximum valued covered by this node. + max_key: u64, +} + +impl InnerNode { + fn new(key: Range, data: NodeState) -> Self { + InnerNode { + key, + data, + left: None, + right: None, + height: 1, + max_key: key.max, + } + } +} + +/// Newtype for interval tree nodes. +#[derive(Debug, PartialEq, Eq)] +struct Node(Box>); + +impl Node { + fn new(key: Range, data: Option) -> Self { + let value = if let Some(t) = data { + NodeState::Valued(t) + } else { + NodeState::Free + }; + Node(Box::new(InnerNode::new(key, value))) + } + + /// Returns a readonly reference to the node associated with the `key` or None if not found. + fn search(&self, key: &Range) -> Option<&Self> { + match self.0.key.cmp(key) { + Ordering::Equal => Some(self), + Ordering::Less => self.0.right.as_ref().and_then(|node| node.search(key)), + Ordering::Greater => self.0.left.as_ref().and_then(|node| node.search(key)), + } + } + + /// Returns a shared reference to the node covers full range of the `key`. + fn search_superset(&self, key: &Range) -> Option<&Self> { + if self.0.key.contain(key) { + Some(self) + } else if key.max < self.0.key.min && self.0.left.is_some() { + // Safe to unwrap() because we have just checked it. + self.0.left.as_ref().unwrap().search_superset(key) + } else if key.min > self.0.key.max && self.0.right.is_some() { + // Safe to unwrap() because we have just checked it. + self.0.right.as_ref().unwrap().search_superset(key) + } else { + None + } + } + + /// Returns a mutable reference to the node covers full range of the `key`. + fn search_superset_mut(&mut self, key: &Range) -> Option<&mut Self> { + if self.0.key.contain(key) { + Some(self) + } else if key.max < self.0.key.min && self.0.left.is_some() { + // Safe to unwrap() because we have just checked it. + self.0.left.as_mut().unwrap().search_superset_mut(key) + } else if key.min > self.0.key.max && self.0.right.is_some() { + // Safe to unwrap() because we have just checked it. 
+ self.0.right.as_mut().unwrap().search_superset_mut(key) + } else { + None + } + } + + /// Insert a new (key, data) pair into the subtree. + /// + /// Note: it will panic if the new key intersects with existing nodes. + fn insert(mut self, key: Range, data: Option) -> Self { + match self.0.key.cmp(&key) { + Ordering::Equal => { + panic!("interval_tree: key {:?} exists", key); + } + Ordering::Less => { + if self.0.key.intersect(&key) { + panic!( + "interval_tree: key {:?} intersects with existing {:?}", + key, self.0.key + ); + } + match self.0.right { + None => self.0.right = Some(Node::new(key, data)), + Some(_) => self.0.right = self.0.right.take().map(|n| n.insert(key, data)), + } + } + Ordering::Greater => { + if self.0.key.intersect(&key) { + panic!( + "interval_tree: key {:?} intersects with existing {:?}", + key, self.0.key + ); + } + match self.0.left { + None => self.0.left = Some(Node::new(key, data)), + Some(_) => self.0.left = self.0.left.take().map(|n| n.insert(key, data)), + } + } + } + self.updated_node() + } + + /// Update an existing entry and return the old value. + fn update(&mut self, key: &Range, data: NodeState) -> Option { + match self.0.key.cmp(key) { + Ordering::Equal => { + match (self.0.data.as_ref(), data.as_ref()) { + (NodeState::<&T>::Free, NodeState::<&T>::Free) + | (NodeState::<&T>::Free, NodeState::<&T>::Valued(_)) + | (NodeState::<&T>::Allocated, NodeState::<&T>::Free) + | (NodeState::<&T>::Allocated, NodeState::<&T>::Allocated) + | (NodeState::<&T>::Valued(_), NodeState::<&T>::Free) + | (NodeState::<&T>::Valued(_), NodeState::<&T>::Allocated) => { + panic!("try to update unallocated interval tree node"); + } + _ => {} + } + self.0.data.replace(data).into() + } + Ordering::Less => match self.0.right.as_mut() { + None => None, + Some(node) => node.update(key, data), + }, + Ordering::Greater => match self.0.left.as_mut() { + None => None, + Some(node) => node.update(key, data), + }, + } + } + + /// Delete `key` from the subtree. 
+ /// + /// Note: it doesn't return whether the key exists in the subtree, so caller need to ensure the + /// logic. + fn delete(mut self, key: &Range) -> (Option, Option) { + match self.0.key.cmp(key) { + Ordering::Equal => { + let data = self.0.data.take(); + return (data.into(), self.delete_root()); + } + Ordering::Less => { + if let Some(node) = self.0.right.take() { + let (data, right) = node.delete(key); + self.0.right = right; + return (data, Some(self.updated_node())); + } + } + Ordering::Greater => { + if let Some(node) = self.0.left.take() { + let (data, left) = node.delete(key); + self.0.left = left; + return (data, Some(self.updated_node())); + } + } + } + (None, Some(self)) + } + + /// Rotate the node if necessary to keep balance. + fn rotate(self) -> Self { + let l = height(&self.0.left); + let r = height(&self.0.right); + match (l as i32) - (r as i32) { + 1 | 0 | -1 => self, + 2 => self.rotate_left_successor(), + -2 => self.rotate_right_successor(), + _ => unreachable!(), + } + } + + /// Perform a single left rotation on this node. + fn rotate_left(mut self) -> Self { + let mut new_root = self.0.right.take().expect("Node is broken"); + self.0.right = new_root.0.left.take(); + self.update_cached_info(); + new_root.0.left = Some(self); + new_root.update_cached_info(); + new_root + } + + /// Perform a single right rotation on this node. + fn rotate_right(mut self) -> Self { + let mut new_root = self.0.left.take().expect("Node is broken"); + self.0.left = new_root.0.right.take(); + self.update_cached_info(); + new_root.0.right = Some(self); + new_root.update_cached_info(); + new_root + } + + /// Performs a rotation when the left successor is too high. 
+ fn rotate_left_successor(mut self) -> Self { + let left = self.0.left.take().expect("Node is broken"); + if height(&left.0.left) < height(&left.0.right) { + let rotated = left.rotate_left(); + self.0.left = Some(rotated); + self.update_cached_info(); + } else { + self.0.left = Some(left); + } + self.rotate_right() + } + + /// Performs a rotation when the right successor is too high. + fn rotate_right_successor(mut self) -> Self { + let right = self.0.right.take().expect("Node is broken"); + if height(&right.0.left) > height(&right.0.right) { + let rotated = right.rotate_right(); + self.0.right = Some(rotated); + self.update_cached_info(); + } else { + self.0.right = Some(right); + } + self.rotate_left() + } + + fn delete_root(mut self) -> Option { + match (self.0.left.take(), self.0.right.take()) { + (None, None) => None, + (Some(l), None) => Some(l), + (None, Some(r)) => Some(r), + (Some(l), Some(r)) => Some(Self::combine_subtrees(l, r)), + } + } + + /// Find the minimal key below the tree and returns a new optional tree where the minimal + /// value has been removed and the (optional) minimal node as tuple (min_node, remaining) + fn get_new_root(mut self) -> (Self, Option) { + match self.0.left.take() { + None => { + let remaining = self.0.right.take(); + (self, remaining) + } + Some(left) => { + let (min_node, left) = left.get_new_root(); + self.0.left = left; + (min_node, Some(self.updated_node())) + } + } + } + + fn combine_subtrees(l: Self, r: Self) -> Self { + let (mut new_root, remaining) = r.get_new_root(); + new_root.0.left = Some(l); + new_root.0.right = remaining; + new_root.updated_node() + } + + fn find_candidate(&self, constraint: &Constraint) -> Option<&Self> { + match constraint.policy { + AllocPolicy::FirstMatch => self.first_match(constraint), + AllocPolicy::Default => self.first_match(constraint), + } + } + + fn first_match(&self, constraint: &Constraint) -> Option<&Self> { + let mut candidate = if self.0.left.is_some() { + 
self.0.left.as_ref().unwrap().first_match(constraint) + } else { + None + }; + + if candidate.is_none() && self.check_constraint(constraint) { + candidate = Some(self); + } + if candidate.is_none() && self.0.right.is_some() { + candidate = self.0.right.as_ref().unwrap().first_match(constraint); + } + candidate + } + + fn check_constraint(&self, constraint: &Constraint) -> bool { + if self.0.data.is_free() { + let min = std::cmp::max(self.0.key.min, constraint.min); + let max = std::cmp::min(self.0.key.max, constraint.max); + if min <= max { + let key = Range::new(min, max); + if constraint.align == 0 || constraint.align == 1 { + return key.len() >= constraint.size; + } + return match key.align_to(constraint.align) { + None => false, + Some(aligned_key) => aligned_key.len() >= constraint.size, + }; + } + } + false + } + + /// Update cached information of the node. + /// Please make sure that the cached values of both children are up to date. + fn update_cached_info(&mut self) { + self.0.height = max(height(&self.0.left), height(&self.0.right)) + 1; + self.0.max_key = max( + max_key(&self.0.left), + max(max_key(&self.0.right), self.0.key.max), + ); + } + + /// Update the sub-tree to keep balance. + fn updated_node(mut self) -> Self { + self.update_cached_info(); + self.rotate() + } +} + +/// Compute height of the optional sub-tree. +fn height(node: &Option>) -> u32 { + node.as_ref().map_or(0, |n| n.0.height) +} + +/// Compute maximum key value covered by the optional sub-tree. +fn max_key(node: &Option>) -> u64 { + node.as_ref().map_or(0, |n| n.0.max_key) +} + +/// An interval tree implementation specialized for VMM resource management. +#[derive(Debug, Default, PartialEq, Eq)] +pub struct IntervalTree { + root: Option>, +} + +impl IntervalTree { + /// Construct a default empty [IntervalTree] object. 
+ /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// + /// let tree = dbs_allocator::IntervalTree::::new(); + /// ``` + pub fn new() -> Self { + IntervalTree { root: None } + } + + /// Check whether the interval tree is empty. + pub fn is_empty(&self) -> bool { + self.root.is_none() + } + + /// Get the data item associated with the key, or return None if no match found. + /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, NodeState, Range}; + /// + /// let mut tree = dbs_allocator::IntervalTree::::new(); + /// assert!(tree.is_empty()); + /// assert_eq!(tree.get(&Range::new(0x101u64, 0x101u64)), None); + /// tree.insert(Range::new(0x100u64, 0x100u64), Some(1)); + /// tree.insert(Range::new(0x200u64, 0x2ffu64), None); + /// assert!(!tree.is_empty()); + /// assert_eq!( + /// tree.get(&Range::new(0x100u64, 0x100u64)), + /// Some(NodeState::Valued(&1)) + /// ); + /// assert_eq!( + /// tree.get(&Range::new(0x200u64, 0x2ffu64)), + /// Some(NodeState::Free) + /// ); + /// assert_eq!(tree.get(&Range::new(0x101u64, 0x101u64)), None); + /// assert_eq!(tree.get(&Range::new(0x100u64, 0x101u64)), None); + /// ``` + pub fn get(&self, key: &Range) -> Option> { + match self.root { + None => None, + Some(ref node) => node.search(key).map(|n| n.0.data.as_ref()), + } + } + + /// Get a shared reference to the node fully covering the entire key range. 
+ /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, NodeState, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u32, 0x100u32), Some(1)); + /// tree.insert(Range::new(0x200u32, 0x2ffu32), None); + /// assert_eq!( + /// tree.get_superset(&Range::new(0x100u32, 0x100u32)), + /// Some((&Range::new(0x100u32, 0x100u32), NodeState::Valued(&1))) + /// ); + /// assert_eq!( + /// tree.get_superset(&Range::new(0x210u32, 0x210u32)), + /// Some((&Range::new(0x200u32, 0x2ffu32), NodeState::Free)) + /// ); + /// assert_eq!( + /// tree.get_superset(&Range::new(0x2ffu32, 0x2ffu32)), + /// Some((&Range::new(0x200u32, 0x2ffu32), NodeState::Free)) + /// ); + /// ``` + pub fn get_superset(&self, key: &Range) -> Option<(&Range, NodeState<&T>)> { + match self.root { + None => None, + Some(ref node) => node + .search_superset(key) + .map(|n| (&n.0.key, n.0.data.as_ref())), + } + } + + /// Get a mutable reference to the node fully covering the entire key range. 
+ /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, NodeState, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u32, 0x100u32), Some(1)); + /// tree.insert(Range::new(0x200u32, 0x2ffu32), None); + /// assert_eq!( + /// tree.get_superset_mut(&Range::new(0x100u32, 0x100u32)), + /// Some((&Range::new(0x100u32, 0x100u32), NodeState::Valued(&mut 1))) + /// ); + /// assert_eq!( + /// tree.get_superset_mut(&Range::new(0x210u32, 0x210u32)), + /// Some((&Range::new(0x200u32, 0x2ffu32), NodeState::Free)) + /// ); + /// assert_eq!( + /// tree.get_superset_mut(&Range::new(0x2ffu32, 0x2ffu32)), + /// Some((&Range::new(0x200u32, 0x2ffu32), NodeState::Free)) + /// ); + /// ``` + pub fn get_superset_mut(&mut self, key: &Range) -> Option<(&Range, NodeState<&mut T>)> { + match self.root { + None => None, + Some(ref mut node) => node + .search_superset_mut(key) + .map(|n| (&n.0.key, n.0.data.as_mut())), + } + } + + /// Get a shared reference to the value associated with the id. + /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, NodeState, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u16, 0x100u16), Some(1)); + /// tree.insert(Range::new(0x200u16, 0x2ffu16), None); + /// assert_eq!(tree.get_by_id(0x100u16), Some(&1)); + /// assert_eq!(tree.get_by_id(0x210u32), None); + /// assert_eq!(tree.get_by_id(0x2ffu64), None); + /// ``` + pub fn get_by_id(&self, id: U) -> Option<&T> + where + u64: From, + { + match self.root { + None => None, + Some(ref node) => { + let key = Range::new_point(id); + match node.search_superset(&key) { + Some(node) => node.0.data.as_ref().into(), + None => None, + } + } + } + } + + /// Get a mutable reference to the value associated with the id. 
+ /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, NodeState, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u16, 0x100u16), Some(1)); + /// tree.insert(Range::new(0x200u16, 0x2ffu16), None); + /// assert_eq!(tree.get_by_id_mut(0x100u16), Some(&mut 1)); + /// assert_eq!(tree.get_by_id_mut(0x210u32), None); + /// assert_eq!(tree.get_by_id_mut(0x2ffu64), None); + /// ``` + pub fn get_by_id_mut(&mut self, id: U) -> Option<&mut T> + where + u64: From, + { + match self.root { + None => None, + Some(ref mut node) => { + let key = Range::new_point(id); + match node.search_superset_mut(&key) { + Some(node) => node.0.data.as_mut().into(), + None => None, + } + } + } + } + + /// Insert the (key, data) pair into the interval tree, panic if intersects with existing nodes. + /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, NodeState, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u32, 0x100u32), Some(1)); + /// tree.insert(Range::new(0x200u32, 0x2ffu32), None); + /// assert_eq!( + /// tree.get(&Range::new(0x100u64, 0x100u64)), + /// Some(NodeState::Valued(&1)) + /// ); + /// assert_eq!( + /// tree.get(&Range::new(0x200u64, 0x2ffu64)), + /// Some(NodeState::Free) + /// ); + /// ``` + pub fn insert(&mut self, key: Range, data: Option) { + match self.root.take() { + None => self.root = Some(Node::new(key, data)), + Some(node) => self.root = Some(node.insert(key, data)), + } + } + + /// Update an existing entry and return the old value. 
+ /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{Constraint, IntervalTree, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u64, 0x100u64), None); + /// tree.insert(Range::new(0x200u64, 0x2ffu64), None); + /// + /// let constraint = Constraint::new(2u32); + /// let key = tree.allocate(&constraint); + /// assert_eq!(key, Some(Range::new(0x200u64, 0x201u64))); + /// let old = tree.update(&Range::new(0x200u64, 0x201u64), 2); + /// assert_eq!(old, None); + /// let old = tree.update(&Range::new(0x200u64, 0x201u64), 3); + /// assert_eq!(old, Some(2)); + /// ``` + pub fn update(&mut self, key: &Range, data: T) -> Option { + match self.root.as_mut() { + None => None, + Some(node) => node.update(key, NodeState::::Valued(data)), + } + } + + /// Remove the `key` from the tree and return the associated data. + /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{IntervalTree, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u64, 0x100u64), Some(1)); + /// tree.insert(Range::new(0x200u64, 0x2ffu64), None); + /// let old = tree.delete(&Range::new(0x100u64, 0x100u64)); + /// assert_eq!(old, Some(1)); + /// let old = tree.delete(&Range::new(0x200u64, 0x2ffu64)); + /// assert_eq!(old, None); + /// ``` + pub fn delete(&mut self, key: &Range) -> Option { + match self.root.take() { + Some(node) => { + let (data, root) = node.delete(key); + self.root = root; + data + } + None => None, + } + } + + /// Allocate a resource range according the allocation constraints. 
+ /// + /// # Examples + /// ```rust + /// extern crate dbs_allocator; + /// use dbs_allocator::{Constraint, IntervalTree, Range}; + /// + /// let mut tree = IntervalTree::::new(); + /// tree.insert(Range::new(0x100u64, 0x100u64), None); + /// tree.insert(Range::new(0x200u64, 0x2ffu64), None); + /// + /// let constraint = Constraint::new(2u8); + /// let key = tree.allocate(&constraint); + /// assert_eq!(key, Some(Range::new(0x200u64, 0x201u64))); + /// tree.update(&Range::new(0x200u64, 0x201u64), 2); + /// ``` + pub fn allocate(&mut self, constraint: &Constraint) -> Option { + if constraint.size == 0 { + return None; + } + let candidate = match self.root.as_mut() { + None => None, + Some(node) => node.find_candidate(constraint), + }; + + match candidate { + None => None, + Some(node) => { + let node_key = node.0.key; + let range = Range::new( + max(node_key.min, constraint.min), + min(node_key.max, constraint.max), + ); + // Safe to unwrap because candidate satisfy the constraints. + let aligned_key = range.align_to(constraint.align).unwrap(); + let result = Range::new(aligned_key.min, aligned_key.min + constraint.size - 1); + + // Allocate a resource from the node, no need to split the candidate node. + if node_key.min == aligned_key.min && node_key.len() == constraint.size { + self.root + .as_mut() + .unwrap() + .update(&node_key, NodeState::::Allocated); + return Some(node_key); + } + + // Split the candidate node. + // TODO: following algorithm is not optimal in preference of simplicity. + self.delete(&node_key); + if aligned_key.min > node_key.min { + self.insert(Range::new(node_key.min, aligned_key.min - 1), None); + } + self.insert(result, None); + if result.max < node_key.max { + self.insert(Range::new(result.max + 1, node_key.max), None); + } + + self.root + .as_mut() + .unwrap() + .update(&result, NodeState::::Allocated); + Some(result) + } + } + } + + /// Free an allocated range and return the associated data. 
+ pub fn free(&mut self, key: &Range) -> Option { + let result = self.delete(key); + let mut range = *key; + + // Try to merge with adjacent free nodes. + if range.min > 0 { + if let Some((r, v)) = self.get_superset(&Range::new(range.min - 1, range.min - 1)) { + if v.is_free() { + range.min = r.min; + } + } + } + if range.max < std::u64::MAX { + if let Some((r, v)) = self.get_superset(&Range::new(range.max + 1, range.max + 1)) { + if v.is_free() { + range.max = r.max; + } + } + } + + if range.min < key.min { + self.delete(&Range::new(range.min, key.min - 1)); + } + if range.max > key.max { + self.delete(&Range::new(key.max + 1, range.max)); + } + self.insert(range, None); + + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[should_panic] + fn test_new_range() { + let _ = Range::new(2u8, 1u8); + } + + #[test] + #[should_panic] + fn test_new_range_overflow() { + let _ = Range::new(0u64, std::u64::MAX); + } + + #[test] + fn test_range_intersect() { + let range_a = Range::new(1u8, 4u8); + let range_b = Range::new(4u16, 6u16); + let range_c = Range::new(2u32, 3u32); + let range_d = Range::new(4u64, 4u64); + let range_e = Range::new(5u32, 6u32); + + assert!(range_a.intersect(&range_b)); + assert!(range_b.intersect(&range_a)); + assert!(range_a.intersect(&range_c)); + assert!(range_c.intersect(&range_a)); + assert!(range_a.intersect(&range_d)); + assert!(range_d.intersect(&range_a)); + assert!(!range_a.intersect(&range_e)); + assert!(!range_e.intersect(&range_a)); + + assert_eq!(range_a.len(), 4); + assert_eq!(range_d.len(), 1); + } + + #[test] + fn test_range_contain() { + let range_a = Range::new(2u8, 6u8); + assert!(range_a.contain(&Range::new(2u8, 3u8))); + assert!(range_a.contain(&Range::new(3u8, 4u8))); + assert!(range_a.contain(&Range::new(5u8, 5u8))); + assert!(range_a.contain(&Range::new(5u8, 6u8))); + assert!(range_a.contain(&Range::new(6u8, 6u8))); + assert!(!range_a.contain(&Range::new(1u8, 1u8))); + 
assert!(!range_a.contain(&Range::new(1u8, 2u8))); + assert!(!range_a.contain(&Range::new(1u8, 3u8))); + assert!(!range_a.contain(&Range::new(1u8, 7u8))); + assert!(!range_a.contain(&Range::new(7u8, 8u8))); + assert!(!range_a.contain(&Range::new(6u8, 7u8))); + assert!(!range_a.contain(&Range::new(7u8, 8u8))); + } + + #[test] + fn test_range_align_to() { + let range_a = Range::new(2u32, 6); + assert_eq!(range_a.align_to(0), Some(Range::new(2u64, 6u64))); + assert_eq!(range_a.align_to(1), Some(Range::new(2u8, 6u8))); + assert_eq!(range_a.align_to(2), Some(Range::new(2u16, 6u16))); + assert_eq!(range_a.align_to(4), Some(Range::new(4u32, 6u32))); + assert_eq!(range_a.align_to(8), None); + assert_eq!(range_a.align_to(3), None); + + let range_b = Range::new(0xFFFF_FFFF_FFFF_FFFDu64, 0xFFFF_FFFF_FFFF_FFFFu64); + assert_eq!( + range_b.align_to(2), + Some(Range::new(0xFFFF_FFFF_FFFF_FFFEu64, 0xFFFF_FFFF_FFFF_FFFF)) + ); + assert_eq!(range_b.align_to(4), None); + } + + #[test] + fn test_range_ord() { + let range_a = Range::new(1u32, 4u32); + let range_b = Range::new(1u32, 4u32); + let range_c = Range::new(1u32, 3u32); + let range_d = Range::new(1u32, 5u32); + let range_e = Range::new(2u32, 2u32); + + assert_eq!(range_a, range_b); + assert_eq!(range_b, range_a); + assert!(range_a > range_c); + assert!(range_c < range_a); + assert!(range_a < range_d); + assert!(range_d > range_a); + assert!(range_a < range_e); + assert!(range_e > range_a); + } + + #[should_panic] + #[test] + fn test_tree_insert_equal() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100u16, 0x200), Some(1)); + tree.insert(Range::new(0x100u32, 0x200), None); + } + + #[should_panic] + #[test] + fn test_tree_insert_intersect_on_right() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100, 0x200u32), Some(1)); + tree.insert(Range::new(0x200, 0x2ffu64), None); + } + + #[should_panic] + #[test] + fn test_tree_insert_intersect_on_left() { + let mut tree = IntervalTree::::new(); 
+ tree.insert(Range::new(0x100, 0x200u32), Some(1)); + tree.insert(Range::new(0x000, 0x100u64), None); + } + + #[test] + fn test_tree_get_superset() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100u32, 0x100u32), Some(1)); + tree.insert(Range::new(0x001u16, 0x008u16), None); + tree.insert(Range::new(0x009u16, 0x00fu16), None); + tree.insert(Range::new(0x200u16, 0x2ffu16), None); + let mut constraint = Constraint::new(8u64); + constraint.min = 0x211; + constraint.max = 0x21f; + constraint.align = 0x8; + tree.allocate(&constraint); + + // Valued case. + assert_eq!( + tree.get_superset(&Range::new(0x100u32, 0x100)), + Some((&Range::new(0x100, 0x100u32), NodeState::Valued(&1))) + ); + + // Free case. + assert_eq!( + tree.get_superset(&Range::new(0x200u16, 0x200)), + Some((&Range::new(0x200, 0x217u64), NodeState::Free)) + ); + assert_eq!( + tree.get_superset(&Range::new(0x2ffu32, 0x2ff)), + Some((&Range::new(0x220, 0x2ffu32), NodeState::Free)) + ); + + // Allocated case. + assert_eq!( + tree.get_superset(&Range::new(0x218u16, 0x21f)), + Some((&Range::new(0x218, 0x21fu16), NodeState::Allocated)) + ); + + // None case. + assert_eq!(tree.get_superset(&Range::new(0x2ffu32, 0x300)), None); + assert_eq!(tree.get_superset(&Range::new(0x300u32, 0x300)), None); + assert_eq!(tree.get_superset(&Range::new(0x1ffu32, 0x300)), None); + } + + #[test] + fn test_tree_get_superset_mut() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100u32, 0x100u32), Some(1)); + tree.insert(Range::new(0x200u16, 0x2ffu16), None); + let mut constraint = Constraint::new(8u64); + constraint.min = 0x211; + constraint.max = 0x21f; + constraint.align = 0x8; + tree.allocate(&constraint); + + // Valued case. + assert_eq!( + tree.get_superset_mut(&Range::new(0x100u32, 0x100u32)), + Some((&Range::new(0x100u32, 0x100u32), NodeState::Valued(&mut 1))) + ); + + // Allocated case. 
+ assert_eq!( + tree.get_superset_mut(&Range::new(0x218u64, 0x21fu64)), + Some((&Range::new(0x218u64, 0x21fu64), NodeState::Allocated)) + ); + + // Free case. + assert_eq!( + tree.get_superset_mut(&Range::new(0x2ffu32, 0x2ffu32)), + Some((&Range::new(0x220u32, 0x2ffu32), NodeState::Free)) + ); + + // None case. + assert_eq!(tree.get_superset(&Range::new(0x2ffu32, 0x300)), None); + assert_eq!(tree.get_superset(&Range::new(0x300u32, 0x300)), None); + assert_eq!(tree.get_superset(&Range::new(0x1ffu32, 0x300)), None); + } + + #[test] + fn test_tree_update() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100u32, 0x100u32), None); + tree.insert(Range::new(0x200u32, 0x2ffu32), None); + + let constraint = Constraint::new(2u32); + let key = tree.allocate(&constraint); + assert_eq!(key, Some(Range::new(0x200u32, 0x201u32))); + let old = tree.update(&Range::new(0x200u32, 0x201u32), 2); + assert_eq!(old, None); + let old = tree.update(&Range::new(0x200u32, 0x201u32), 3); + assert_eq!(old, Some(2)); + let old = tree.update(&Range::new(0x200u32, 0x200u32), 4); + assert_eq!(old, None); + let old = tree.update(&Range::new(0x200u32, 0x203u32), 5); + assert_eq!(old, None); + + tree.delete(&Range::new(0x200u32, 0x201u32)); + let old = tree.update(&Range::new(0x200u32, 0x201u32), 2); + assert_eq!(old, None); + } + + #[test] + fn test_tree_delete() { + let mut tree = IntervalTree::::new(); + assert_eq!(tree.get(&Range::new(0x101u32, 0x101u32)), None); + assert!(tree.is_empty()); + tree.insert(Range::new(0x100u32, 0x100u32), Some(1)); + tree.insert(Range::new(0x001u16, 0x00fu16), None); + tree.insert(Range::new(0x200u32, 0x2ffu32), None); + assert!(!tree.is_empty()); + assert_eq!( + tree.get(&Range::new(0x100u32, 0x100u32)), + Some(NodeState::Valued(&1)) + ); + assert_eq!( + tree.get(&Range::new(0x200u32, 0x2ffu32)), + Some(NodeState::Free) + ); + assert_eq!(tree.get(&Range::new(0x101u32, 0x101u32)), None); + + let old = tree.delete(&Range::new(0x001u16, 
0x00fu16)); + assert_eq!(old, None); + let old = tree.delete(&Range::new(0x100u32, 0x100u32)); + assert_eq!(old, Some(1)); + let old = tree.delete(&Range::new(0x200u32, 0x2ffu32)); + assert_eq!(old, None); + + assert!(tree.is_empty()); + assert_eq!(tree.get(&Range::new(0x100u32, 0x100u32)), None); + assert_eq!(tree.get(&Range::new(0x200u32, 0x2ffu32)), None); + } + + #[test] + fn test_allocate_free() { + let mut tree = IntervalTree::::new(); + let mut constraint = Constraint::new(1u8); + + assert_eq!(tree.allocate(&constraint), None); + tree.insert(Range::new(0x100u16, 0x100u16), None); + tree.insert(Range::new(0x200u16, 0x2ffu16), None); + + let key = tree.allocate(&constraint); + assert_eq!(key, Some(Range::new(0x100u16, 0x100u16))); + let old = tree.update(&Range::new(0x100u16, 0x100u16), 2); + assert_eq!(old, None); + let val = tree.get(&Range::new(0x100u16, 0x100u16)); + assert_eq!(val, Some(NodeState::Valued(&2))); + + constraint.min = 0x100; + constraint.max = 0x100; + assert_eq!(tree.allocate(&constraint), None); + + constraint.min = 0x201; + constraint.max = 0x300; + constraint.align = 0x8; + constraint.size = 0x10; + assert_eq!( + tree.allocate(&constraint), + Some(Range::new(0x208u16, 0x217u16)) + ); + + // Free the node when it's still in 'Allocated' state. + let old = tree.free(&Range::new(0x208u16, 0x217u16)); + assert_eq!(old, None); + + // Reallocate the freed resource. + assert_eq!( + tree.allocate(&constraint), + Some(Range::new(0x208u16, 0x217u16)) + ); + + constraint.size = 0x100; + assert_eq!(tree.allocate(&constraint), None); + + // Verify that allocating a bigger range with smaller allocated range fails. + constraint.min = 0x200; + constraint.max = 0x2ff; + constraint.align = 0x8; + constraint.size = 0x100; + assert_eq!(tree.allocate(&constraint), None); + + // Free the node when it's in 'Valued' state. 
+ tree.update(&Range::new(0x208u16, 0x217u16), 0x10); + assert_eq!(tree.allocate(&constraint), None); + let old = tree.free(&Range::new(0x208u16, 0x217u16)); + assert_eq!(old, Some(0x10)); + + // Reallocate the freed resource, verify that adjacent free nodes have been merged. + assert_eq!( + tree.allocate(&constraint), + Some(Range::new(0x200u32, 0x2ffu32)) + ); + } + + #[test] + fn test_with_size() { + let range_a = Range::with_size(1u8, 3u8); + let range_b = Range::with_size(4u16, 2u16); + let range_c = Range::with_size(2u32, 1u32); + let range_d = Range::with_size(4u64, 0u64); + let range_e = Range::with_size(5u32, 1u32); + + assert_eq!(range_a, Range::new(1u8, 4u8)); + assert_eq!(range_b, Range::new(4u16, 6u16)); + assert_eq!(range_c, Range::new(2u32, 3u32)); + assert_eq!(range_d, Range::new(4u64, 4u64)); + assert_eq!(range_e, Range::new(5u32, 6u32)); + } + + #[test] + fn test_new_point() { + let range_a = Range::new_point(1u8); + let range_b = Range::new_point(2u16); + let range_c = Range::new_point(3u32); + let range_d = Range::new_point(4u64); + let range_e = Range::new_point(5u32); + + assert_eq!(range_a, Range::with_size(1u8, 0u8)); + assert_eq!(range_b, Range::with_size(2u16, 0u16)); + assert_eq!(range_c, Range::with_size(3u32, 0u32)); + assert_eq!(range_d, Range::with_size(4u64, 0u64)); + assert_eq!(range_e, Range::with_size(5u32, 0u32)); + } + + #[test] + fn test_get_by_id() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100u16, 0x100u16), Some(1)); + tree.insert(Range::new(0x001u32, 0x005u32), Some(2)); + tree.insert(Range::new(0x200u16, 0x2ffu16), None); + + assert_eq!(tree.get_by_id(0x100u16), Some(&1)); + assert_eq!(tree.get_by_id(0x002u32), Some(&2)); + assert_eq!(tree.get_by_id(0x210u32), None); + assert_eq!(tree.get_by_id(0x2ffu64), None); + } + + #[test] + fn test_get_by_id_mut() { + let mut tree = IntervalTree::::new(); + tree.insert(Range::new(0x100u16, 0x100u16), Some(1)); + tree.insert(Range::new(0x001u32, 0x005u32), 
Some(2)); + tree.insert(Range::new(0x200u16, 0x2ffu16), None); + + assert_eq!(tree.get_by_id_mut(0x100u16), Some(&mut 1)); + assert_eq!(tree.get_by_id_mut(0x002u32), Some(&mut 2)); + assert_eq!(tree.get_by_id_mut(0x210u32), None); + assert_eq!(tree.get_by_id_mut(0x2ffu64), None); + } +} diff --git a/src/dragonball/src/dbs_allocator/src/lib.rs b/src/dragonball/src/dbs_allocator/src/lib.rs new file mode 100644 index 000000000..c489290d9 --- /dev/null +++ b/src/dragonball/src/dbs_allocator/src/lib.rs @@ -0,0 +1,164 @@ +// Copyright (C) 2019, 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Data structures and algorithms to support resource allocation and management. +//! +//! The `dbs-allocator` crate provides data structures and algorithms to manage and allocate +//! integer identifiable resources. The resource manager in virtual machine monitor (VMM) may +//! manage and allocate resources for virtual machines by using: +//! - [Constraint]: Struct to declare constraints for resource allocation. +//! - [IntervalTree]: An interval tree implementation specialized for VMM resource management. + +#![deny(missing_docs)] + +pub mod interval_tree; +pub use interval_tree::{IntervalTree, NodeState, Range}; + +/// Error codes for resource allocation operations. +#[derive(thiserror::Error, Debug, Eq, PartialEq)] +pub enum Error { + /// Invalid boundary for resource allocation. + #[error("invalid boundary constraint: min ({0}), max ({1})")] + InvalidBoundary(u64, u64), +} + +/// Specialized version of [`std::result::Result`] for resource allocation operations. +pub type Result = std::result::Result; + +/// Resource allocation policies. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum AllocPolicy { + /// Default resource allocation policy. + Default, + /// Return the first available resource matching the allocation constraints. + FirstMatch, +} + +/// Struct to declare resource allocation constraints. 
+#[derive(Copy, Clone, Debug)] +pub struct Constraint { + /// Size of resource to allocate. + pub size: u64, + /// Lower boundary for resource allocation. + pub min: u64, + /// Upper boundary for resource allocation. + pub max: u64, + /// Alignment for allocated resource. + pub align: u64, + /// Policy for resource allocation. + pub policy: AllocPolicy, +} + +impl Constraint { + /// Create a new instance of [`Constraint`] with default settings. + pub fn new(size: T) -> Self + where + u64: From, + { + Constraint { + size: u64::from(size), + min: 0, + max: u64::MAX, + align: 1, + policy: AllocPolicy::Default, + } + } + + /// Set the lower boundary constraint for resource allocation. + pub fn min(mut self, min: T) -> Self + where + u64: From, + { + self.min = u64::from(min); + self + } + + /// Set the upper boundary constraint for resource allocation. + pub fn max(mut self, max: T) -> Self + where + u64: From, + { + self.max = u64::from(max); + self + } + + /// Set the alignment constraint for allocated resource. + pub fn align(mut self, align: T) -> Self + where + u64: From, + { + self.align = u64::from(align); + self + } + + /// Set the resource allocation policy. + pub fn policy(mut self, policy: AllocPolicy) -> Self { + self.policy = policy; + self + } + + /// Validate the resource allocation constraints. 
+ pub fn validate(&self) -> Result<()> { + if self.max < self.min { + return Err(Error::InvalidBoundary(self.min, self.max)); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_set_min() { + let constraint = Constraint::new(2_u64).min(1_u64); + assert_eq!(constraint.min, 1_u64); + } + + #[test] + fn test_set_max() { + let constraint = Constraint::new(2_u64).max(100_u64); + assert_eq!(constraint.max, 100_u64); + } + + #[test] + fn test_set_align() { + let constraint = Constraint::new(2_u64).align(8_u64); + assert_eq!(constraint.align, 8_u64); + } + + #[test] + fn test_set_policy() { + let mut constraint = Constraint::new(2_u64).policy(AllocPolicy::FirstMatch); + assert_eq!(constraint.policy, AllocPolicy::FirstMatch); + constraint = constraint.policy(AllocPolicy::Default); + assert_eq!(constraint.policy, AllocPolicy::Default); + } + + #[test] + fn test_consistently_change_constraint() { + let constraint = Constraint::new(2_u64) + .min(1_u64) + .max(100_u64) + .align(8_u64) + .policy(AllocPolicy::FirstMatch); + assert_eq!(constraint.min, 1_u64); + assert_eq!(constraint.max, 100_u64); + assert_eq!(constraint.align, 8_u64); + assert_eq!(constraint.policy, AllocPolicy::FirstMatch); + } + + #[test] + fn test_set_invalid_boundary() { + // Normal case. + let constraint = Constraint::new(2_u64).max(1000_u64).min(999_u64); + assert!(constraint.validate().is_ok()); + + // Error case. 
+ let constraint = Constraint::new(2_u64).max(999_u64).min(1000_u64); + assert_eq!( + constraint.validate(), + Err(Error::InvalidBoundary(1000u64, 999u64)) + ); + } +} diff --git a/src/dragonball/src/dbs_arch/Cargo.toml b/src/dragonball/src/dbs_arch/Cargo.toml new file mode 100644 index 000000000..b6deb0ba1 --- /dev/null +++ b/src/dragonball/src/dbs_arch/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "dbs-arch" +version = "0.2.3" +authors = ["Alibaba Dragonball Team"] +license = "Apache-2.0 AND BSD-3-Clause" +edition = "2018" +description = "A collection of CPU architecture specific constants and utilities." +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "secure-sandbox", "arch", "ARM64", "x86"] +readme = "README.md" + +[dependencies] +memoffset = "0.6" +kvm-bindings = { version = "0.6.0", features = ["fam-wrappers"] } +kvm-ioctls = "0.12.0" +thiserror = "1" +vm-memory = { version = "0.9" } +vmm-sys-util = "0.11.0" +libc = ">=0.2.39" + +[dev-dependencies] +vm-memory = { version = "0.9", features = ["backend-mmap"] } + +[package.metadata.docs.rs] +all-features = true diff --git a/src/dragonball/src/dbs_arch/LICENSE b/src/dragonball/src/dbs_arch/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_arch/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_arch/README.md b/src/dragonball/src/dbs_arch/README.md new file mode 100644 index 000000000..60ff3a9aa --- /dev/null +++ b/src/dragonball/src/dbs_arch/README.md @@ -0,0 +1,29 @@ +# dbs-arch + +## Design + +The `dbs-arch` crate is a collection of CPU architecture specific constants and utilities to hide CPU architecture details away from the Dragonball Sandbox or other VMMs. 
+Also, we have provided x86_64 CPUID support in this crate; for more details you can look at [this document](docs/x86_64_cpuid.md)
+
+## Supported Architectures
+
+- AMD64 (x86_64)
+- ARM64 (aarch64)
+
+## Submodule List
+
+This repository contains the following submodules:
+| Name | Arch | Description |
+| --- | --- | --- |
+| [x86_64::cpuid](src/x86_64/cpuid/) | x86_64 | Facilities to process CPUID information. |
+| [x86_64::msr](src/x86_64/msr.rs) | x86_64 | Constants and functions for Model Specific Registers |
+| [aarch64::gic](src/aarch64/gic) | aarch64 | Structures to manage GICv2/GICv3/ITS devices for ARM64 |
+| [aarch64::regs](src/aarch64/regs.rs) | aarch64 | Constants and functions to configure and manage CPU registers |
+
+## Acknowledgement
+
+Part of the code is derived from the [Firecracker](https://github.com/firecracker-microvm/firecracker) project.
+
+## License
+
+This project is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0.
diff --git a/src/dragonball/src/dbs_arch/THIRD-PARTY b/src/dragonball/src/dbs_arch/THIRD-PARTY
new file mode 120000
index 000000000..301d0a498
--- /dev/null
+++ b/src/dragonball/src/dbs_arch/THIRD-PARTY
@@ -0,0 +1 @@
+../../THIRD-PARTY
\ No newline at end of file
diff --git a/src/dragonball/src/dbs_arch/docs/x86_64_cpuid.md b/src/dragonball/src/dbs_arch/docs/x86_64_cpuid.md
new file mode 100644
index 000000000..57272ac73
--- /dev/null
+++ b/src/dragonball/src/dbs_arch/docs/x86_64_cpuid.md
@@ -0,0 +1,68 @@
+# CPUID
+
+## Design
+
+CPUID is designed as the CPUID filter for Intel and AMD CPU Identification. Through CPUID configuration, we can set CPU topology, Cache topology, PMU status and other features for the VMs.
+
+CPUID is developed based on the Firecracker CPUID code while we add other extensions such as CPU Topology and VPMU features.
+
+## Usage
+To use CPUID, you should first use the KVM_GET_CPUID2 ioctl to get the original CPUID, then use process_cpuid() provided by dbs-arch to filter the CPUID with the information you want that is suitable for the VM's conditions.
+
+Currently, we support the following specifications that dbs-arch can use to filter CPUID:
+```rust
+pub struct VmSpec {
+    /// The vendor id of the CPU
+    cpu_vendor_id: [u8; 12],
+    /// The id of the current logical cpu in the range [0..cpu_count].
+    cpu_id: u8,
+    /// The total number of logical cpus (includes cpus that could be hotplugged).
+    cpu_count: u8,
+    /// The desired brand string for the guest.
+    brand_string: BrandString,
+    /// threads per core for cpu topology information
+    threads_per_core: u8,
+    /// cores per die for cpu topology information
+    cores_per_die: u8,
+    /// dies per socket for cpu topology information
+    dies_per_socket: u8,
+    /// if vpmu feature is Disabled, it means vpmu feature is off (by default)
+    /// if vpmu feature is LimitedlyEnabled, it means minimal vpmu counters are supported (cycles and instructions)
+    /// if vpmu feature is FullyEnabled, it means all vpmu counters are supported
+    vpmu_feature: VpmuFeatureLevel,
+}
+```
+
+## Example
+We will show examples for filtering CPUID.
+First, you need to use the KVM_GET_CPUID2 ioctl to get the original CPUID; this part is not included in dbs-arch.
+
+```rust
+// an example for getting the cpuid in the vmm.
+let mut cpuid = CpuId::new(num_entries).map_err(|_| errno::Error::new(libc::ENOMEM))?;
+let ret = unsafe {ioctl_with_mut_ptr(self, KVM_GET_CPUID2(), cpuid.as_mut_fam_struct_ptr())};
+if ret != 0 {
+    return Err(errno::Error::last());
+}
+```
+
+Then we can create the `VmSpec` to describe the VM specification we want and use process_cpuid() to filter CPUID.
+ +```rust +let cpuid_vm_spec = VmSpec::new( + self.id, + vcpu_config.max_all_vcpu_count as u8, + vcpu_config.threads_per_core, + vcpu_config.cores_per_die, + vcpu_config.dies_per_socket, + vcpu_config.vpmu_feature, + ) + .map_err(VcpuError::CpuId)?; + process_cpuid(&mut self.cpuid, &cpuid_vm_spec).map_err(|e| { + METRICS.vcpu.process_cpuid.inc(); + error!("Failure in configuring CPUID for vcpu {}: {:?}", self.id, e); + VcpuError::CpuId(e) + })?; +``` + +After the CPUID is filtered, we could use it to set the guest's CPUID. diff --git a/src/dragonball/src/dbs_arch/src/aarch64/gic/gicv2.rs b/src/dragonball/src/dbs_arch/src/aarch64/gic/gicv2.rs new file mode 100644 index 000000000..5984570a9 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/gic/gicv2.rs @@ -0,0 +1,110 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_ioctls::DeviceFd; + +use super::{GICDevice, Result}; + +/// Represent a GIC v2 device +pub struct GICv2 { + /// The file descriptor for the KVM device + fd: DeviceFd, + + /// GIC device properties, to be used for setting up the fdt entry + properties: [u64; 4], + + /// Number of CPUs handled by the device + vcpu_count: u64, +} + +impl GICv2 { + // Unfortunately bindgen omits defines that are based on other defines. + // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. + const KVM_VGIC_V2_DIST_SIZE: u64 = 0x1000; + const KVM_VGIC_V2_CPU_SIZE: u64 = 0x2000; + + // Device trees specific constants + const ARCH_GIC_V2_MAINT_IRQ: u32 = 8; + + /// Get the address of the GICv2 distributor. + const fn get_dist_addr() -> u64 { + crate::aarch64::gic::GIC_REG_END_ADDRESS - GICv2::KVM_VGIC_V2_DIST_SIZE + } + + /// Get the size of the GIC_v2 distributor. + const fn get_dist_size() -> u64 { + GICv2::KVM_VGIC_V2_DIST_SIZE + } + + /// Get the address of the GIC_v2 CPU. 
+ const fn get_cpu_addr() -> u64 { + GICv2::get_dist_addr() - GICv2::KVM_VGIC_V2_CPU_SIZE + } + + /// Get the size of the GIC_v2 CPU. + const fn get_cpu_size() -> u64 { + GICv2::KVM_VGIC_V2_CPU_SIZE + } +} + +impl GICDevice for GICv2 { + fn device_fd(&self) -> &DeviceFd { + &self.fd + } + + fn device_properties(&self) -> &[u64] { + &self.properties + } + + fn vcpu_count(&self) -> u64 { + self.vcpu_count + } + + fn fdt_compatibility(&self) -> &str { + "arm,gic-400" + } + + fn fdt_maint_irq(&self) -> u32 { + GICv2::ARCH_GIC_V2_MAINT_IRQ + } + + fn version() -> u32 { + kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V2 + } + + fn create_device(fd: DeviceFd, vcpu_count: u64) -> Box { + Box::new(GICv2 { + fd, + properties: [ + GICv2::get_dist_addr(), + GICv2::get_dist_size(), + GICv2::get_cpu_addr(), + GICv2::get_cpu_size(), + ], + vcpu_count, + }) + } + + fn init_device_attributes(gic_device: &dyn GICDevice) -> Result<()> { + /* Setting up the distributor attribute. + We are placing the GIC below 1GB so we need to substract the size of the distributor. */ + Self::set_device_attribute( + gic_device.device_fd(), + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_V2_ADDR_TYPE_DIST), + &GICv2::get_dist_addr() as *const u64 as u64, + 0, + )?; + + /* Setting up the CPU attribute. */ + Self::set_device_attribute( + gic_device.device_fd(), + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_V2_ADDR_TYPE_CPU), + &GICv2::get_cpu_addr() as *const u64 as u64, + 0, + )?; + + Ok(()) + } +} diff --git a/src/dragonball/src/dbs_arch/src/aarch64/gic/gicv3.rs b/src/dragonball/src/dbs_arch/src/aarch64/gic/gicv3.rs new file mode 100644 index 000000000..87a9081d5 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/gic/gicv3.rs @@ -0,0 +1,136 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::boxed::Box; +use std::collections::HashMap; + +use kvm_ioctls::{DeviceFd, VmFd}; + +use super::its::ItsType::{PciMsiIts, PlatformMsiIts}; +use super::its::{ItsType, ITS}; +use super::{GICDevice, Result}; + +/// GICv3 instance +pub struct GICv3 { + /// The file descriptor for the KVM device + fd: DeviceFd, + + /// GIC device properties, to be used for setting up the fdt entry + properties: [u64; 4], + + /// Number of CPUs handled by the device + vcpu_count: u64, + + /// ITS instance of this gic control + its: HashMap, +} + +impl GICv3 { + // Unfortunately bindgen omits defines that are based on other defines. + // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. + const SZ_64K: u64 = 0x0001_0000; + const KVM_VGIC_V3_DIST_SIZE: u64 = GICv3::SZ_64K; + const KVM_VGIC_V3_REDIST_SIZE: u64 = (2 * GICv3::SZ_64K); + + // Device trees specific constants + const ARCH_GIC_V3_MAINT_IRQ: u32 = 9; + + /// Get the address of the GIC distributor. + fn get_dist_addr() -> u64 { + crate::aarch64::gic::GIC_REG_END_ADDRESS - GICv3::KVM_VGIC_V3_DIST_SIZE + } + + /// Get the size of the GIC distributor. + fn get_dist_size() -> u64 { + GICv3::KVM_VGIC_V3_DIST_SIZE + } + + /// Get the address of the GIC redistributors. + pub fn get_redists_addr(vcpu_count: u64) -> u64 { + GICv3::get_dist_addr() - GICv3::get_redists_size(vcpu_count) + } + + /// Get the size of the GIC redistributors. 
+ fn get_redists_size(vcpu_count: u64) -> u64 { + vcpu_count * GICv3::KVM_VGIC_V3_REDIST_SIZE + } +} + +impl GICDevice for GICv3 { + fn device_fd(&self) -> &DeviceFd { + &self.fd + } + + fn device_properties(&self) -> &[u64] { + &self.properties + } + + fn vcpu_count(&self) -> u64 { + self.vcpu_count + } + + fn fdt_compatibility(&self) -> &str { + "arm,gic-v3" + } + + fn fdt_maint_irq(&self) -> u32 { + GICv3::ARCH_GIC_V3_MAINT_IRQ + } + + fn get_its_reg_range(&self, its_type: &ItsType) -> Option<[u64; 2]> { + self.its.get(its_type).map(|its| its.get_reg_range()) + } + + fn attach_its(&mut self, vm: &VmFd) -> Result<()> { + let its = ITS::new(vm, self, PlatformMsiIts)?; + self.its.insert(PlatformMsiIts, its); + let its = ITS::new(vm, self, PciMsiIts)?; + self.its.insert(PciMsiIts, its); + Ok(()) + } + + fn version() -> u32 { + kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3 + } + + fn create_device(fd: DeviceFd, vcpu_count: u64) -> Box { + Box::new(GICv3 { + fd, + properties: [ + GICv3::get_dist_addr(), + GICv3::get_dist_size(), + GICv3::get_redists_addr(vcpu_count), + GICv3::get_redists_size(vcpu_count), + ], + vcpu_count, + its: HashMap::new(), + }) + } + + fn init_device_attributes(gic_device: &dyn GICDevice) -> Result<()> { + /* Setting up the distributor attribute. + We are placing the GIC below 1GB so we need to substract the size of the distributor. + */ + Self::set_device_attribute( + gic_device.device_fd(), + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + kvm_bindings::KVM_VGIC_V3_ADDR_TYPE_DIST.into(), + &GICv3::get_dist_addr() as *const u64 as u64, + 0, + )?; + + /* Setting up the redistributors' attribute. + We are calculating here the start of the redistributors address. We have one per CPU. 
+ */ + Self::set_device_attribute( + gic_device.device_fd(), + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + kvm_bindings::KVM_VGIC_V3_ADDR_TYPE_REDIST.into(), + &GICv3::get_redists_addr(gic_device.vcpu_count()) as *const u64 as u64, + 0, + )?; + + Ok(()) + } +} diff --git a/src/dragonball/src/dbs_arch/src/aarch64/gic/its.rs b/src/dragonball/src/dbs_arch/src/aarch64/gic/its.rs new file mode 100644 index 000000000..0f2384b68 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/gic/its.rs @@ -0,0 +1,81 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_ioctls::{DeviceFd, VmFd}; + +use super::gicv3::GICv3; +use super::{Error, GICDevice, Result}; + +// ITS register range +const REG_RANGE_LEN: u64 = 0x20000; + +/// ITS type +#[derive(Hash, PartialEq, Eq)] +pub enum ItsType { + /// platform msi its + PlatformMsiIts, + /// pci msi its + PciMsiIts, +} + +/// Only GIC-V3 can use ITS +pub struct ITS { + /// The file descriptor for the KVM device + fd: DeviceFd, + reg_range: [u64; 2], +} + +impl ITS { + /// Create an ITS device + pub fn new(vm: &VmFd, gic_ctl: &GICv3, its_type: ItsType) -> Result { + let fd = ITS::create_device_fd(vm)?; + // Define the mmio space of platform msi its after the mmio space of pci msi its + let offset = match its_type { + ItsType::PlatformMsiIts => REG_RANGE_LEN, + ItsType::PciMsiIts => REG_RANGE_LEN * 2, + }; + let vcpu_count = gic_ctl.vcpu_count(); + // No document has been found to accurately describe the storage location and + // length of the ITS register. Currently, we store the ITS register in front of + // the redistributor register. And temporarily refer to the "arm, gic-v3-its" + // kernel document to set the ITS register length to 0x20000.In addition, + // reg_range is a two-tuple, representing the register base address and the + // length of the register address space. 
+ let reg_range: [u64; 2] = [GICv3::get_redists_addr(vcpu_count) - offset, REG_RANGE_LEN]; + let its = ITS { fd, reg_range }; + let reg_base_addr = its.get_reg_range_base_addr(); + its.set_attribute(reg_base_addr)?; + Ok(its) + } + + fn create_device_fd(vm: &VmFd) -> Result { + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, + fd: 0, + flags: 0, + }; + vm.create_device(&mut its_device).map_err(Error::CreateITS) + } + + fn set_attribute(&self, reg_base_addr: u64) -> Result<()> { + let attribute = kvm_bindings::kvm_device_attr { + group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + attr: u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + addr: ®_base_addr as *const u64 as u64, + flags: 0, + }; + self.fd + .set_device_attr(&attribute) + .map_err(Error::SetITSAttribute)?; + Ok(()) + } + + fn get_reg_range_base_addr(&self) -> u64 { + self.reg_range[0] + } + + /// Get its reg range + pub fn get_reg_range(&self) -> [u64; 2] { + self.reg_range + } +} diff --git a/src/dragonball/src/dbs_arch/src/aarch64/gic/mod.rs b/src/dragonball/src/dbs_arch/src/aarch64/gic/mod.rs new file mode 100644 index 000000000..80099aaf7 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/gic/mod.rs @@ -0,0 +1,218 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// Export gicv2 interface +pub mod gicv2; +/// Export gicv3 interface +pub mod gicv3; +/// Export ITS interface +pub mod its; + +use std::{boxed::Box, result}; + +use kvm_ioctls::{DeviceFd, VmFd}; + +use gicv2::GICv2; +use gicv3::GICv3; + +// As per virt/kvm/arm/vgic/vgic-kvm-device.c we need +// the number of interrupts our GIC will support to be: +// * bigger than 32 +// * less than 1023 and +// * a multiple of 32. +// We are setting up our interrupt controller to support a maximum of 128 interrupts. 
+ +/// First usable interrupt on aarch64. +pub const IRQ_BASE: u32 = 32; + +/// Last usable interrupt on aarch64. +pub const IRQ_MAX: u32 = 159; + +/// Define the gic register end address. +pub const GIC_REG_END_ADDRESS: u64 = 1 << 30; // 1GB + +/// Errors thrown while setting up the GIC. +#[derive(Debug)] +pub enum Error { + /// Error while calling KVM ioctl for setting up the global interrupt controller. + CreateGIC(kvm_ioctls::Error), + /// Error while setting device attributes for the GIC. + SetDeviceAttribute(kvm_ioctls::Error), + /// The number of vCPUs in the GicState doesn't match the number of vCPUs on the system + InconsistentVcpuCount, + /// The VgicSysRegsState is invalid + InvalidVgicSysRegState, + /// ERROR while create ITS fail + CreateITS(kvm_ioctls::Error), + /// ERROR while set ITS attr fail + SetITSAttribute(kvm_ioctls::Error), +} +type Result = result::Result; + +/// Function that flushes `RDIST` pending tables into guest RAM. +/// +/// The tables get flushed to guest RAM whenever the VM gets stopped. +pub fn save_pending_tables(fd: &DeviceFd) -> Result<()> { + let init_gic_attr = kvm_bindings::kvm_device_attr { + group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), + addr: 0, + flags: 0, + }; + fd.set_device_attr(&init_gic_attr) + .map_err(Error::SetDeviceAttribute) +} + +/// Trait for GIC devices. 
+pub trait GICDevice: Send { + /// Returns the file descriptor of the GIC device + fn device_fd(&self) -> &DeviceFd; + + /// Returns an array with GIC device properties + fn device_properties(&self) -> &[u64]; + + /// Returns the number of vCPUs this GIC handles + fn vcpu_count(&self) -> u64; + + /// Returns the fdt compatibility property of the device + fn fdt_compatibility(&self) -> &str; + + /// Returns the maint_irq fdt property of the device + fn fdt_maint_irq(&self) -> u32; + + /// Get ITS reg range + fn get_its_reg_range(&self, _its_type: &its::ItsType) -> Option<[u64; 2]> { + None + } + + /// Only gic-v3 has its + fn attach_its(&mut self, _vm: &VmFd) -> Result<()> { + Ok(()) + } + + /// Returns the GIC version of the device + fn version() -> u32 + where + Self: Sized; + + /// Create the GIC device object + fn create_device(fd: DeviceFd, vcpu_count: u64) -> Box + where + Self: Sized; + + /// Setup the device-specific attributes + fn init_device_attributes(gic_device: &dyn GICDevice) -> Result<()> + where + Self: Sized; + + /// Initialize a GIC device + fn init_device(vm: &VmFd) -> Result + where + Self: Sized, + { + let mut gic_device = kvm_bindings::kvm_create_device { + type_: Self::version(), + fd: 0, + flags: 0, + }; + + vm.create_device(&mut gic_device).map_err(Error::CreateGIC) + } + + /// Set a GIC device attribute + fn set_device_attribute( + fd: &DeviceFd, + group: u32, + attr: u64, + addr: u64, + flags: u32, + ) -> Result<()> + where + Self: Sized, + { + let attr = kvm_bindings::kvm_device_attr { + group, + attr, + addr, + flags, + }; + fd.set_device_attr(&attr) + .map_err(Error::SetDeviceAttribute)?; + + Ok(()) + } + + /// Finalize the setup of a GIC device + fn finalize_device(gic_device: &dyn GICDevice) -> Result<()> + where + Self: Sized, + { + /* We need to tell the kernel how many irqs to support with this vgic. + * See the `layout` module for details. 
+ */ + let nr_irqs: u32 = IRQ_MAX - IRQ_BASE + 1; + let nr_irqs_ptr = &nr_irqs as *const u32; + Self::set_device_attribute( + gic_device.device_fd(), + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, + 0, + nr_irqs_ptr as u64, + 0, + )?; + + /* Finalize the GIC. + * See https://code.woboq.org/linux/linux/virt/kvm/arm/vgic/vgic-kvm-device.c.html#211. + */ + Self::set_device_attribute( + gic_device.device_fd(), + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + 0, + )?; + + Ok(()) + } + + #[allow(clippy::new_ret_no_self)] + /// Method to initialize the GIC device + fn new(vm: &VmFd, vcpu_count: u64) -> Result> + where + Self: Sized, + { + let vgic_fd = Self::init_device(vm)?; + + let mut device = Self::create_device(vgic_fd, vcpu_count); + + device.attach_its(vm)?; + + Self::init_device_attributes(device.as_ref())?; + + Self::finalize_device(device.as_ref())?; + + Ok(device) + } +} + +/// Create a GIC device. +/// +/// It will try to create by default a GICv3 device. If that fails it will try +/// to fall-back to a GICv2 device. +pub fn create_gic(vm: &VmFd, vcpu_count: u64) -> Result> { + GICv3::new(vm, vcpu_count).or_else(|_| GICv2::new(vm, vcpu_count)) +} + +#[cfg(test)] +mod tests { + + use super::*; + use kvm_ioctls::Kvm; + + #[test] + fn test_create_gic() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + assert!(create_gic(&vm, 1).is_ok()); + } +} diff --git a/src/dragonball/src/dbs_arch/src/aarch64/mod.rs b/src/dragonball/src/dbs_arch/src/aarch64/mod.rs new file mode 100644 index 000000000..89892e45d --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/mod.rs @@ -0,0 +1,139 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! CPU architecture specific constants, structures and utilities for the `aarch64` architecture. 
+ +/// Module for the global interrupt controller configuration. +pub mod gic; +/// Module for PMU virtualization. +pub mod pmu; +/// Logic for configuring aarch64 registers. +pub mod regs; + +use std::{fmt, result}; + +const MMIO_DEVICE_LEGACY_IRQ_NUMBER: usize = 1; + +/// Error for ARM64 architecture information +#[derive(Debug)] +pub enum Error { + /// MMIO device information error + MMIODeviceInfoError, + /// Invalid arguments + InvalidArguments, +} + +type Result = result::Result; + +/// Types of devices that can get attached to this platform. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Copy)] +pub enum DeviceType { + /// Device Type: Virtio. + Virtio(u32), + /// Device Type: Serial. + #[cfg(target_arch = "aarch64")] + Serial, + /// Device Type: RTC. + #[cfg(target_arch = "aarch64")] + RTC, +} + +impl fmt::Display for DeviceType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{self:?}") + } +} + +/// Trait for devices to be added to the Flattened Device Tree. +pub trait DeviceInfoForFDT { + /// Returns the address where this device will be loaded. + fn addr(&self) -> u64; + /// Returns the amount of memory that needs to be reserved for this device. + fn length(&self) -> u64; + /// Returns the associated interrupt for this device. + fn irq(&self) -> Result; + /// Get device id + fn get_device_id(&self) -> Option; +} + +/// MMIO device info used for FDT generating. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MMIODeviceInfo { + /// MMIO address base + pub base: u64, + /// MMIO address size + pub size: u64, + /// Device irq + pub irqs: Vec, + /// Only virtio devices that support platform msi have device id + pub device_id: Option, +} + +impl MMIODeviceInfo { + /// Create mmio device info. 
+ pub fn new(base: u64, size: u64, irqs: Vec, device_id: Option) -> Self { + MMIODeviceInfo { + base, + size, + irqs, + device_id, + } + } +} + +impl DeviceInfoForFDT for MMIODeviceInfo { + fn addr(&self) -> u64 { + self.base + } + + fn length(&self) -> u64 { + self.size + } + + fn irq(&self) -> Result { + // Currently mmio devices have only one legacy irq. + if self.irqs.len() != MMIO_DEVICE_LEGACY_IRQ_NUMBER { + return Err(Error::MMIODeviceInfoError); + } + let irq = self.irqs[0]; + if !(gic::IRQ_BASE..=gic::IRQ_MAX).contains(&irq) { + return Err(Error::MMIODeviceInfoError); + } + + Ok(irq) + } + + fn get_device_id(&self) -> Option { + self.device_id + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_mmo_device_info() { + let info = MMIODeviceInfo::new(0x1000, 0x2000, vec![gic::IRQ_BASE], Some(5)); + assert_eq!(info.addr(), 0x1000); + assert_eq!(info.length(), 0x2000); + assert_eq!(info.irq().unwrap(), gic::IRQ_BASE); + assert_eq!(info.get_device_id(), Some(5)); + + let info = MMIODeviceInfo::new(0x1000, 0x2000, vec![gic::IRQ_BASE], None); + assert_eq!(info.get_device_id(), None); + } + + #[test] + fn test_mmo_device_info_get_irq() { + let info = MMIODeviceInfo::new(0x1000, 0x1000, vec![], None); + assert!(info.irq().is_err()); + let info = MMIODeviceInfo::new(0x1000, 0x1000, vec![1, 2], None); + assert!(info.irq().is_err()); + let info = MMIODeviceInfo::new(0x1000, 0x1000, vec![gic::IRQ_BASE - 1], None); + assert!(info.irq().is_err()); + let info = MMIODeviceInfo::new(0x1000, 0x1000, vec![gic::IRQ_MAX + 1], None); + assert!(info.irq().is_err()); + } +} diff --git a/src/dragonball/src/dbs_arch/src/aarch64/pmu.rs b/src/dragonball/src/dbs_arch/src/aarch64/pmu.rs new file mode 100644 index 000000000..8d939a576 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/pmu.rs @@ -0,0 +1,172 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! 
Constants and utilities for aarch64 PMU virtualization. + +use kvm_bindings::{ + kvm_device_attr, KVM_ARM_VCPU_PMU_V3_CTRL, KVM_ARM_VCPU_PMU_V3_INIT, KVM_ARM_VCPU_PMU_V3_IRQ, +}; +use kvm_ioctls::{Error as KvmError, VcpuFd, VmFd}; +use thiserror::Error; + +/// PPI base number on aarch64. +pub const PPI_BASE: u32 = 16; +/// Pmu ppi number +pub const VIRTUAL_PMU_IRQ: u32 = 7; + +/// Errors thrown while setting up the PMU. +#[derive(Error, Debug)] +pub enum PmuError { + /// Error while check kvm pmu capability + #[error("Check kvm pmu capability failed: {0}")] + CheckKvmPmuCap(#[source] KvmError), + /// Error while check pmu irq. + #[error("Check pmu irq error: {0}")] + HasPmuIrq(#[source] KvmError), + /// Error while check pmu init. + #[error("Check pmu init error: {0}")] + HasPmuInit(#[source] KvmError), + /// Error while set pmu irq. + #[error("Set pmu irq error: {0}")] + SetPmuIrq(#[source] KvmError), + /// Error while set pmu init. + #[error("Set pmu init error: {0}")] + SetPmuInit(#[source] KvmError), +} + +type Result = std::result::Result; + +/// Tests whether a cpu supports KVM_ARM_VCPU_PMU_V3_IRQ attribute. +/// +/// # Arguments +/// * `vcpu` - The VCPU file descriptor +fn has_pmu_irq(vcpu: &VcpuFd) -> Result<()> { + let irq = (VIRTUAL_PMU_IRQ + PPI_BASE) as u64; + let attribute = kvm_device_attr { + group: KVM_ARM_VCPU_PMU_V3_CTRL, + attr: u64::from(KVM_ARM_VCPU_PMU_V3_IRQ), + addr: &irq as *const u64 as u64, + flags: 0, + }; + vcpu.has_device_attr(&attribute) + .map_err(PmuError::HasPmuIrq) +} + +/// Tests whether a cpu supports KVM_ARM_VCPU_PMU_V3_INIT attribute. +/// +/// # Arguments +/// * `vcpu` - The VCPU file descriptor +fn has_pmu_init(vcpu: &VcpuFd) -> Result<()> { + let attribute = kvm_device_attr { + group: KVM_ARM_VCPU_PMU_V3_CTRL, + attr: u64::from(KVM_ARM_VCPU_PMU_V3_INIT), + addr: 0, + flags: 0, + }; + vcpu.has_device_attr(&attribute) + .map_err(PmuError::HasPmuInit) +} + +/// Set KVM_ARM_VCPU_PMU_V3_IRQ for a specific vcpu. 
+/// +/// # Arguments +/// * `vcpu` - The VCPU file descriptor +fn set_pmu_irq(vcpu: &VcpuFd) -> Result<()> { + let irq = (VIRTUAL_PMU_IRQ + PPI_BASE) as u64; + let attribute = kvm_device_attr { + group: KVM_ARM_VCPU_PMU_V3_CTRL, + attr: u64::from(KVM_ARM_VCPU_PMU_V3_IRQ), + addr: &irq as *const u64 as u64, + flags: 0, + }; + vcpu.set_device_attr(&attribute) + .map_err(PmuError::SetPmuIrq) +} + +/// Set KVM_ARM_VCPU_PMU_V3_INIT for a specific vcpu. +/// +/// # Arguments +/// * `vcpu` - The VCPU file descriptor +fn set_pmu_init(vcpu: &VcpuFd) -> Result<()> { + let attribute = kvm_device_attr { + group: KVM_ARM_VCPU_PMU_V3_CTRL, + attr: u64::from(KVM_ARM_VCPU_PMU_V3_INIT), + addr: 0, + flags: 0, + }; + vcpu.set_device_attr(&attribute) + .map_err(PmuError::SetPmuInit) +} + +/// Check kvm pmu capability +/// +/// # Arguments +/// * `vm` - The VM file descriptor +fn check_kvm_pmu_cap(_vm: &VmFd) -> Result<()> { + // TODO: check KVM_CAP_ARM_PMU_V3 capability before setting PMU + // Cap for KVM_CAP_ARM_PMU_V3 isn't supported in kvm-ioctls upstream, so + // leave a todo here for supporting this check in the future. 
+ // Interface: vm.check_extension(kvm_ioctls::Cap) + + Ok(()) +} + +/// Check pmu feature +/// +/// # Arguments +/// * `vcpu` - The VCPU file descriptor +fn check_pmu_feature(vcpu: &VcpuFd) -> Result<()> { + has_pmu_irq(vcpu)?; + has_pmu_init(vcpu) +} + +/// Set pmu feature +/// +/// # Arguments +/// * `vcpu` - The VCPU file descriptor +fn set_pmu_feature(vcpu: &VcpuFd) -> Result<()> { + set_pmu_irq(vcpu)?; + set_pmu_init(vcpu) +} + +/// Initialize PMU in for vcpu +/// +/// # Arguments +/// * `vm` - The VM file descriptor +/// * `vcpu` - The VCPU file descriptor +pub fn initialize_pmu(vm: &VmFd, vcpu: &VcpuFd) -> Result<()> { + check_kvm_pmu_cap(vm)?; + check_pmu_feature(vcpu)?; + set_pmu_feature(vcpu) +} + +#[cfg(test)] +mod tests { + use kvm_bindings::{kvm_vcpu_init, KVM_ARM_VCPU_PMU_V3, KVM_ARM_VCPU_PSCI_0_2}; + use kvm_ioctls::Kvm; + + use super::*; + use crate::gic::create_gic; + + #[test] + fn test_create_pmu() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + + assert!(create_gic(&vm, 1).is_ok()); + assert!(initialize_pmu(&vm, &vcpu).is_err()); + + if check_kvm_pmu_cap(&vm).is_err() { + return; + } + + let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); + vm.get_preferred_target(&mut kvi) + .expect("Cannot get preferred target"); + kvi.features[0] = 1 << KVM_ARM_VCPU_PSCI_0_2 | 1 << KVM_ARM_VCPU_PMU_V3; + + assert!(vcpu.vcpu_init(&kvi).is_ok()); + assert!(initialize_pmu(&vm, &vcpu).is_ok()); + } +} diff --git a/src/dragonball/src/dbs_arch/src/aarch64/regs.rs b/src/dragonball/src/dbs_arch/src/aarch64/regs.rs new file mode 100644 index 000000000..ff57edd1a --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/aarch64/regs.rs @@ -0,0 +1,200 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Constants and utilities for aarch64 CPU generic, system and model specific registers. + +use std::{mem, result}; + +use kvm_bindings::*; +use kvm_ioctls::VcpuFd; +use memoffset::offset_of; +use vmm_sys_util; + +/// Errors thrown while setting aarch64 registers. +#[derive(Debug)] +pub enum Error { + /// Failed to get core register (PC, PSTATE or general purpose ones). + GetCoreRegister(kvm_ioctls::Error), + /// Failed to set core register (PC, PSTATE or general purpose ones). + SetCoreRegister(kvm_ioctls::Error), + /// Failed to get a system register. + GetSysRegister(kvm_ioctls::Error), + /// Failed to get the register list. + GetRegList(kvm_ioctls::Error), + /// Failed to get a system register. + SetRegister(kvm_ioctls::Error), + /// Failed to init fam reglist + FamRegister(vmm_sys_util::fam::Error), +} +type Result = result::Result; + +#[allow(non_upper_case_globals)] +// PSR (Processor State Register) bits. +// Taken from arch/arm64/include/uapi/asm/ptrace.h. +const PSR_MODE_EL1h: u64 = 0x0000_0005; +const PSR_F_BIT: u64 = 0x0000_0040; +const PSR_I_BIT: u64 = 0x0000_0080; +const PSR_A_BIT: u64 = 0x0000_0100; +const PSR_D_BIT: u64 = 0x0000_0200; +// Taken from arch/arm64/kvm/inject_fault.c. +const PSTATE_FAULT_BITS_64: u64 = PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT; + +// Following are macros that help with getting the ID of a aarch64 core register. +// The core register are represented by the user_pt_regs structure. Look for it in +// arch/arm64/include/uapi/asm/ptrace.h. + +macro_rules! 
arm64_core_reg { + ($reg: tt) => { + // As per `kvm_arm_copy_reg_indices`, the id of a core register can be obtained like this: + // `const u64 core_reg = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | i`, where i is obtained with: + // `for (i = 0; i < sizeof(struct kvm_regs) / sizeof(__u32); i++) {` + // We are using here `user_pt_regs` since this structure contains the core register and it is at + // the start of `kvm_regs`. + // struct kvm_regs { + // struct user_pt_regs regs; /* sp = sp_el0 */ + // + // __u64 sp_el1; + // __u64 elr_el1; + // + // __u64 spsr[KVM_NR_SPSR]; + // + // struct user_fpsimd_state fp_regs; + //}; + // struct user_pt_regs { + // __u64 regs[31]; + // __u64 sp; + // __u64 pc; + // __u64 pstate; + //}; + // In our implementation we need: pc, pstate and user_pt_regs->regs[0]. + KVM_REG_ARM64 as u64 + | KVM_REG_SIZE_U64 as u64 + | u64::from(KVM_REG_ARM_CORE) + | ((offset_of!(user_pt_regs, $reg) / mem::size_of::()) as u64) + }; +} + +// This macro computes the ID of a specific ARM64 system register similar to how +// the kernel C macro does. +// https://elixir.bootlin.com/linux/v4.20.17/source/arch/arm64/include/uapi/asm/kvm.h#L203 +macro_rules! 
arm64_sys_reg { + ($name: tt, $op0: tt, $op1: tt, $crn: tt, $crm: tt, $op2: tt) => { + const $name: u64 = KVM_REG_ARM64 as u64 + | KVM_REG_SIZE_U64 as u64 + | KVM_REG_ARM64_SYSREG as u64 + | ((($op0 as u64) << KVM_REG_ARM64_SYSREG_OP0_SHIFT) + & KVM_REG_ARM64_SYSREG_OP0_MASK as u64) + | ((($op1 as u64) << KVM_REG_ARM64_SYSREG_OP1_SHIFT) + & KVM_REG_ARM64_SYSREG_OP1_MASK as u64) + | ((($crn as u64) << KVM_REG_ARM64_SYSREG_CRN_SHIFT) + & KVM_REG_ARM64_SYSREG_CRN_MASK as u64) + | ((($crm as u64) << KVM_REG_ARM64_SYSREG_CRM_SHIFT) + & KVM_REG_ARM64_SYSREG_CRM_MASK as u64) + | ((($op2 as u64) << KVM_REG_ARM64_SYSREG_OP2_SHIFT) + & KVM_REG_ARM64_SYSREG_OP2_MASK as u64); + }; +} + +// Constant imported from the Linux kernel: +// https://elixir.bootlin.com/linux/v4.20.17/source/arch/arm64/include/asm/sysreg.h#L135 +arm64_sys_reg!(MPIDR_EL1, 3, 0, 0, 0, 5); + +/// Configure core registers for a given CPU. +/// +/// # Arguments +/// +/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +/// * `cpu_id` - Index of current vcpu. +/// * `boot_ip` - Starting instruction pointer. +/// * `mem` - Reserved DRAM for current VM. +pub fn setup_regs(vcpu: &VcpuFd, cpu_id: u8, boot_ip: u64, fdt_address: u64) -> Result<()> { + // Get the register index of the PSTATE (Processor State) register. + vcpu.set_one_reg(arm64_core_reg!(pstate), PSTATE_FAULT_BITS_64 as u128) + .map_err(Error::SetCoreRegister)?; + + // Other vCPUs are powered off initially awaiting PSCI wakeup. + if cpu_id == 0 { + // Setting the PC (Processor Counter) to the current program address (kernel address). + vcpu.set_one_reg(arm64_core_reg!(pc), boot_ip as u128) + .map_err(Error::SetCoreRegister)?; + + // Last mandatory thing to set -> the address pointing to the FDT (also called DTB). + // "The device tree blob (dtb) must be placed on an 8-byte boundary and must + // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt. + // We are choosing to place it the end of DRAM. 
See `get_fdt_addr`. + vcpu.set_one_reg(arm64_core_reg!(regs), fdt_address as u128) + .map_err(Error::SetCoreRegister)?; + } + Ok(()) +} + +/// Specifies whether a particular register is a system register or not. +/// The kernel splits the registers on aarch64 in core registers and system registers. +/// So, below we get the system registers by checking that they are not core registers. +/// +/// # Arguments +/// +/// * `regid` - The index of the register we are checking. +pub fn is_system_register(regid: u64) -> bool { + if (regid & KVM_REG_ARM_COPROC_MASK as u64) == KVM_REG_ARM_CORE as u64 { + return false; + } + + let size = regid & KVM_REG_SIZE_MASK; + if size != KVM_REG_SIZE_U32 && size != KVM_REG_SIZE_U64 { + panic!("Unexpected register size for system register {}", size); + } + true +} + +/// Read the MPIDR - Multiprocessor Affinity Register. +/// +/// # Arguments +/// +/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +pub fn read_mpidr(vcpu: &VcpuFd) -> Result { + vcpu.get_one_reg(MPIDR_EL1) + .map(|value| value as u64) + .map_err(Error::GetSysRegister) +} + +#[cfg(test)] +mod tests { + use super::*; + use kvm_ioctls::Kvm; + + #[test] + fn test_setup_regs() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + match setup_regs(&vcpu, 0, 0x0, crate::gic::GIC_REG_END_ADDRESS).unwrap_err() { + Error::SetCoreRegister(ref e) => assert_eq!(e.errno(), libc::ENOEXEC), + _ => panic!("Expected to receive Error::SetCoreRegister"), + } + let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); + vm.get_preferred_target(&mut kvi).unwrap(); + vcpu.vcpu_init(&kvi).unwrap(); + + assert!(setup_regs(&vcpu, 0, 0x0, crate::gic::GIC_REG_END_ADDRESS).is_ok()); + } + + #[test] + fn test_read_mpidr() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + let mut kvi: kvm_bindings::kvm_vcpu_init = 
kvm_bindings::kvm_vcpu_init::default(); + vm.get_preferred_target(&mut kvi).unwrap(); + + // Must fail when vcpu is not initialized yet. + assert!(read_mpidr(&vcpu).is_err()); + + vcpu.vcpu_init(&kvi).unwrap(); + assert_eq!(read_mpidr(&vcpu).unwrap(), 0x80000000); + } +} diff --git a/src/dragonball/src/dbs_arch/src/lib.rs b/src/dragonball/src/dbs_arch/src/lib.rs new file mode 100644 index 000000000..749ae181f --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/lib.rs @@ -0,0 +1,67 @@ +// Copyright 2021-2022 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(missing_docs)] + +//! CPU architecture specific constants, structures and utilities. +//! +//! This crate provides CPU architecture specific constants, structures and utilities to abstract +//! away CPU architecture specific details from the Dragonball Secure Sandbox or other VMMs. +//! +//! # Supported CPU Architectures +//! - **x86_64**: x86_64 (also known as x64, x86-64, AMD64, and Intel 64) is a 64-bit +//! version of the x86 instruction set. +//! - **ARM64**: AArch64 or ARM64 is the 64-bit extension of the ARM architecture. + +#[cfg(target_arch = "x86_64")] +mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +/// Enum indicating vpmu feature level +#[derive(Debug, Eq, PartialEq, Copy, Clone)] +pub enum VpmuFeatureLevel { + /// Disabled means vpmu feature is off (by default) + Disabled, + /// LimitedlyEnabled means minimal vpmu counters are supported( only cycles and instructions ) + /// For aarch64, LimitedlyEnabled isn't supported currently. The ability will be implemented in the future. 
+ LimitedlyEnabled, + /// FullyEnabled means all vpmu counters are supported + FullyEnabled, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_debug_trait() { + let level = VpmuFeatureLevel::Disabled; + assert_eq!(format!("{level:#?}"), "Disabled"); + + let level = VpmuFeatureLevel::LimitedlyEnabled; + assert_eq!(format!("{level:#?}"), "LimitedlyEnabled"); + + let level = VpmuFeatureLevel::FullyEnabled; + assert_eq!(format!("{level:#?}"), "FullyEnabled"); + } + + #[test] + fn test_eq_trait() { + let level = VpmuFeatureLevel::Disabled; + assert!(level == VpmuFeatureLevel::Disabled); + assert!(level != VpmuFeatureLevel::LimitedlyEnabled); + } + + #[test] + fn test_copy_trait() { + let level1 = VpmuFeatureLevel::Disabled; + let level2 = level1; + assert_eq!(level1, level2); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/bit_helper.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/bit_helper.rs new file mode 100644 index 000000000..108578c62 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/bit_helper.rs @@ -0,0 +1,599 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Helper to manipulate CPUID register content. + +#![macro_use] + +/// Structure representing a range of bits in a number. 
+/// +/// # Example +/// +/// ``` +/// #[macro_use] +/// use dbs_arch::cpuid::bit_helper::*; +/// +/// let range = BitRange { +/// msb_index: 7, +/// lsb_index: 3, +/// }; +/// ``` +/// The BitRange specified above will represent the following part of the number 72: +/// +-------------------------------------+---+---+---+---+---+---+---+---+---+---+ +/// | Base 2 Representation of the number | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | +/// +-------------------------------------+---+---+---+---+---+---+---+---+---+---+ +/// | bits indexes | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | +/// +-------------------------------------+---+---+---+---+---+---+---+---+---+---+ +/// | BitRange | | | * | * | * | * | * | | | | +/// +-------------------------------------+---+---+---+---+---+---+---+---+---+---+ +pub struct BitRange { + /// most significant bit index + pub msb_index: u32, + /// least significant bit index + pub lsb_index: u32, +} + +/// Trait containing helper methods for [`BitRange`](struct.BitRange.html) +/// +/// The methods are needed for: +/// - checking if the `BitRange` is valid for a type `T` +/// - creating masks for a type `T` +pub trait BitRangeExt { + /// Returns a value of type `T` that has all the bits in the specified bit range set to 1. + /// + /// # Example + /// + /// ``` + /// #[macro_use] + /// use dbs_arch::cpuid::bit_helper::*; + /// + /// let range = BitRange { + /// msb_index: 7, + /// lsb_index: 3, + /// }; + /// println!("binary value: {:b}", range.get_mask()); + /// ``` + /// The code above will print: + /// ```bash + /// binary value: 11111000 + /// ``` + fn get_mask(&self) -> T; + + /// Checks if the current BitRange is valid for type `T`. + fn is_valid(&self) -> bool; + + /// Asserts if `self.is_valid()` returns true. 
+ fn check(&self) { + assert!(self.is_valid(), "Invalid BitRange"); + } +} + +const MAX_U32_BIT_INDEX: u32 = 31; + +impl BitRangeExt for BitRange { + fn get_mask(&self) -> u32 { + self.check(); + + ((((1_u64) << (self.msb_index - self.lsb_index + 1)) - 1) << self.lsb_index) as u32 + } + + fn is_valid(&self) -> bool { + self.msb_index >= self.lsb_index && self.msb_index <= MAX_U32_BIT_INDEX + } +} + +macro_rules! bit_range { + ($msb_index:expr, $lsb_index:expr) => { + BitRange { + msb_index: $msb_index, + lsb_index: $lsb_index, + } + }; +} + +/// Trait containing helper methods for bit operations. +pub trait BitHelper { + /// Reads the value of the bit at position `pos` + fn read_bit(&self, pos: u32) -> bool; + + /// Changes the value of the bit at position `pos` to `val` + fn write_bit(&mut self, pos: u32, val: bool) -> &mut Self; + + /// Reads the value stored within the specified range of bits + /// + /// # Example + /// + /// ``` + /// #[macro_use] + /// use dbs_arch::cpuid::bit_helper::*; + /// + /// let val: u32 = 0b000010001000; + /// let range = BitRange { + /// msb_index: 7, + /// lsb_index: 3, + /// }; + /// println!("binary value: {:b}", val.read_bits_in_range(&range)); + /// ``` + /// The code above will print: + /// ```bash + /// binary value: 10001 + /// ``` + fn read_bits_in_range(&self, bit_range: &BitRange) -> Self; + + /// Stores a value within the specified range of bits + /// + /// # Example + /// + /// ``` + /// #[macro_use] + /// use dbs_arch::cpuid::bit_helper::*; + /// + /// let mut val: u32 = 0; + /// let range = BitRange { + /// msb_index: 7, + /// lsb_index: 3, + /// }; + /// val.write_bits_in_range(&range, 0b10001 as u32); + /// println!("binary value: {:b}", val); + /// ``` + /// The code above will print: + /// ```bash + /// binary value: 10001000 + /// ``` + fn write_bits_in_range(&mut self, bit_range: &BitRange, val: Self) -> &mut Self; +} + +impl BitHelper for u32 { + fn read_bit(&self, pos: u32) -> bool { + assert!(pos <= 
MAX_U32_BIT_INDEX, "Invalid pos"); + + (*self & (1 << pos)) > 0 + } + + fn write_bit(&mut self, pos: u32, val: bool) -> &mut Self { + assert!(pos <= MAX_U32_BIT_INDEX, "Invalid pos"); + + *self &= !(1 << pos); + *self |= (val as u32) << pos; + self + } + + fn read_bits_in_range(&self, range: &BitRange) -> Self { + range.check(); + + (self & range.get_mask()) >> range.lsb_index + } + + fn write_bits_in_range(&mut self, range: &BitRange, val: Self) -> &mut Self { + range.check(); + let mask = range.get_mask(); + let max_val = mask >> range.lsb_index; + assert!(val <= max_val, "Invalid val"); + + *self &= !mask; + *self |= val << range.lsb_index; + self + } +} + +#[cfg(test)] +mod tests { + use crate::cpuid::bit_helper::*; + + #[test] + #[should_panic] + fn test_invalid_msb_index() { + let range = BitRange { + msb_index: 32, + lsb_index: 2, + }; + range.check(); + } + + #[test] + #[should_panic] + fn test_invalid_range() { + let range = BitRange { + msb_index: 10, + lsb_index: 15, + }; + range.check(); + } + + #[test] + #[should_panic] + fn test_invalid_write_bit() { + // Set bit to 1 + let mut val: u32 = 0; + val.write_bit(32, true); + } + + #[test] + fn test_simple_write_bit() { + // Set bit to 1 + let mut val: u32 = 0; + val.write_bit(5, true); + assert!(val == 1 << 5); + + // Set bit to 0 + val = 1 << 5; + val.write_bit(5, false); + assert!(val == 0); + } + + #[test] + #[should_panic] + fn test_invalid_read_bit() { + // Set bit to 1 + let val: u32 = 0; + val.read_bit(32); + } + + #[test] + fn test_simple_read_bit() { + // Set bit to 1 + let val: u32 = 0b10_0000; + assert!(val.read_bit(5)); + assert!(!val.read_bit(4)); + } + + #[test] + fn test_chained_write_bit() { + let mut val: u32 = 1 << 12; + + val.write_bit(5, true) + .write_bit(10, true) + .write_bit(15, true) + .write_bit(12, false); + assert!(val == 1 << 5 | 1 << 10 | 1 << 15); + } + + #[test] + fn test_get_u32_mask_for_range() { + // Test a couple of successive ranges + assert!( + BitRange { + msb_index: 
3, + lsb_index: 2 + } + .get_mask() + == 0b1100 + ); + assert!( + BitRange { + msb_index: 4, + lsb_index: 2 + } + .get_mask() + == 0b11100 + ); + assert!( + BitRange { + msb_index: 5, + lsb_index: 2 + } + .get_mask() + == 0b11_1100 + ); + assert!( + BitRange { + msb_index: 6, + lsb_index: 2 + } + .get_mask() + == 0b111_1100 + ); + assert!( + BitRange { + msb_index: 7, + lsb_index: 2 + } + .get_mask() + == 0b1111_1100 + ); + } + + #[test] + #[should_panic] + fn test_invalid_read_bits() { + let val: u32 = 30; + val.read_bits_in_range(&BitRange { + msb_index: 32, + lsb_index: 2, + }); + } + + #[test] + fn test_read_bits() { + let val: u32 = 0b1000_0000_0000_0000_0011_0101_0001_0000; + + // Test a couple of successive ranges + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 3, + lsb_index: 2 + }) == 0b00 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 4, + lsb_index: 2 + }) == 0b100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 5, + lsb_index: 2 + }) == 0b0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 6, + lsb_index: 2 + }) == 0b00100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 7, + lsb_index: 2 + }) == 0b00_0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 8, + lsb_index: 2 + }) == 0b100_0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 9, + lsb_index: 2 + }) == 0b0100_0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 10, + lsb_index: 2 + }) == 0b1_0100_0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 11, + lsb_index: 2 + }) == 0b01_0100_0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 12, + lsb_index: 2 + }) == 0b101_0100_0100 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 13, + lsb_index: 2 + }) == 0b1101_0100_0100 + ); + + // Test max left and max right + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 31, + lsb_index: 15 + }) == 
0b1_0000_0000_0000_0000 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 14, + lsb_index: 0 + }) == 0b011_0101_0001_0000 + ); + assert!( + val.read_bits_in_range(&BitRange { + msb_index: 31, + lsb_index: 0 + }) == 0b1000_0000_0000_0000_0011_0101_0001_0000 + ); + } + + #[test] + #[should_panic] + fn test_invalid_write_bits() { + let mut val: u32 = 0; + + val.write_bits_in_range( + &BitRange { + msb_index: 32, + lsb_index: 2, + }, + 0b100, + ); + } + + #[test] + #[should_panic] + fn test_overflow_write_bits() { + let mut val: u32 = 0; + + val.write_bits_in_range( + &BitRange { + msb_index: 3, + lsb_index: 2, + }, + 0b100, + ); + } + + #[test] + fn test_simple_write_bits() { + let mut val: u32 = 0; + + // Test a couple of successive ranges + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 3, + lsb_index: 2 + }, + 0b00 + ) == &0b0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 4, + lsb_index: 2 + }, + 0b100 + ) == &0b10000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 5, + lsb_index: 2 + }, + 0b0100 + ) == &0b01_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 6, + lsb_index: 2 + }, + 0b0_0100 + ) == &0b001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 7, + lsb_index: 2 + }, + 0b00_0100 + ) == &0b0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 8, + lsb_index: 2 + }, + 0b100_0100 + ) == &0b1_0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 9, + lsb_index: 2 + }, + 0b0100_0100 + ) == &0b01_0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 10, + lsb_index: 2 + }, + 0b1_0100_0100 + ) == &0b101_0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 11, + lsb_index: 2 + }, + 0b01_0100_0100 + ) == &0b0101_0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 12, + lsb_index: 2 + }, + 0b101_0100_0100 
+ ) == &0b1_0101_0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 13, + lsb_index: 2 + }, + 0b1101_0100_0100 + ) == &0b11_0101_0001_0000 + ); + + // Test max left and max right + val = 0; + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 31, + lsb_index: 15 + }, + 0b1_0000_0000_0000_0000 + ) == &0b1000_0000_0000_0000_0000_0000_0000_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 14, + lsb_index: 0 + }, + 0b011_0101_0001_0000 + ) == &0b1000_0000_0000_0000_0011_0101_0001_0000 + ); + assert!( + val.write_bits_in_range( + &BitRange { + msb_index: 31, + lsb_index: 0 + }, + 0b1000_0000_0000_0000_0011_0101_0001_0000 + ) == &0b1000_0000_0000_0000_0011_0101_0001_0000 + ); + } + + #[test] + fn test_chained_write_bits() { + let mut val: u32 = 0; + + // Test a couple of ranges + val.write_bits_in_range( + &BitRange { + msb_index: 4, + lsb_index: 2, + }, + 0b100, + ) + .write_bits_in_range( + &BitRange { + msb_index: 12, + lsb_index: 10, + }, + 0b110, + ) + .write_bits_in_range( + &BitRange { + msb_index: 24, + lsb_index: 20, + }, + 0b10101, + ) + .write_bits_in_range( + &BitRange { + msb_index: 31, + lsb_index: 28, + }, + 0b1011, + ); + + assert!(val == 0b1011_0001_0101_0000_0001_1000_0001_0000); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/brand_string.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/brand_string.rs new file mode 100644 index 000000000..e9bc1df16 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/brand_string.rs @@ -0,0 +1,462 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::arch::x86_64::__cpuid as host_cpuid; +use std::slice; + +use crate::cpuid::common::{VENDOR_ID_AMD, VENDOR_ID_INTEL}; + +#[derive(Debug, Eq, PartialEq, Clone)] +pub enum Error { + NotSupported, + Overflow(String), +} + +/// Register designations used to get/set specific register values within the brand string buffer. +pub enum Reg { + Eax = 0, + Ebx = 1, + Ecx = 2, + Edx = 3, +} + +const BRAND_STRING_INTEL: &[u8] = b"Intel(R) Xeon(R) Processor"; +const BRAND_STRING_AMD: &[u8] = b"AMD EPYC"; + +/// A CPUID brand string wrapper, providing some efficient manipulation primitives. +/// +/// This is achieved by bypassing the `O(n)` indexing, heap allocation, and the unicode checks +/// done by `std::string::String`. +#[derive(Clone, Debug, Default)] +pub struct BrandString { + /// Flattened buffer, holding an array of 32-bit register values. + /// + /// It has the following layout: + /// reg_buf[0] = leaf_0x80000002.Eax + /// reg_buf[1] = leaf_0x80000002.Ebx + /// reg_buf[2] = leaf_0x80000002.Ecx + /// reg_buf[3] = leaf_0x80000002.Edx + /// reg_buf[4] = leaf_0x80000003.Eax + /// ... + /// reg_buf[10] = leaf_0x80000004.Ecx + /// reg_buf[11] = leaf_0x80000004.Edx + /// When seen as a byte-array, this buffer holds the ASCII-encoded CPU brand string. + reg_buf: [u32; BrandString::REG_BUF_SIZE], + + /// Actual string length, in bytes. + /// + /// E.g. For "Intel CPU", this would be `strlen("Intel CPU") == 9`. + len: usize, +} + +impl BrandString { + /// Register buffer size (in number of registers). + /// + /// There are 3 leaves (0x800000002 through 0x80000004), each with 4 regs (Eax, Ebx, Ecx, Edx). + const REG_BUF_SIZE: usize = 3 * 4; + + /// Max Brand string length, in bytes (also in chars, since it is ASCII-encoded). 
+ /// + /// The string is NULL-terminated, so the max string length is actually one byte + /// less than the buffer size in bytes + const MAX_LEN: usize = Self::REG_BUF_SIZE * 4 - 1; + + /// Creates an empty brand string (0-initialized) + fn new() -> Self { + Default::default() + } + + /// Generates the emulated brand string. + /// + /// For Intel CPUs, the brand string we expose will be: + /// "Intel(R) Xeon(R) Processor @ {host freq}" + /// where {host freq} is the CPU frequency, as present in the + /// host brand string (e.g. 4.01GHz). + /// + /// For AMD CPUs, the brand string we expose will be AMD EPYC. + /// + /// For other CPUs, we'll just expose an empty string. + /// + /// This is safe because we know BRAND_STRING_INTEL and BRAND_STRING_AMD to hold valid data + /// (allowed length and holding only valid ASCII chars). + pub fn from_vendor_id(vendor_id: &[u8; 12]) -> Result { + let brand = match vendor_id { + VENDOR_ID_INTEL => { + let mut this = BrandString::from_bytes_unchecked(BRAND_STRING_INTEL); + if let Ok(host_bstr) = BrandString::from_host_cpuid() { + if let Some(freq) = host_bstr.find_freq() { + this.push_bytes(b" @ ")?; + this.push_bytes(freq)?; + } + } + this + } + VENDOR_ID_AMD => BrandString::from_bytes_unchecked(BRAND_STRING_AMD), + _ => BrandString::from_bytes_unchecked(b""), + }; + + Ok(brand) + } + + /// Creates a brand string, initialized from the CPUID leaves 0x80000002 through 0x80000004 + /// of the host CPU. 
+ fn from_host_cpuid() -> Result { + let mut this = Self::new(); + let mut cpuid_regs = unsafe { host_cpuid(0x8000_0000) }; + + if cpuid_regs.eax < 0x8000_0004 { + // Brand string not supported by the host CPU + return Err(Error::NotSupported); + } + + for leaf in 0x8000_0002..=0x8000_0004 { + cpuid_regs = unsafe { host_cpuid(leaf) }; + this.set_reg_for_leaf(leaf, Reg::Eax, cpuid_regs.eax); + this.set_reg_for_leaf(leaf, Reg::Ebx, cpuid_regs.ebx); + this.set_reg_for_leaf(leaf, Reg::Ecx, cpuid_regs.ecx); + this.set_reg_for_leaf(leaf, Reg::Edx, cpuid_regs.edx); + } + + let mut len = Self::MAX_LEN; + { + let this_bytes = this.as_bytes(); + while this_bytes[len - 1] == 0 && len > 0 { + len -= 1; + } + } + this.len = len; + + Ok(this) + } + + /// Creates a (custom) brand string, initialized from `src`. + /// + /// No checks are performed on the length of `src` or its contents (`src` should be an + /// ASCII-encoded string). + #[inline] + fn from_bytes_unchecked(src: &[u8]) -> Self { + let mut this = Self::new(); + this.len = src.len(); + this.as_bytes_mut()[..src.len()].copy_from_slice(src); + this + } + + /// Returns the given register value for the given CPUID leaf. + /// + /// `leaf` must be between 0x80000002 and 0x80000004. + #[inline] + pub fn get_reg_for_leaf(&self, leaf: u32, reg: Reg) -> u32 { + if (0x80000002u32..=0x80000004).contains(&leaf) { + // It's ok not to validate parameters here, leaf and reg should + // both be compile-time constants. If there's something wrong with them, + // that's a programming error and we should panic anyway. + self.reg_buf[(leaf - 0x8000_0002) as usize * 4 + reg as usize] + } else { + 0 + } + } + + /// Sets the value for the given leaf/register pair. + /// + /// `leaf` must be between 0x80000002 and 0x80000004. + #[inline] + fn set_reg_for_leaf(&mut self, leaf: u32, reg: Reg, val: u32) { + // It's ok not to validate parameters here, leaf and reg should + // both be compile-time constants. 
If there's something wrong with them, + // that's a programming error and we should panic anyway. + self.reg_buf[(leaf - 0x8000_0002) as usize * 4 + reg as usize] = val; + } + + /// Gets an immutable `u8` slice view into the brand string buffer. + #[inline] + fn as_bytes(&self) -> &[u8] { + // This is actually safe, because self.reg_buf has a fixed, known size, + // and also there's no risk of misalignment, since we're downgrading + // alignment constraints from dword to byte. + unsafe { slice::from_raw_parts(self.reg_buf.as_ptr() as *const u8, Self::REG_BUF_SIZE * 4) } + } + + /// Gets a mutable `u8` slice view into the brand string buffer. + #[inline] + fn as_bytes_mut(&mut self) -> &mut [u8] { + unsafe { + slice::from_raw_parts_mut(self.reg_buf.as_mut_ptr() as *mut u8, Self::REG_BUF_SIZE * 4) + } + } + + /// Asserts whether or not there is enough room to append `src` to the brand string. + fn check_push(&mut self, src: &[u8]) -> bool { + src.len() <= Self::MAX_LEN - self.len + } + + /// Appends `src` to the brand string if there is enough room to append it. + fn push_bytes(&mut self, src: &[u8]) -> Result<(), Error> { + if !self.check_push(src) { + // No room to push all of src. + return Err(Error::Overflow( + "Appending to the brand string failed.".to_string(), + )); + } + let start = self.len; + let count = src.len(); + self.len += count; + self.as_bytes_mut()[start..(start + count)].copy_from_slice(src); + Ok(()) + } + + /// Searches the brand string for the CPU frequency data it may contain (e.g. 4.01GHz), + /// and, if found, returns it as an `u8` slice. + /// + /// Basically, we're implementing a search for this regex: "([0-9]+\.[0-9]+[MGT]Hz)". + fn find_freq(&self) -> Option<&[u8]> { + // The algorithm for matching the regular expression above is based + // on a Moore machine, and 'stage' represents the current state of + // the machine. + enum Stages { + /// Initial state, looking for a digit. + Initial, + /// Found integer part of the frequency. 
+ FoundFreqIntPart, + /// Found the decimal point. + FoundFreqDecimalPoint, + /// Found the decimal part. + FoundFreqDecimalPart, + /// Found the unit size. + FoundFreqUnitSize, + /// Found the H in 'Hz'. + FoundH, + } + + let mut freq_start = 0; + let mut decimal_start = 0; + + let mut stage = Stages::Initial; + + for (i, &ch) in self.as_bytes().iter().enumerate() { + match stage { + Stages::Initial => { + // Looking for one or more digits. + if ch.is_ascii_digit() { + freq_start = i; + stage = Stages::FoundFreqIntPart; + } + } + Stages::FoundFreqIntPart => { + // Looking for a decimal point. + if !ch.is_ascii_digit() { + if ch == b'.' { + stage = Stages::FoundFreqDecimalPoint; + } else { + stage = Stages::Initial; + } + } + } + Stages::FoundFreqDecimalPoint => { + // Looking for the decimal part. + if ch.is_ascii_digit() { + stage = Stages::FoundFreqDecimalPart; + decimal_start = i; + } else { + stage = Stages::Initial; + } + } + Stages::FoundFreqDecimalPart => { + // Looking for the unit of measure. + if !ch.is_ascii_digit() { + if ch == b'.' { + stage = Stages::FoundFreqDecimalPoint; + freq_start = decimal_start; + } else if ch == b'M' || ch == b'G' || ch == b'T' { + stage = Stages::FoundFreqUnitSize; + } else { + stage = Stages::Initial; + } + } + } + Stages::FoundFreqUnitSize => { + // Looking for the 'H' in 'Hz'. + if ch == b'H' { + stage = Stages::FoundH; + } else if ch.is_ascii_digit() { + stage = Stages::FoundFreqIntPart; + freq_start = i; + } else { + stage = Stages::Initial; + } + } + Stages::FoundH => { + // Looking for the 'z' in 'Hz'. + // If found, we stop the search and return the slice. 
+ if ch == b'z' { + let freq_end = i + 1; + return Some(&self.as_bytes()[freq_start..freq_end]); + } else if ch.is_ascii_digit() { + stage = Stages::FoundFreqIntPart; + freq_start = i; + } else { + stage = Stages::Initial; + } + } + }; + } + None + } +} + +#[cfg(test)] +mod tests { + use std::iter::repeat; + + use super::*; + + #[test] + fn test_brand_string() { + #[inline] + fn pack_u32(src: &[u8]) -> u32 { + assert!(src.len() >= 4); + u32::from(src[0]) + | (u32::from(src[1]) << 8) + | (u32::from(src[2]) << 16) + | (u32::from(src[3]) << 24) + } + + const TEST_STR: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + let mut bstr = BrandString::from_bytes_unchecked(TEST_STR); + + // Test the immutable bitwise casts + // + { + for i in 0_usize..=1_usize { + let eax_offs = (4 * 4) * i; + let ebx_offs = (4 * 4) * i + 4; + let ecx_offs = (4 * 4) * i + 8; + let edx_offs = (4 * 4) * i + 12; + assert_eq!( + bstr.get_reg_for_leaf(0x8000_0002 + i as u32, Reg::Eax), + pack_u32(&TEST_STR[eax_offs..(eax_offs + 4)]) + ); + assert_eq!( + bstr.get_reg_for_leaf(0x8000_0002 + i as u32, Reg::Ebx), + pack_u32(&TEST_STR[ebx_offs..(ebx_offs + 4)]) + ); + assert_eq!( + bstr.get_reg_for_leaf(0x8000_0002 + i as u32, Reg::Ecx), + pack_u32(&TEST_STR[ecx_offs..(ecx_offs + 4)]) + ); + assert_eq!( + bstr.get_reg_for_leaf(0x8000_0002 + i as u32, Reg::Edx), + pack_u32(&TEST_STR[edx_offs..(edx_offs + 4)]) + ); + } + } + + assert_eq!(bstr.get_reg_for_leaf(0x8000_0005, Reg::Eax), 0); + + // Test find_freq() failure path + // + assert!(bstr.find_freq().is_none()); + + // Test mutable bitwise casting and finding the frequency substring + // + bstr.set_reg_for_leaf(0x8000_0003, Reg::Ebx, pack_u32(b"5.20")); + bstr.set_reg_for_leaf(0x8000_0003, Reg::Ecx, pack_u32(b"GHz ")); + assert_eq!(bstr.find_freq().unwrap(), b"5.20GHz"); + + let _overflow: [u8; 50] = [b'a'; 50]; + + // Test BrandString::check_push() + // + bstr = BrandString::new(); + assert!(bstr.check_push(b"Hello")); + 
bstr.push_bytes(b"Hello").unwrap(); + assert!(bstr.check_push(b", world!")); + bstr.push_bytes(b", world!").unwrap(); + + assert!(!bstr.check_push(&_overflow)); + + // Test BrandString::push_bytes() + // + let actual_len = bstr.as_bytes().len(); + let mut old_bytes: Vec = repeat(0).take(actual_len).collect(); + old_bytes.copy_from_slice(bstr.as_bytes()); + assert_eq!( + bstr.push_bytes(&_overflow), + Err(Error::Overflow( + "Appending to the brand string failed.".to_string() + )) + ); + assert!(bstr.as_bytes().to_vec() == old_bytes); + + // Test BrandString::from_host_cpuid() and get_reg_for_leaf() + // + match BrandString::from_host_cpuid() { + Ok(bstr) => { + for leaf in 0x8000_0002..=0x8000_0004_u32 { + let host_regs = unsafe { host_cpuid(leaf) }; + assert_eq!(bstr.get_reg_for_leaf(leaf, Reg::Eax), host_regs.eax); + assert_eq!(bstr.get_reg_for_leaf(leaf, Reg::Ebx), host_regs.ebx); + assert_eq!(bstr.get_reg_for_leaf(leaf, Reg::Ecx), host_regs.ecx); + assert_eq!(bstr.get_reg_for_leaf(leaf, Reg::Edx), host_regs.edx); + } + } + Err(Error::NotSupported) => { + // from_host_cpuid() should only fail if the host CPU doesn't support + // CPUID leaves up to 0x80000004, so let's make sure that's what happened. 
+ let host_regs = unsafe { host_cpuid(0x8000_0000) }; + assert!(host_regs.eax < 0x8000_0004); + } + _ => panic!("This function should not return another type of error"), + } + + // Test BrandString::from_vendor_id() + let bstr = BrandString::from_vendor_id(VENDOR_ID_INTEL).unwrap(); + assert!(bstr.as_bytes().starts_with(BRAND_STRING_INTEL)); + let bstr = BrandString::from_vendor_id(VENDOR_ID_AMD).unwrap(); + assert!(bstr.as_bytes().starts_with(BRAND_STRING_AMD)); + let bstr = BrandString::from_vendor_id(b"............").unwrap(); + assert!(bstr.as_bytes() == vec![b'\0'; 48].as_slice()); + } + + #[test] + fn test_find_freq_fails() { + let bstr_thz = BrandString::from_bytes_unchecked(b"5.20THz"); + assert_eq!(bstr_thz.find_freq().unwrap(), b"5.20THz"); + + let bstr_unused_end = BrandString::from_bytes_unchecked(b"AAA5.20MHzXz"); + assert_eq!(bstr_unused_end.find_freq().unwrap(), b"5.20MHz"); + + let bstr_faulty_unit = BrandString::from_bytes_unchecked(b"5.20BHz "); + assert!(bstr_faulty_unit.find_freq().is_none()); + + let short_bstr = BrandString::from_bytes_unchecked(b"z"); + assert!(short_bstr.find_freq().is_none()); + + let skip_from_unit = BrandString::from_bytes_unchecked(b"Mz"); + assert!(skip_from_unit.find_freq().is_none()); + + let short_bstr = BrandString::from_bytes_unchecked(b"Hz"); + assert!(short_bstr.find_freq().is_none()); + + let short_bstr = BrandString::from_bytes_unchecked(b"GHz"); + assert!(short_bstr.find_freq().is_none()); + + let multiple_points_bstr = BrandString::from_bytes_unchecked(b"50.5.20GHz"); + assert_eq!(multiple_points_bstr.find_freq().unwrap(), b"5.20GHz"); + + let no_decimal_bstr = BrandString::from_bytes_unchecked(b"5GHz"); + assert!(no_decimal_bstr.find_freq().is_none()); + + let interrupted_bstr = BrandString::from_bytes_unchecked(b"500.00M5.20GHz"); + assert_eq!(interrupted_bstr.find_freq().unwrap(), b"5.20GHz"); + + let split_bstr = BrandString::from_bytes_unchecked(b"5.30AMHz"); + assert!(split_bstr.find_freq().is_none()); 
+ + let long_bstr = BrandString::from_bytes_unchecked(b"1.12bc5.30MaHz2.4.25THz"); + assert_eq!(long_bstr.find_freq().unwrap(), b"4.25THz"); + + let found_h_bstr = BrandString::from_bytes_unchecked(b"1.A5.2MH3.20GHx4.30GHz"); + assert_eq!(found_h_bstr.find_freq().unwrap(), b"4.30GHz"); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/common.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/common.rs new file mode 100644 index 000000000..292994a7b --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/common.rs @@ -0,0 +1,105 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::arch::x86_64::{CpuidResult, __cpuid_count, __get_cpuid_max}; + +use super::cpu_leaf::*; + +pub(crate) const VENDOR_ID_INTEL: &[u8; 12] = b"GenuineIntel"; +pub(crate) const VENDOR_ID_AMD: &[u8; 12] = b"AuthenticAMD"; +pub(crate) const VENDOR_ID_HYGON: &[u8; 12] = b"HygonGenuine"; + +#[derive(Clone, Debug)] +pub enum Error { + InvalidParameters(String), + NotSupported, +} + +/// Get CPUID value for (`function`, `count`). 
+pub fn get_cpuid(function: u32, count: u32) -> Result { + #[cfg(target_env = "sgx")] + { + return Err(Error::NotSupported); + } + + // TODO: replace with validation based on `has_cpuid()` when it becomes stable: + // https://doc.rust-lang.org/core/arch/x86/fn.has_cpuid.html + // this is safe because the host supports the `cpuid` instruction + let max_function = unsafe { __get_cpuid_max(function & leaf_0x80000000::LEAF_NUM).0 }; + if function > max_function { + return Err(Error::InvalidParameters(format!( + "Function not supported: 0x{function:x}", + ))); + } + + // this is safe because the host supports the `cpuid` instruction + let entry = unsafe { __cpuid_count(function, count) }; + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { + return Err(Error::InvalidParameters(format!("Invalid count: {count}"))); + } + + Ok(entry) +} + +/// Extracts the CPU vendor id from leaf 0x0. +pub fn get_vendor_id() -> Result<[u8; 12], Error> { + let vendor_entry = get_cpuid(0, 0)?; + let bytes: [u8; 12] = + unsafe { std::mem::transmute([vendor_entry.ebx, vendor_entry.edx, vendor_entry.ecx]) }; + + Ok(bytes) +} + +#[cfg(test)] +pub mod tests { + use super::*; + + pub fn get_topoext_fn() -> u32 { + let vendor_id = get_vendor_id(); + assert!(vendor_id.is_ok()); + let function = match &vendor_id.ok().unwrap() { + VENDOR_ID_INTEL => leaf_0x4::LEAF_NUM, + VENDOR_ID_AMD => leaf_0x8000001d::LEAF_NUM, + _ => 0, + }; + assert!(function != 0); + + function + } + + #[test] + fn test_get_cpu_id() { + // get_cpu_id should work correctly here + let topoext_fn = get_topoext_fn(); + + // check that get_cpuid works for valid parameters + match get_cpuid(topoext_fn, 0) { + Ok(topoext_entry) => { + assert!(topoext_entry.eax != 0); + } + _ => panic!("Wrong behavior"), + } + + // check that get_cpuid returns correct error for invalid `function` + match get_cpuid(0x9000_0000, 0) { + Err(Error::InvalidParameters(s)) => { + assert!(s == "Function not supported: 0x90000000"); + } + 
_ => panic!("Wrong behavior"), + } + + // check that get_cpuid returns correct error for invalid `count` + match get_cpuid(topoext_fn, 100) { + Err(Error::InvalidParameters(s)) => { + assert!(s == "Invalid count: 100"); + } + _ => panic!("Wrong behavior"), + } + } + + #[test] + fn test_get_vendor_id() { + let vendor_id = get_vendor_id().unwrap(); + assert!(matches!(&vendor_id, VENDOR_ID_INTEL | VENDOR_ID_AMD)); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/cpu_leaf.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/cpu_leaf.rs new file mode 100644 index 000000000..0c121cdd2 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/cpu_leaf.rs @@ -0,0 +1,439 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![allow(missing_docs)] + +//! CPUID leaf registers constant values. + +#![allow(unused)] +pub mod leaf_0x0 { + pub const LEAF_NUM: u32 = 0x0; +} + +pub mod leaf_0x1 { + pub const LEAF_NUM: u32 = 0x1; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + pub const EXTENDED_FAMILY_ID_BITRANGE: BitRange = bit_range!(27, 20); + pub const EXTENDED_PROCESSOR_MODEL_BITRANGE: BitRange = bit_range!(19, 16); + pub const PROCESSOR_TYPE_BITRANGE: BitRange = bit_range!(13, 12); + pub const PROCESSOR_FAMILY_BITRANGE: BitRange = bit_range!(11, 8); + pub const PROCESSOR_MODEL_BITRANGE: BitRange = bit_range!(7, 4); + pub const STEPPING_BITRANGE: BitRange = bit_range!(3, 0); + } + + pub mod ebx { + use crate::cpuid::bit_helper::BitRange; + + // The bit-range containing the (fixed) default APIC ID. + pub const APICID_BITRANGE: BitRange = bit_range!(31, 24); + // The bit-range containing the logical processor count. + pub const CPU_COUNT_BITRANGE: BitRange = bit_range!(23, 16); + // The bit-range containing the number of bytes flushed when executing CLFLUSH. 
+ pub const CLFLUSH_SIZE_BITRANGE: BitRange = bit_range!(15, 8); + } + + pub mod ecx { + // DTES64 = 64-bit debug store + pub const DTES64_BITINDEX: u32 = 2; + // MONITOR = Monitor/MWAIT + pub const MONITOR_BITINDEX: u32 = 3; + // CPL Qualified Debug Store + pub const DS_CPL_SHIFT: u32 = 4; + // 5 = VMX (Virtual Machine Extensions) + // 6 = SMX (Safer Mode Extensions) + // 7 = EIST (Enhanced Intel SpeedStep® technology) + // TM2 = Thermal Monitor 2 + pub const TM2_BITINDEX: u32 = 8; + // CNXT_ID = L1 Context ID (L1 data cache can be set to adaptive/shared mode) + pub const CNXT_ID_BITINDEX: u32 = 10; + // SDBG (cpu supports IA32_DEBUG_INTERFACE MSR for silicon debug) + pub const SDBG_BITINDEX: u32 = 11; + pub const FMA_BITINDEX: u32 = 12; + // XTPR_UPDATE = xTPR Update Control + pub const XTPR_UPDATE_BITINDEX: u32 = 14; + // PDCM = Perfmon and Debug Capability + pub const PDCM_BITINDEX: u32 = 15; + // 18 = DCA Direct Cache Access (prefetch data from a memory mapped device) + pub const MOVBE_BITINDEX: u32 = 22; + pub const TSC_DEADLINE_TIMER_BITINDEX: u32 = 24; + pub const OSXSAVE_BITINDEX: u32 = 27; + // Cpu is running on a hypervisor. + pub const HYPERVISOR_BITINDEX: u32 = 31; + } + + pub mod edx { + pub const PSN_BITINDEX: u32 = 18; // Processor Serial Number + pub const DS_BITINDEX: u32 = 21; // Debug Store. + pub const ACPI_BITINDEX: u32 = 22; // Thermal Monitor and Software Controlled Clock Facilities. + pub const SS_BITINDEX: u32 = 27; // Self Snoop + pub const HTT_BITINDEX: u32 = 28; // Max APIC IDs reserved field is valid + pub const TM_BITINDEX: u32 = 29; // Thermal Monitor. + pub const PBE_BITINDEX: u32 = 31; // Pending Break Enable. 
+ } +} + +pub mod leaf_cache_parameters { + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + pub const CACHE_LEVEL_BITRANGE: BitRange = bit_range!(7, 5); + pub const MAX_CPUS_PER_CORE_BITRANGE: BitRange = bit_range!(25, 14); + } +} + +// Deterministic Cache Parameters Leaf +pub mod leaf_0x4 { + pub const LEAF_NUM: u32 = 0x4; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + // inherit eax from leaf_cache_parameters + pub use crate::cpuid::cpu_leaf::leaf_cache_parameters::eax::*; + + pub const MAX_CORES_PER_PACKAGE_BITRANGE: BitRange = bit_range!(31, 26); + } +} + +// Thermal and Power Management Leaf +#[allow(dead_code)] +pub mod leaf_0x6 { + pub const LEAF_NUM: u32 = 0x6; + + pub mod eax { + pub const TURBO_BOOST_BITINDEX: u32 = 1; + } + + pub mod ecx { + // "Energy Performance Bias" bit. + pub const EPB_BITINDEX: u32 = 3; + } +} + +// Structured Extended Feature Flags Enumeration Leaf +pub mod leaf_0x7 { + pub const LEAF_NUM: u32 = 0x7; + + pub mod index0 { + pub mod ebx { + // 1 = TSC_ADJUST + pub const SGX_BITINDEX: u32 = 2; + pub const BMI1_BITINDEX: u32 = 3; + pub const HLE_BITINDEX: u32 = 4; + pub const AVX2_BITINDEX: u32 = 5; + // FPU Data Pointer updated only on x87 exceptions if 1. 
+ pub const FPDP_BITINDEX: u32 = 6; + // 7 = SMEP (Supervisor-Mode Execution Prevention if 1) + pub const BMI2_BITINDEX: u32 = 8; + // 9 = Enhanced REP MOVSB/STOSB if 1 + // 10 = INVPCID + pub const INVPCID_BITINDEX: u32 = 10; + pub const RTM_BITINDEX: u32 = 11; + // Intel® Resource Director Technology (Intel® RDT) Monitoring + pub const RDT_M_BITINDEX: u32 = 12; + // 13 = Deprecates FPU CS and FPU DS values if 1 + // Memory Protection Extensions + pub const MPX_BITINDEX: u32 = 14; + // RDT = Intel® Resource Director Technology + pub const RDT_A_BITINDEX: u32 = 15; + // AVX-512 Foundation instructions + pub const AVX512F_BITINDEX: u32 = 16; + // AVX-512 Doubleword and Quadword Instructions + pub const AVX512DQ_BITINDEX: u32 = 17; + pub const RDSEED_BITINDEX: u32 = 18; + pub const ADX_BITINDEX: u32 = 19; + // 20 = SMAP (Supervisor-Mode Access Prevention) + // AVX512IFMA = AVX-512 Integer Fused Multiply-Add Instructions + pub const AVX512IFMA_BITINDEX: u32 = 21; + // 21 = PCOMMIT intruction + // 22 reserved + // CLFLUSHOPT (flushing multiple cache lines in parallel within a single logical processor) + pub const CLFLUSHOPT_BITINDEX: u32 = 23; + // CLWB = Cache Line Write Back + pub const CLWB_BITINDEX: u32 = 24; + // PT = Intel Processor Trace + pub const PT_BITINDEX: u32 = 25; + // AVX512PF = AVX512 Prefetch Instructions + pub const AVX512PF_BITINDEX: u32 = 26; + // AVX512ER = AVX-512 Exponential and Reciprocal Instructions + pub const AVX512ER_BITINDEX: u32 = 27; + // AVX512CD = AVX-512 Conflict Detection Instructions + pub const AVX512CD_BITINDEX: u32 = 28; + // Intel Secure Hash Algorithm Extensions + pub const SHA_BITINDEX: u32 = 29; + // AVX-512 Byte and Word Instructions + pub const AVX512BW_BITINDEX: u32 = 30; + // AVX-512 Vector Length Extensions + pub const AVX512VL_BITINDEX: u32 = 31; + } + + pub mod ecx { + // 0 = PREFETCHWT1 (move data closer to the processor in anticipation of future use) + // AVX512_VBMI = AVX-512 Vector Byte Manipulation Instructions + 
pub const AVX512_VBMI_BITINDEX: u32 = 1; + // 2 = UMIP (User Mode Instruction Prevention) + // PKU = Protection Keys for user-mode pages + pub const PKU_BITINDEX: u32 = 3; + // OSPKE = If 1, OS has set CR4.PKE to enable protection keys + pub const OSPKE_BITINDEX: u32 = 4; + // 5 = WAITPKG + // 7-6 reserved + // 8 = GFNI + // 13-09 reserved + // AVX512_VPOPCNTDQ = Vector population count instruction (Intel® Xeon Phiâ„¢ only.) + pub const AVX512_VPOPCNTDQ_BITINDEX: u32 = 14; + // 21 - 17 = The value of MAWAU used by the BNDLDX and BNDSTX instructions in 64-bit mode. + // Read Processor ID + pub const RDPID_BITINDEX: u32 = 22; + // 23 - 29 reserved + // SGX_LC = SGX Launch Configuration + pub const SGX_LC_BITINDEX: u32 = 30; + // 31 reserved + } + + pub mod edx { + // AVX-512 4-register Neural Network Instructions + pub const AVX512_4VNNIW_BITINDEX: u32 = 2; + // AVX-512 4-register Multiply Accumulation Single precision + pub const AVX512_4FMAPS_BITINDEX: u32 = 3; + pub const ARCH_CAPABILITIES_BITINDEX: u32 = 29; + } + } +} + +// Architecture Performance Monitor Features +pub mod leaf_0xa { + pub const LEAF_NUM: u32 = 0xa; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + pub const PMC_VERSION_ID: BitRange = bit_range!(7, 0); + pub const BIT_LEN_PMEVENT: BitRange = bit_range!(31, 24); + } + + pub mod ebx { + pub const CORE_CYCLES_BITINDEX: u32 = 0; + pub const INST_RETIRED_BITINDEX: u32 = 1; + pub const REF_CYCLES_BITINDEX: u32 = 2; + pub const LLC_REF_BITINDEX: u32 = 3; + pub const LLC_MISSES_BITINDEX: u32 = 4; + pub const BR_INST_RETIRED_BITINDEX: u32 = 5; + pub const BR_MIS_RETIRED_BITINDEX: u32 = 6; + } +} + +// Extended Topology Leaf +pub mod leaf_0xb { + pub const LEAF_NUM: u32 = 0xb; + + pub const LEVEL_TYPE_THREAD: u32 = 1; + pub const LEVEL_TYPE_CORE: u32 = 2; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + // The bit-range containing the number of bits to shift right the APIC ID in order to get + // the next level APIC ID + pub 
const APICID_BITRANGE: BitRange = bit_range!(4, 0); + } + + pub mod ebx { + use crate::cpuid::bit_helper::BitRange; + + // The bit-range containing the number of factory-configured logical processors + // at the current cache level + pub const NUM_LOGICAL_PROCESSORS_BITRANGE: BitRange = bit_range!(15, 0); + } + + pub mod ecx { + use crate::cpuid::bit_helper::BitRange; + + pub const LEVEL_TYPE_BITRANGE: BitRange = bit_range!(15, 8); + pub const LEVEL_NUMBER_BITRANGE: BitRange = bit_range!(7, 0); + } +} + +// Processor Extended State Enumeration Sub-leaves +pub mod leaf_0xd { + pub const LEAF_NUM: u32 = 0xd; + + pub mod index0 { + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + pub const MPX_STATE_BITRANGE: BitRange = bit_range!(4, 3); + pub const AVX512_STATE_BITRANGE: BitRange = bit_range!(7, 5); + } + } + + pub mod index1 { + pub mod eax { + pub const XSAVEC_SHIFT: u32 = 1; + pub const XGETBV_SHIFT: u32 = 2; + pub const XSAVES_SHIFT: u32 = 3; + } + } +} + +// V2 Extended Topology Enumeration Leaf +pub mod leaf_0x1f { + pub const LEAF_NUM: u32 = 0x1f; + + pub const LEVEL_TYPE_THREAD: u32 = 1; + pub const LEVEL_TYPE_CORE: u32 = 2; + pub const LEVEL_TYPE_DIE: u32 = 5; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + // The bit-range containing the number of bits to shift right the APIC ID in order to get + // the next level APIC ID + pub const APICID_BITRANGE: BitRange = bit_range!(4, 0); + } + + pub mod ebx { + use crate::cpuid::bit_helper::BitRange; + + // The bit-range containing the number of factory-configured logical processors + // at the current cache level + pub const NUM_LOGICAL_PROCESSORS_BITRANGE: BitRange = bit_range!(15, 0); + } + + pub mod ecx { + use crate::cpuid::bit_helper::BitRange; + + pub const LEVEL_TYPE_BITRANGE: BitRange = bit_range!(15, 8); + pub const LEVEL_NUMBER_BITRANGE: BitRange = bit_range!(7, 0); + } +} + +/// KVM CPUID bits +/// A guest running on a kvm host, can check some of its features using cpuid. 
This is not always guaranteed to work, +/// since userspace can mask-out some, or even all KVM-related cpuid features before launching a guest. +/// More information: https://docs.kernel.org/virt/kvm/x86/cpuid.html +pub mod leaf_0x4000_0001 { + pub const LEAF_NUM: u32 = 0x4000_0001; + pub mod eax { + /// kvmclock available at msrs 0x11 and 0x12 + pub const KVM_FEATURE_CLOCKSOURCE_BITINDEX: u32 = 0; + /// not necessary to perform delays on PIO operations + pub const KVM_FEATURE_NOP_IO_DELAY_BITINDEX: u32 = 1; + /// deprecated + pub const KVM_FEATURE_MMU_OP_BITINDEX: u32 = 2; + /// kvmclock available at msrs 0x4b564d00 and 0x4b564d01 + pub const KVM_FEATURE_CLOCKSOURCE2_BITINDEX: u32 = 3; + /// async pf can be enabled by writing to msr 0x4b564d02 + pub const KVM_FEATURE_ASYNC_PF_BITINDEX: u32 = 4; + /// steal time can be enabled by writing to msr 0x4b564d03 + pub const KVM_FEATURE_STEAL_TIME_BITINDEX: u32 = 5; + /// paravirtualized end of interrupt handler can be enabled by writing to msr 0x4b564d04 + pub const KVM_FEATURE_PV_EOI_BITINDEX: u32 = 6; + /// guest checks this feature bit before enabling paravirtualized spinlock support + pub const KVM_FEATURE_PV_UNHALT_BITINDEX: u32 = 7; + /// guest checks this feature bit before enabling paravirtualized tlb flush + pub const KVM_FEATURE_PV_TLB_FLUSH_BITINDEX: u32 = 9; + /// paravirtualized async PF VM EXIT can be enabled by setting bit 2 when writing to msr 0x4b564d02 + pub const KVM_FEATURE_ASYNC_PF_VMEXIT_BITINDEX: u32 = 10; + /// guest checks this feature bit before enabling paravirtualized send IPIs + pub const KVM_FEATURE_PV_SEND_IPI_BITINDEX: u32 = 11; + /// host-side polling on HLT can be disabled by writing to msr 0x4b564d05. + pub const KVM_FEATURE_POLL_CONTROL_BITINDEX: u32 = 12; + /// guest checks this feature bit before using paravirtualized sched yield. 
+ pub const KVM_FEATURE_PV_SCHED_YIELD_BITINDEX: u32 = 13; + /// guest checks this feature bit before using the second async pf control msr 0x4b564d06 and async pf acknowledgment msr 0x4b564d07. + pub const KVM_FEATURE_ASYNC_PF_INT_BITINDEX: u32 = 14; + /// guest checks this feature bit before using extended destination ID bits in MSI address bits 11-5. + pub const KVM_FEATURE_MSI_EXT_DEST_ID_BITINDEX: u32 = 15; + /// guest checks this feature bit before using the map gpa range hypercall to notify the page state change + pub const KVM_FEATURE_HC_MAP_GPA_RANGE_BITINDEX: u32 = 16; + /// guest checks this feature bit before using MSR_KVM_MIGRATION_CONTROL + pub const KVM_FEATURE_MIGRATION_CONTROL_BITINDEX: u32 = 17; + /// host will warn if no guest-side per-cpu warps are expected in kvmclock + pub const KVM_FEATURE_CLOCKSOURCE_STABLE_BITINDEX: u32 = 24; + } +} + +pub mod leaf_0x80000000 { + pub const LEAF_NUM: u32 = 0x8000_0000; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + pub const LARGEST_EXTENDED_FN_BITRANGE: BitRange = bit_range!(31, 0); + } +} + +pub mod leaf_0x80000001 { + pub const LEAF_NUM: u32 = 0x8000_0001; + + pub mod ecx { + pub const TOPOEXT_INDEX: u32 = 22; + pub const PREFETCH_BITINDEX: u32 = 8; // 3DNow! PREFETCH/PREFETCHW instructions + pub const LZCNT_BITINDEX: u32 = 5; // advanced bit manipulation + } + + pub mod edx { + pub const PDPE1GB_BITINDEX: u32 = 26; // 1-GByte pages are available if 1. 
+ } +} + +pub mod leaf_0x80000008 { + pub const LEAF_NUM: u32 = 0x8000_0008; + + pub mod ecx { + use crate::cpuid::bit_helper::BitRange; + + // The number of bits in the initial ApicId value that indicate thread ID within a package + // Possible values: + // 0-3 -> Reserved + // 4 -> 1 Die, up to 16 threads + // 5 -> 2 Die, up to 32 threads + // 6 -> 3,4 Die, up to 64 threads + pub const THREAD_ID_SIZE_BITRANGE: BitRange = bit_range!(15, 12); + // The number of threads in the package - 1 + pub const NUM_THREADS_BITRANGE: BitRange = bit_range!(7, 0); + } +} + +// Extended Cache Topology Leaf +pub mod leaf_0x8000001d { + pub const LEAF_NUM: u32 = 0x8000_001d; + + // inherit eax from leaf_cache_parameters + pub use crate::cpuid::cpu_leaf::leaf_cache_parameters::eax; +} + +// Extended APIC ID Leaf +pub mod leaf_0x8000001e { + pub const LEAF_NUM: u32 = 0x8000_001e; + + pub mod eax { + use crate::cpuid::bit_helper::BitRange; + + pub const EXTENDED_APIC_ID_BITRANGE: BitRange = bit_range!(31, 0); + } + + pub mod ebx { + use crate::cpuid::bit_helper::BitRange; + + // The number of threads per core - 1 + pub const THREADS_PER_CORE_BITRANGE: BitRange = bit_range!(15, 8); + pub const CORE_ID_BITRANGE: BitRange = bit_range!(7, 0); + } + + pub mod ecx { + use crate::cpuid::bit_helper::BitRange; + + // The number of nodes per processor. Possible values: + // 0 -> 1 node per processor + // 1 -> 2 nodes per processor + // 2 -> Reserved + // 3 -> 4 nodes per processor + pub const NODES_PER_PROCESSOR_BITRANGE: BitRange = bit_range!(10, 8); + pub const NODE_ID_BITRANGE: BitRange = bit_range!(7, 0); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/mod.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/mod.rs new file mode 100644 index 000000000..65855a86e --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/mod.rs @@ -0,0 +1,76 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Utilities for configuring the CPUID (CPU identification) for the guest microVM. + +pub mod bit_helper; +pub mod cpu_leaf; + +mod brand_string; +mod common; +mod transformer; + +pub use transformer::{Error, VmSpec}; + +pub use crate::VpmuFeatureLevel; + +type CpuId = kvm_bindings::CpuId; +type CpuIdEntry = kvm_bindings::kvm_cpuid_entry2; + +/// Setup CPUID entries for the given vCPU. +/// +/// # Arguments +/// +/// * `kvm_cpuid` - KVM related structure holding the relevant CPUID info. +/// * `vm_spec` - The specifications of the VM. +/// +/// # Example +/// ```ignore +/// use dbs_arch::cpuid::{process_cpuid, VmSpec, VpmuFeatureLevel}; +/// use kvm_bindings::{CpuId, KVM_MAX_CPUID_ENTRIES}; +/// use kvm_ioctls::Kvm; +/// +/// let kvm = Kvm::new().unwrap(); +/// let mut kvm_cpuid: CpuId = kvm.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES).unwrap(); +/// +/// let vm_spec = VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).unwrap(); +/// +/// process_cpuid(&mut kvm_cpuid, &vm_spec).unwrap(); +/// +/// // Get expected `kvm_cpuid` entries. 
+/// let entries = kvm_cpuid.as_mut_slice(); +/// ``` +pub fn process_cpuid(kvm_cpuid: &mut CpuId, vm_spec: &VmSpec) -> Result<(), Error> { + use transformer::CpuidTransformer; + + match vm_spec.cpu_vendor_id() { + self::common::VENDOR_ID_INTEL => { + self::transformer::intel::IntelCpuidTransformer::new().process_cpuid(kvm_cpuid, vm_spec) + } + self::common::VENDOR_ID_AMD => { + self::transformer::amd::AmdCpuidTransformer::new().process_cpuid(kvm_cpuid, vm_spec) + } + self::common::VENDOR_ID_HYGON => { + self::transformer::amd::AmdCpuidTransformer::new().process_cpuid(kvm_cpuid, vm_spec) + } + _ => Err(Error::CpuNotSupported), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_invalid_cpuid() { + let mut cpuid = CpuId::new(0).unwrap(); + let vm_spec = VmSpec::new(0, 2, 1, 1, 1, VpmuFeatureLevel::Disabled).unwrap(); + + process_cpuid(&mut cpuid, &vm_spec).unwrap(); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/amd.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/amd.rs new file mode 100644 index 000000000..8ed7b73f3 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/amd.rs @@ -0,0 +1,412 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + +use super::super::bit_helper::BitHelper; +use super::super::cpu_leaf; +use super::*; + +// Largest extended function. It has to be larger than 0x8000001d (Extended Cache Topology). +const LARGEST_EXTENDED_FN: u32 = 0x8000_001f; +// This value allows at most 256 logical threads within a package. But we currently only support +// less than or equal to 254vcpus. +// See also the documentation for leaf_0x80000008::ecx::THREAD_ID_SIZE_BITRANGE +const THREAD_ID_MAX_SIZE: u32 = 8; +// This value means there is 1 node per processor. 
+// See also the documentation for leaf_0x8000001e::ecx::NODES_PER_PROCESSOR_BITRANGE. +const NODES_PER_PROCESSOR: u32 = 0; + +fn update_structured_extended_entry( + entry: &mut CpuIdEntry, + _vm_spec: &VmSpec, +) -> Result<(), Error> { + use cpu_leaf::leaf_0x7::index0::*; + + // according to the EPYC PPR, only the leaf 0x7 with index 0 contains the + // structured extended feature identifiers + if entry.index == 0 { + // KVM sets this bit no matter what but this feature is not supported by hardware + entry.edx.write_bit(edx::ARCH_CAPABILITIES_BITINDEX, false); + } + + Ok(()) +} + +fn update_largest_extended_fn_entry( + entry: &mut CpuIdEntry, + _vm_spec: &VmSpec, +) -> Result<(), Error> { + use cpu_leaf::leaf_0x80000000::*; + + // KVM sets the largest extended function to 0x80000000. Change it to 0x8000001f + // Since we also use the leaf 0x8000001d (Extended Cache Topology). + entry + .eax + .write_bits_in_range(&eax::LARGEST_EXTENDED_FN_BITRANGE, LARGEST_EXTENDED_FN); + + Ok(()) +} + +fn update_extended_feature_info_entry( + entry: &mut CpuIdEntry, + _vm_spec: &VmSpec, +) -> Result<(), Error> { + use crate::cpuid::cpu_leaf::leaf_0x80000001::*; + + // set the Topology Extension bit since we use the Extended Cache Topology leaf + entry.ecx.write_bit(ecx::TOPOEXT_INDEX, true); + + Ok(()) +} + +fn update_amd_features_entry(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error> { + use cpu_leaf::leaf_0x80000008::*; + + // We don't support more then 254 threads right now. 
+ entry + .ecx + .write_bits_in_range(&ecx::THREAD_ID_SIZE_BITRANGE, THREAD_ID_MAX_SIZE) + .write_bits_in_range(&ecx::NUM_THREADS_BITRANGE, u32::from(vm_spec.cpu_count - 1)); + + Ok(()) +} + +fn update_extended_cache_topology_entry( + entry: &mut CpuIdEntry, + vm_spec: &VmSpec, +) -> Result<(), Error> { + entry.flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + + common::update_cache_parameters_entry(entry, vm_spec) +} + +fn update_extended_apic_id_entry(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error> { + use crate::cpuid::cpu_leaf::leaf_0x8000001e::*; + + let mut core_id = u32::from(vm_spec.cpu_id); + // When hyper-threading is enabled each pair of 2 consecutive logical CPUs + // will have the same core id since they represent 2 threads in the same core. + // For Example: + // logical CPU 0 -> core id: 0 + // logical CPU 1 -> core id: 0 + // logical CPU 2 -> core id: 1 + // logical CPU 3 -> core id: 1 + if vm_spec.threads_per_core == 2 { + core_id /= 2; + } + + entry + .eax + // the Extended APIC ID is the id of the current logical CPU + .write_bits_in_range(&eax::EXTENDED_APIC_ID_BITRANGE, u32::from(vm_spec.cpu_id)); + + entry + .ebx + .write_bits_in_range(&ebx::CORE_ID_BITRANGE, core_id) + .write_bits_in_range( + &ebx::THREADS_PER_CORE_BITRANGE, + u32::from(vm_spec.threads_per_core - 1), + ); + + entry + .ecx + .write_bits_in_range(&ecx::NODES_PER_PROCESSOR_BITRANGE, NODES_PER_PROCESSOR) + // Put all the cpus in the same node. 
+ .write_bits_in_range(&ecx::NODE_ID_BITRANGE, 0); + + Ok(()) +} + +#[derive(Default)] +pub struct AmdCpuidTransformer {} + +impl AmdCpuidTransformer { + pub fn new() -> Self { + Default::default() + } +} + +impl CpuidTransformer for AmdCpuidTransformer { + fn process_cpuid(&self, cpuid: &mut CpuId, vm_spec: &VmSpec) -> Result<(), Error> { + use cpu_leaf::*; + + common::use_host_cpuid_function(cpuid, leaf_0x0::LEAF_NUM, false)?; + common::use_host_cpuid_function(cpuid, leaf_0x8000001d::LEAF_NUM, false)?; + common::use_host_cpuid_function(cpuid, leaf_0x8000001d::LEAF_NUM, true)?; + self.process_entries(cpuid, vm_spec) + } + + fn entry_transformer_fn(&self, entry: &mut CpuIdEntry) -> Option { + use cpu_leaf::*; + + match entry.function { + leaf_0x1::LEAF_NUM => Some(common::update_feature_info_entry), + leaf_0x7::LEAF_NUM => Some(update_structured_extended_entry), + leaf_0xb::LEAF_NUM => Some(common::update_extended_topology_entry), + leaf_0x1f::LEAF_NUM => Some(common::update_extended_topology_v2_entry), + leaf_0x80000000::LEAF_NUM => Some(update_largest_extended_fn_entry), + leaf_0x80000001::LEAF_NUM => Some(update_extended_feature_info_entry), + leaf_0x80000008::LEAF_NUM => Some(update_amd_features_entry), + leaf_0x8000001d::LEAF_NUM => Some(update_extended_cache_topology_entry), + leaf_0x8000001e::LEAF_NUM => Some(update_extended_apic_id_entry), + 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + _ => None, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_transformer_construct() { + use cpu_leaf::leaf_0x7::index0::*; + + let transformer = AmdCpuidTransformer::new(); + + let vm_spec = + VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).expect("Error creating vm_spec"); + let mut cpuid = CpuId::from_entries(&[CpuIdEntry { + function: cpu_leaf::leaf_0x7::LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: *(0_u32).write_bit(edx::ARCH_CAPABILITIES_BITINDEX, true), + padding: [0, 0, 0], + }]) + 
.unwrap(); + + transformer.process_cpuid(&mut cpuid, &vm_spec).unwrap(); + } + + #[test] + fn test_update_structured_extended_entry() { + use cpu_leaf::leaf_0x7::index0::*; + + // Check that if index == 0 the entry is processed + let vm_spec = + VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).expect("Error creating vm_spec"); + let entry = &mut CpuIdEntry { + function: cpu_leaf::leaf_0x7::LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: *(0_u32).write_bit(edx::ARCH_CAPABILITIES_BITINDEX, true), + padding: [0, 0, 0], + }; + assert!(update_structured_extended_entry(entry, &vm_spec).is_ok()); + assert!(!entry.edx.read_bit(edx::ARCH_CAPABILITIES_BITINDEX)); + + // Check that if index != 0 the entry is not processed + entry.index = 1; + entry.edx.write_bit(edx::ARCH_CAPABILITIES_BITINDEX, true); + assert!(update_structured_extended_entry(entry, &vm_spec).is_ok()); + assert!(entry.edx.read_bit(edx::ARCH_CAPABILITIES_BITINDEX)); + } + + #[test] + fn test_update_largest_extended_fn_entry() { + use crate::cpuid::cpu_leaf::leaf_0x80000000::*; + + let vm_spec = + VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).expect("Error creating vm_spec"); + let entry = &mut CpuIdEntry { + function: LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_largest_extended_fn_entry(entry, &vm_spec).is_ok()); + + assert_eq!( + entry + .eax + .read_bits_in_range(&eax::LARGEST_EXTENDED_FN_BITRANGE), + LARGEST_EXTENDED_FN + ); + } + + #[test] + fn test_update_extended_feature_info_entry() { + use crate::cpuid::cpu_leaf::leaf_0x80000001::*; + + let vm_spec = + VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).expect("Error creating vm_spec"); + let entry = &mut CpuIdEntry { + function: LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_extended_feature_info_entry(entry, &vm_spec).is_ok()); + + 
assert!(entry.ecx.read_bit(ecx::TOPOEXT_INDEX)); + } + + fn check_update_amd_features_entry( + cpu_count: u8, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use crate::cpuid::cpu_leaf::leaf_0x80000008::*; + + let vm_spec = VmSpec::new( + 0, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut CpuIdEntry { + function: LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_amd_features_entry(entry, &vm_spec).is_ok()); + + assert_eq!( + entry.ecx.read_bits_in_range(&ecx::NUM_THREADS_BITRANGE), + u32::from(cpu_count - 1) + ); + assert_eq!( + entry.ecx.read_bits_in_range(&ecx::THREAD_ID_SIZE_BITRANGE), + THREAD_ID_MAX_SIZE + ); + } + + fn check_update_extended_apic_id_entry( + cpu_id: u8, + cpu_count: u8, + expected_core_id: u32, + expected_threads_per_core: u32, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use crate::cpuid::cpu_leaf::leaf_0x8000001e::*; + + let vm_spec = VmSpec::new( + cpu_id, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut CpuIdEntry { + function: LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_extended_apic_id_entry(entry, &vm_spec).is_ok()); + + assert_eq!( + entry + .eax + .read_bits_in_range(&eax::EXTENDED_APIC_ID_BITRANGE), + u32::from(cpu_id) + ); + + assert_eq!( + entry.ebx.read_bits_in_range(&ebx::CORE_ID_BITRANGE), + expected_core_id + ); + assert_eq!( + entry + .ebx + .read_bits_in_range(&ebx::THREADS_PER_CORE_BITRANGE), + expected_threads_per_core + ); + + assert_eq!( + entry + .ecx + .read_bits_in_range(&ecx::NODES_PER_PROCESSOR_BITRANGE), + NODES_PER_PROCESSOR + ); + assert_eq!(entry.ecx.read_bits_in_range(&ecx::NODE_ID_BITRANGE), 
0); + } + + #[test] + fn test_update_extended_cache_topology_entry() { + let vm_spec = + VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).expect("Error creating vm_spec"); + let entry = &mut CpuIdEntry { + function: cpu_leaf::leaf_0x8000001d::LEAF_NUM, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_extended_cache_topology_entry(entry, &vm_spec).is_ok()); + + assert_eq!(entry.flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 1); + } + + #[test] + fn test_1vcpu_ht_off() { + check_update_amd_features_entry(1, 1, 1, 1); + + check_update_extended_apic_id_entry(0, 1, 0, 0, 1, 1, 1); + } + + #[test] + fn test_1vcpu_ht_on() { + check_update_amd_features_entry(1, 2, 1, 1); + + check_update_extended_apic_id_entry(0, 1, 0, 1, 2, 1, 1); + } + + #[test] + fn test_2vcpu_ht_off() { + check_update_amd_features_entry(2, 1, 2, 1); + + check_update_extended_apic_id_entry(0, 2, 0, 0, 1, 2, 1); + check_update_extended_apic_id_entry(1, 2, 1, 0, 1, 2, 1); + } + + #[test] + fn test_2vcpu_ht_on() { + check_update_amd_features_entry(2, 2, 2, 1); + + check_update_extended_apic_id_entry(0, 2, 0, 1, 2, 2, 1); + check_update_extended_apic_id_entry(1, 2, 0, 1, 2, 2, 1); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/common.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/common.rs new file mode 100644 index 000000000..681ef0232 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/common.rs @@ -0,0 +1,628 @@ +// Copyright 2019 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::bit_helper::BitHelper; +use super::super::common::get_cpuid; +use super::super::cpu_leaf; +use super::*; + +// constants for setting the fields of kvm_cpuid2 structures +// CPUID bits in ebx, ecx, and edx. +const EBX_CLFLUSH_CACHELINE: u32 = 8; // Flush a cache line size. 
+ +/// Prepare content for CPUID standard level 0000_0001h: get processor type/family/model/stepping +/// and feature flags +pub fn update_feature_info_entry(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error> { + use cpu_leaf::leaf_0x1::*; + + // ECX bit 31 (HV): hypervisor present (and intercepting this bit, to advertise its presence) + // ECX bit 24 (TSCD): local APIC supports one-shot operation using TSC deadline value + entry + .ecx + .write_bit(ecx::TSC_DEADLINE_TIMER_BITINDEX, true) + .write_bit(ecx::HYPERVISOR_BITINDEX, true); + + // EBX bit 8-15: The CLFLUSH (8-byte) chunk count + // EBX bit 16-23: The logical processor count + // EBX bit 24-31: The (fixed) default APIC ID + entry + .ebx + .write_bits_in_range(&ebx::APICID_BITRANGE, u32::from(vm_spec.cpu_id)) + .write_bits_in_range(&ebx::CLFLUSH_SIZE_BITRANGE, EBX_CLFLUSH_CACHELINE) + .write_bits_in_range( + &ebx::CPU_COUNT_BITRANGE, + u32::from(vm_spec.threads_per_core * vm_spec.cores_per_die * vm_spec.dies_per_socket), + ); + + // EDX bit 28: Hyper-Threading Technology, PAUSE. A value of 1 for HTT indicates the value in + // CPUID.1.Ebx[23:16] (the Maximum number of addressable IDs for logical processors in this + // package) is valid for the package + entry + .edx + .write_bit(edx::HTT_BITINDEX, vm_spec.cpu_count > 1); + + Ok(()) +} + +/// Prepare content for CPUID standard level 0000_000Bh: get topology enumeration information. 
+pub fn update_extended_topology_entry( + entry: &mut CpuIdEntry, + vm_spec: &VmSpec, +) -> Result<(), Error> { + use cpu_leaf::leaf_0xb::*; + let thread_width = 8 - (vm_spec.threads_per_core - 1).leading_zeros(); + let core_width = (8 - (vm_spec.cores_per_die - 1).leading_zeros()) + thread_width; + + // EAX bit 0-4: number of bits to shift x2APIC ID right to get unique topology ID of + // next level type all logical processors with same next level ID share current level + // EBX bit 0-15: number of enabled logical processors at this level + // ECX bit 0-8: level number (same as input) + // ECX bit 8-15: level type (00h=invalid, 01h=SMT, 02h=core, 03h...FFh=reserved) + // EDX bits 0-31 contain x2APIC ID of current logical processor + entry.eax = 0_u32; + entry.ebx = 0_u32; + entry.ecx = 0_u32; + entry.edx = u32::from(vm_spec.cpu_id); + + match entry.index { + // Thread Level Topology; index = 0 + 0 => { + // To get the next level APIC ID, shift right with at most 1 because we have + // maximum 2 hyperthreads per core that can be represented by 1 bit. 
+ entry + .eax + .write_bits_in_range(&eax::APICID_BITRANGE, thread_width); + // When cpu_count == 1 or HT is disabled, there is 1 logical core at this level + // Otherwise there are 2 + entry.ebx.write_bits_in_range( + &ebx::NUM_LOGICAL_PROCESSORS_BITRANGE, + vm_spec.threads_per_core as u32, + ); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE, LEVEL_TYPE_THREAD); + } + + // Core Level Processor Topology; index = 1 + 1 => { + entry + .eax + .write_bits_in_range(&eax::APICID_BITRANGE, core_width); + entry.ebx.write_bits_in_range( + &ebx::NUM_LOGICAL_PROCESSORS_BITRANGE, + u32::from(vm_spec.threads_per_core * vm_spec.cores_per_die), + ); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_NUMBER_BITRANGE, entry.index); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE, LEVEL_TYPE_CORE); + } + // Core Level Processor Topology; index >=2 + // No other levels available; This should already be set to correctly, + // and it is added here as a "re-enforcement" in case we run on + // different hardware + level => { + entry.ecx = level; + } + } + + Ok(()) +} + +/// Prepare content for Intel V2 Extended Topology Enumeration Leaf. +/// +/// Leaf_0x1f is a superset of leaf_0xb. It gives extra information like die_per_socket. +/// When CPUID executes with EAX set to 1FH, the processor returns information about extended +/// topology enumeration data. Software must detect the presence of CPUID leaf 1FH by verifying +/// - the highest leaf index supported by CPUID is >= 1FH +/// - CPUID.1FH:EBX[15:0] reports a non-zero value +/// If leaf_0x1f is not implemented in cpu used in host, guest OS should turn to leaf_0xb to +/// determine the cpu topology. 
+pub fn update_extended_topology_v2_entry( + entry: &mut CpuIdEntry, + vm_spec: &VmSpec, +) -> Result<(), Error> { + use cpu_leaf::leaf_0x1f::*; + let thread_width = 8 - (vm_spec.threads_per_core - 1).leading_zeros(); + let core_width = (8 - (vm_spec.cores_per_die - 1).leading_zeros()) + thread_width; + let die_width = (8 - (vm_spec.dies_per_socket - 1).leading_zeros()) + core_width; + + // EAX bit 0-4: number of bits to shift x2APIC ID right to get unique topology ID of + // next level type all logical processors with same next level ID share current level + // EBX bit 0-15: number of enabled logical processors at this level + // ECX bit 0-8: level number (same as input) + // ECX bit 8-15: level type (00h=invalid, 01h=SMT, 02h=core, 05h=die, otherwise=reserved) + // EDX bits 0-31 contain x2APIC ID of current logical processor + entry.eax = 0_u32; + entry.ebx = 0_u32; + entry.ecx = 0_u32; + entry.edx = u32::from(vm_spec.cpu_id); + + match entry.index { + // Thread Level Topology; index = 0 + 0 => { + // To get the next level APIC ID, shift right with at most 1 because we have + // maximum 2 hyperthreads per core that can be represented by 1 bit. 
+ entry + .eax + .write_bits_in_range(&eax::APICID_BITRANGE, thread_width); + // When cpu_count == 1 or HT is disabled, there is 1 logical core at this level + // Otherwise there are 2 + entry.ebx.write_bits_in_range( + &ebx::NUM_LOGICAL_PROCESSORS_BITRANGE, + vm_spec.threads_per_core as u32, + ); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE, LEVEL_TYPE_THREAD); + } + // Core Level Processor Topology; index = 1 + 1 => { + entry + .eax + .write_bits_in_range(&eax::APICID_BITRANGE, core_width); + entry.ebx.write_bits_in_range( + &ebx::NUM_LOGICAL_PROCESSORS_BITRANGE, + u32::from(vm_spec.threads_per_core * vm_spec.cores_per_die), + ); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_NUMBER_BITRANGE, entry.index); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE, LEVEL_TYPE_CORE); + } + // Die Level Processor Topology; index = 5 + 5 => { + entry + .eax + .write_bits_in_range(&eax::APICID_BITRANGE, die_width); + entry.ebx.write_bits_in_range( + &ebx::NUM_LOGICAL_PROCESSORS_BITRANGE, + u32::from( + vm_spec.threads_per_core * vm_spec.cores_per_die * vm_spec.dies_per_socket, + ), + ); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_NUMBER_BITRANGE, entry.index); + entry + .ecx + .write_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE, LEVEL_TYPE_DIE); + } + level => { + entry.ecx = level; + } + } + + Ok(()) +} + +/// Prepare content for CPUID standard level 8000_0002/3/4h: get processor name string. +pub fn update_brand_string_entry(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error> { + let brand_string = &vm_spec.brand_string; + entry.eax = brand_string.get_reg_for_leaf(entry.function, BsReg::Eax); + entry.ebx = brand_string.get_reg_for_leaf(entry.function, BsReg::Ebx); + entry.ecx = brand_string.get_reg_for_leaf(entry.function, BsReg::Ecx); + entry.edx = brand_string.get_reg_for_leaf(entry.function, BsReg::Edx); + + Ok(()) +} + +/// Prepare content for CPUID extended level 8000_001Dh: get cache configuration descriptors. 
+pub fn update_cache_parameters_entry( + entry: &mut CpuIdEntry, + vm_spec: &VmSpec, +) -> Result<(), Error> { + use cpu_leaf::leaf_cache_parameters::*; + + // EAX bit 14-25: cores per cache - 1 + + match entry.eax.read_bits_in_range(&eax::CACHE_LEVEL_BITRANGE) { + // L1 & L2 Cache + 1 | 2 => { + // The L1 & L2 cache is shared by at most 2 hyperthreads + entry.eax.write_bits_in_range( + &eax::MAX_CPUS_PER_CORE_BITRANGE, + (vm_spec.cpu_count > 1 && vm_spec.threads_per_core == 2) as u32, + ); + } + // L3 Cache + 3 => { + // The L3 cache is shared among all the logical threads + entry.eax.write_bits_in_range( + &eax::MAX_CPUS_PER_CORE_BITRANGE, + u32::from(vm_spec.cpu_count - 1), + ); + } + _ => (), + } + + Ok(()) +} + +/// Replaces the `cpuid` entries corresponding to `function` with the entries from the host's cpuid. +pub fn use_host_cpuid_function( + cpuid: &mut CpuId, + function: u32, + use_count: bool, +) -> Result<(), Error> { + // copy all the CpuId entries, except for the ones with the provided function + cpuid.retain(|entry| entry.function != function); + + // add all the host leaves with the provided function + let mut count: u32 = 0; + while let Ok(entry) = get_cpuid(function, count) { + if count > 0 && !use_count { + break; + } + + cpuid + .push(CpuIdEntry { + function, + index: count, + flags: 0, + eax: entry.eax, + ebx: entry.ebx, + ecx: entry.ecx, + edx: entry.edx, + padding: [0, 0, 0], + }) + .map_err(Error::FamError)?; + + count += 1; + } + + Ok(()) +} + +#[cfg(test)] +mod test { + use kvm_bindings::kvm_cpuid_entry2; + + use super::*; + use crate::cpuid::common::tests::get_topoext_fn; + use crate::cpuid::cpu_leaf::leaf_0x1f::LEVEL_TYPE_DIE; + use crate::cpuid::cpu_leaf::leaf_0xb::LEVEL_TYPE_CORE; + use crate::cpuid::cpu_leaf::leaf_0xb::LEVEL_TYPE_THREAD; + use crate::cpuid::transformer::VmSpec; + + fn check_update_feature_info_entry( + cpu_count: u8, + expected_htt: bool, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use 
crate::cpuid::cpu_leaf::leaf_0x1::*; + + let vm_spec = VmSpec::new( + 0, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_feature_info_entry(entry, &vm_spec).is_ok()); + + assert!(entry.edx.read_bit(edx::HTT_BITINDEX) == expected_htt); + assert!(entry.ecx.read_bit(ecx::TSC_DEADLINE_TIMER_BITINDEX)); + } + + fn check_update_cache_parameters_entry( + cpu_count: u8, + cache_level: u32, + expected_max_cpus_per_core: u32, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use crate::cpuid::cpu_leaf::leaf_cache_parameters::*; + + let vm_spec = VmSpec::new( + 0, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index: 0, + flags: 0, + eax: *(0_u32).write_bits_in_range(&eax::CACHE_LEVEL_BITRANGE, cache_level), + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_cache_parameters_entry(entry, &vm_spec).is_ok()); + + assert!( + entry + .eax + .read_bits_in_range(&eax::MAX_CPUS_PER_CORE_BITRANGE) + == expected_max_cpus_per_core + ); + } + + #[allow(clippy::too_many_arguments)] + fn check_update_extended_topology_entry( + cpu_count: u8, + index: u32, + expected_apicid_shift_bit: u32, + expected_num_logical_processors: u32, + expected_level_type: u32, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use crate::cpuid::cpu_leaf::leaf_0xb::*; + + let vm_spec = VmSpec::new( + 0, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index, + flags: 0, + eax: 0, + ebx: 0, + 
ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_extended_topology_entry(entry, &vm_spec).is_ok()); + + assert!(entry.eax.read_bits_in_range(&eax::APICID_BITRANGE) == expected_apicid_shift_bit); + assert!( + entry + .ebx + .read_bits_in_range(&ebx::NUM_LOGICAL_PROCESSORS_BITRANGE) + == expected_num_logical_processors + ); + assert!(entry.ecx.read_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE) == expected_level_type); + assert!(entry.ecx.read_bits_in_range(&ecx::LEVEL_NUMBER_BITRANGE) == index); + } + + #[allow(clippy::too_many_arguments)] + fn check_update_extended_topology_v2_entry( + cpu_count: u8, + index: u32, + expected_apicid_shift_bit: u32, + expected_num_logical_processors: u32, + expected_level_type: u32, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use crate::cpuid::cpu_leaf::leaf_0x1f::*; + + let vm_spec = VmSpec::new( + 0, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_extended_topology_v2_entry(entry, &vm_spec).is_ok()); + + assert!(entry.eax.read_bits_in_range(&eax::APICID_BITRANGE) == expected_apicid_shift_bit); + assert!( + entry + .ebx + .read_bits_in_range(&ebx::NUM_LOGICAL_PROCESSORS_BITRANGE) + == expected_num_logical_processors + ); + assert!(entry.ecx.read_bits_in_range(&ecx::LEVEL_TYPE_BITRANGE) == expected_level_type); + assert!(entry.ecx.read_bits_in_range(&ecx::LEVEL_NUMBER_BITRANGE) == index); + } + + #[test] + fn test_1vcpu_ht_off() { + check_update_feature_info_entry(1, false, 1, 1, 1); + + // test update_deterministic_cache_entry + // test L1 + check_update_cache_parameters_entry(1, 1, 0, 1, 1, 1); + // test L2 + check_update_cache_parameters_entry(1, 2, 0, 1, 1, 1); + // test L3 + check_update_cache_parameters_entry(1, 3, 0, 1, 1, 1); + + // 
test update_extended_topology_entry + // index 0 + check_update_extended_topology_entry(1, 0, 0, 1, LEVEL_TYPE_THREAD, 1, 1, 1); + check_update_extended_topology_v2_entry(1, 0, 0, 1, LEVEL_TYPE_THREAD, 1, 1, 1); + // index 1 + check_update_extended_topology_entry(1, 1, 0, 1, LEVEL_TYPE_CORE, 1, 1, 1); + check_update_extended_topology_v2_entry(1, 1, 0, 1, LEVEL_TYPE_CORE, 1, 1, 1); + // index 5 + check_update_extended_topology_v2_entry(1, 5, 0, 1, LEVEL_TYPE_DIE, 1, 1, 1); + } + + #[test] + fn test_1vcpu_ht_on() { + check_update_feature_info_entry(1, false, 2, 1, 1); + + // test update_deterministic_cache_entry + // test L1 + check_update_cache_parameters_entry(1, 1, 0, 2, 1, 1); + // test L2 + check_update_cache_parameters_entry(1, 2, 0, 2, 1, 1); + // test L3 + check_update_cache_parameters_entry(1, 3, 0, 2, 1, 1); + + // test update_extended_topology_entry + // index 0 + check_update_extended_topology_entry(1, 0, 1, 2, LEVEL_TYPE_THREAD, 2, 1, 1); + check_update_extended_topology_v2_entry(1, 0, 1, 2, LEVEL_TYPE_THREAD, 2, 1, 1); + // index 1 + check_update_extended_topology_entry(1, 1, 1, 2, LEVEL_TYPE_CORE, 2, 1, 1); + check_update_extended_topology_v2_entry(1, 1, 1, 2, LEVEL_TYPE_CORE, 2, 1, 1); + // index 5 + check_update_extended_topology_v2_entry(1, 5, 1, 2, LEVEL_TYPE_DIE, 2, 1, 1); + } + + #[test] + fn test_2vcpu_ht_off() { + check_update_feature_info_entry(2, true, 1, 2, 1); + + // test update_deterministic_cache_entry + // test L1 + check_update_cache_parameters_entry(2, 1, 0, 1, 2, 1); + // test L2 + check_update_cache_parameters_entry(2, 2, 0, 1, 2, 1); + // test L3 + check_update_cache_parameters_entry(2, 3, 1, 1, 2, 1); + + // test update_extended_topology_entry + // index 0 + check_update_extended_topology_entry(2, 0, 0, 1, LEVEL_TYPE_THREAD, 1, 2, 1); + check_update_extended_topology_v2_entry(2, 0, 0, 1, LEVEL_TYPE_THREAD, 1, 2, 1); + // index 1 + check_update_extended_topology_entry(2, 1, 1, 2, LEVEL_TYPE_CORE, 1, 2, 1); + 
check_update_extended_topology_v2_entry(2, 1, 1, 2, LEVEL_TYPE_CORE, 1, 2, 1); + // index 5 + check_update_extended_topology_v2_entry(2, 5, 1, 2, LEVEL_TYPE_DIE, 1, 2, 1); + } + + #[test] + fn test_2vcpu_ht_on() { + check_update_feature_info_entry(2, true, 2, 2, 1); + + // test update_deterministic_cache_entry + // test L1 + check_update_cache_parameters_entry(2, 1, 1, 2, 2, 1); + // test L2 + check_update_cache_parameters_entry(2, 2, 1, 2, 2, 1); + // test L3 + check_update_cache_parameters_entry(2, 3, 1, 2, 2, 1); + + // test update_extended_topology_entry + // index 0 + check_update_extended_topology_entry(2, 0, 1, 2, LEVEL_TYPE_THREAD, 2, 2, 1); + check_update_extended_topology_v2_entry(2, 0, 1, 2, LEVEL_TYPE_THREAD, 2, 2, 1); + // index 1 + check_update_extended_topology_entry(2, 1, 2, 4, LEVEL_TYPE_CORE, 2, 2, 1); + check_update_extended_topology_v2_entry(2, 1, 2, 4, LEVEL_TYPE_CORE, 2, 2, 1); + // index 5 + check_update_extended_topology_v2_entry(2, 5, 2, 4, LEVEL_TYPE_DIE, 2, 2, 1); + } + + #[test] + fn test_2dies_2vcpu_ht_off() { + // test update_extended_topology_entry + // index 0 + check_update_extended_topology_entry(2, 0, 0, 1, LEVEL_TYPE_THREAD, 1, 1, 2); + check_update_extended_topology_v2_entry(2, 0, 0, 1, LEVEL_TYPE_THREAD, 1, 1, 2); + // index 1 + check_update_extended_topology_entry(2, 1, 0, 1, LEVEL_TYPE_CORE, 1, 1, 2); + check_update_extended_topology_v2_entry(2, 1, 0, 1, LEVEL_TYPE_CORE, 1, 1, 2); + // index 5 + check_update_extended_topology_v2_entry(2, 5, 1, 2, LEVEL_TYPE_DIE, 1, 1, 2); + } + + #[test] + fn test_2dies_4vcpu_ht_on() { + // test update_extended_topology_entry + // index 0 + check_update_extended_topology_entry(4, 0, 1, 2, LEVEL_TYPE_THREAD, 2, 1, 2); + check_update_extended_topology_v2_entry(4, 0, 1, 2, LEVEL_TYPE_THREAD, 2, 1, 2); + // index 1 + check_update_extended_topology_entry(4, 1, 1, 2, LEVEL_TYPE_CORE, 2, 1, 2); + check_update_extended_topology_v2_entry(4, 1, 1, 2, LEVEL_TYPE_CORE, 2, 1, 2); + // index 5 + 
check_update_extended_topology_v2_entry(4, 5, 2, 4, LEVEL_TYPE_DIE, 2, 1, 2); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_use_host_cpuid_function_with_count() { + // try to emulate the extended cache topology leaves + let topoext_fn = get_topoext_fn(); + + // check that it behaves correctly for TOPOEXT function + let mut cpuid = CpuId::new(1).unwrap(); + cpuid.as_mut_slice()[0].function = topoext_fn; + assert!(use_host_cpuid_function(&mut cpuid, topoext_fn, true).is_ok()); + let entries = cpuid.as_mut_slice(); + assert!(entries.len() > 1); + for (count, entry) in entries.iter_mut().enumerate() { + assert!(entry.function == topoext_fn); + assert!(entry.index == count as u32); + assert!(entry.eax != 0); + } + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_use_host_cpuid_function_without_count() { + use crate::cpuid::cpu_leaf::leaf_0x1::*; + // try to emulate the extended cache topology leaves + let feature_info_fn = LEAF_NUM; + + // check that it behaves correctly for TOPOEXT function + let mut cpuid = CpuId::new(1).unwrap(); + cpuid.as_mut_slice()[0].function = feature_info_fn; + assert!(use_host_cpuid_function(&mut cpuid, feature_info_fn, false).is_ok()); + let entries = cpuid.as_mut_slice(); + assert!(entries.len() == 1); + let entry = entries[0]; + + assert!(entry.function == feature_info_fn); + assert!(entry.index == 0); + assert!(entry.eax != 0); + } + + #[test] + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + fn test_use_host_cpuid_function_err() { + let topoext_fn = get_topoext_fn(); + // check that it returns Err when there are too many entries + let mut cpuid = CpuId::new(kvm_bindings::KVM_MAX_CPUID_ENTRIES).unwrap(); + match use_host_cpuid_function(&mut cpuid, topoext_fn, true) { + Err(Error::FamError(vmm_sys_util::fam::Error::SizeLimitExceeded)) => {} + _ => panic!("Wrong behavior"), + } + } +} diff --git 
a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/intel.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/intel.rs new file mode 100644 index 000000000..a7395126a --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/intel.rs @@ -0,0 +1,280 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::bit_helper::BitHelper; +use super::super::cpu_leaf; +use super::*; + +fn update_deterministic_cache_entry(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error> { + use cpu_leaf::leaf_0x4::*; + + common::update_cache_parameters_entry(entry, vm_spec)?; + + // If leaf_0xB or leaf_0x1F is enabled, leaf0x4 won't be used to generate topology information. + // In most cases, we could have leaf_0xB in our host cpu. But we keep the leaf_0x4 eax[26,31] + // to prevent rare cases. + if vm_spec.cpu_count <= 64 { + entry.eax.write_bits_in_range( + &eax::MAX_CORES_PER_PACKAGE_BITRANGE, + u32::from(vm_spec.cpu_count - 1), + ); + } + + Ok(()) +} + +fn update_power_management_entry(entry: &mut CpuIdEntry, _vm_spec: &VmSpec) -> Result<(), Error> { + // disable pstate feature + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; + + Ok(()) +} + +fn update_perf_mon_entry(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error> { + use cpu_leaf::leaf_0xa::*; + + // Architectural Performance Monitor Leaf + match vm_spec.vpmu_feature { + VpmuFeatureLevel::Disabled => { + // Disable PMU + entry.eax = 0; + entry.ebx = 0; + entry.ecx = 0; + entry.edx = 0; + } + VpmuFeatureLevel::LimitedlyEnabled => { + // Allow minimal vpmu ability (only instructions and cycles pmu). 
+ entry.eax.write_bits_in_range(&eax::PMC_VERSION_ID, 2); + entry.eax.write_bits_in_range(&eax::BIT_LEN_PMEVENT, 7); + + // 0(false) means support for the targeted performance monitoring event + entry.ebx.write_bit(ebx::CORE_CYCLES_BITINDEX, false); + entry.ebx.write_bit(ebx::REF_CYCLES_BITINDEX, false); + entry.ebx.write_bit(ebx::INST_RETIRED_BITINDEX, false); + entry.ebx.write_bit(ebx::BR_INST_RETIRED_BITINDEX, true); + entry.ebx.write_bit(ebx::LLC_MISSES_BITINDEX, true); + entry.ebx.write_bit(ebx::LLC_REF_BITINDEX, true); + entry.ebx.write_bit(ebx::BR_MIS_RETIRED_BITINDEX, true); + } + VpmuFeatureLevel::FullyEnabled => { + // Allow all supported vpmu ability + entry.eax.write_bits_in_range(&eax::PMC_VERSION_ID, 2); + entry.eax.write_bits_in_range(&eax::BIT_LEN_PMEVENT, 7); + + // 0(false) means support for the targeted performance monitoring event + entry.ebx.write_bit(ebx::CORE_CYCLES_BITINDEX, false); + entry.ebx.write_bit(ebx::REF_CYCLES_BITINDEX, false); + entry.ebx.write_bit(ebx::INST_RETIRED_BITINDEX, false); + entry.ebx.write_bit(ebx::BR_INST_RETIRED_BITINDEX, false); + entry.ebx.write_bit(ebx::LLC_MISSES_BITINDEX, false); + entry.ebx.write_bit(ebx::LLC_REF_BITINDEX, false); + entry.ebx.write_bit(ebx::BR_MIS_RETIRED_BITINDEX, false); + } + }; + Ok(()) +} + +#[derive(Default)] +pub struct IntelCpuidTransformer {} + +impl IntelCpuidTransformer { + pub fn new() -> Self { + Default::default() + } +} + +impl CpuidTransformer for IntelCpuidTransformer { + fn process_cpuid(&self, cpuid: &mut CpuId, vm_spec: &VmSpec) -> Result<(), Error> { + common::use_host_cpuid_function(cpuid, cpu_leaf::leaf_0x0::LEAF_NUM, false)?; + self.process_entries(cpuid, vm_spec) + } + + fn entry_transformer_fn(&self, entry: &mut CpuIdEntry) -> Option { + use cpu_leaf::*; + + match entry.function { + leaf_0x1::LEAF_NUM => Some(common::update_feature_info_entry), + leaf_0x4::LEAF_NUM => Some(intel::update_deterministic_cache_entry), + leaf_0x6::LEAF_NUM => 
Some(intel::update_power_management_entry), + leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), + leaf_0xb::LEAF_NUM => Some(common::update_extended_topology_entry), + leaf_0x1f::LEAF_NUM => Some(common::update_extended_topology_v2_entry), + 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + _ => None, + } + } +} + +#[cfg(test)] +mod test { + use kvm_bindings::kvm_cpuid_entry2; + + use super::*; + use crate::cpuid::transformer::VmSpec; + + #[test] + fn test_update_perf_mon_entry() { + use crate::cpuid::cpu_leaf::leaf_0xa::*; + // Test when vpmu is off (level Disabled) + let vm_spec = + VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: LEAF_NUM, + index: 0, + flags: 0, + eax: 1, + ebx: 1, + ecx: 1, + edx: 1, + padding: [0, 0, 0], + }; + + assert!(update_perf_mon_entry(entry, &vm_spec).is_ok()); + + assert_eq!(entry.eax, 0); + assert_eq!(entry.ebx, 0); + assert_eq!(entry.ecx, 0); + assert_eq!(entry.edx, 0); + + // Test when only instructions and cycles pmu are enabled (level LimitedlyEnabled) + let vm_spec = VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::LimitedlyEnabled) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_perf_mon_entry(entry, &vm_spec).is_ok()); + assert_eq!(entry.eax.read_bits_in_range(&eax::PMC_VERSION_ID), 2); + assert_eq!(entry.eax.read_bits_in_range(&eax::BIT_LEN_PMEVENT), 7); + + assert!(!entry.ebx.read_bit(ebx::CORE_CYCLES_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::INST_RETIRED_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::REF_CYCLES_BITINDEX)); + assert!(entry.ebx.read_bit(ebx::LLC_REF_BITINDEX)); + assert!(entry.ebx.read_bit(ebx::LLC_MISSES_BITINDEX)); + assert!(entry.ebx.read_bit(ebx::BR_INST_RETIRED_BITINDEX)); + assert!(entry.ebx.read_bit(ebx::BR_MIS_RETIRED_BITINDEX)); 
+ + // Test when all vpmu features are enabled (level FullyEnabled) + let vm_spec = VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::FullyEnabled) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index: 0, + flags: 0, + eax: 0, + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_perf_mon_entry(entry, &vm_spec).is_ok()); + + assert_eq!(entry.eax.read_bits_in_range(&eax::PMC_VERSION_ID), 2); + assert_eq!(entry.eax.read_bits_in_range(&eax::BIT_LEN_PMEVENT), 7); + + assert!(!entry.ebx.read_bit(ebx::CORE_CYCLES_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::INST_RETIRED_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::REF_CYCLES_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::LLC_REF_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::LLC_MISSES_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::BR_INST_RETIRED_BITINDEX)); + assert!(!entry.ebx.read_bit(ebx::BR_MIS_RETIRED_BITINDEX)); + } + + fn check_update_deterministic_cache_entry( + cpu_count: u8, + cache_level: u32, + expected_max_cores_per_package: u32, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + ) { + use crate::cpuid::cpu_leaf::leaf_0x4::*; + + let vm_spec = VmSpec::new( + 0, + cpu_count, + threads_per_core, + cores_per_die, + dies_per_socket, + VpmuFeatureLevel::Disabled, + ) + .expect("Error creating vm_spec"); + let entry = &mut kvm_cpuid_entry2 { + function: 0x0, + index: 0, + flags: 0, + eax: *(0_u32).write_bits_in_range(&eax::CACHE_LEVEL_BITRANGE, cache_level), + ebx: 0, + ecx: 0, + edx: 0, + padding: [0, 0, 0], + }; + + assert!(update_deterministic_cache_entry(entry, &vm_spec).is_ok()); + + assert!( + entry + .eax + .read_bits_in_range(&eax::MAX_CORES_PER_PACKAGE_BITRANGE) + == expected_max_cores_per_package + ); + } + + #[test] + fn test_1vcpu_ht_off() { + // test update_deterministic_cache_entry + // test L1 + check_update_deterministic_cache_entry(1, 1, 0, 1, 1, 1); + // test L2 + check_update_deterministic_cache_entry(1, 2, 
0, 1, 1, 1); + // test L3 + check_update_deterministic_cache_entry(1, 3, 0, 1, 1, 1); + } + + #[test] + fn test_1vcpu_ht_on() { + // test update_deterministic_cache_entry + // test L1 + check_update_deterministic_cache_entry(1, 1, 0, 2, 1, 1); + // test L2 + check_update_deterministic_cache_entry(1, 2, 0, 2, 1, 1); + // test L3 + check_update_deterministic_cache_entry(1, 3, 0, 2, 1, 1); + } + + #[test] + fn test_2vcpu_ht_off() { + // test update_deterministic_cache_entry + // test L1 + check_update_deterministic_cache_entry(2, 1, 1, 1, 2, 1); + // test L2 + check_update_deterministic_cache_entry(2, 2, 1, 1, 2, 1); + // test L3 + check_update_deterministic_cache_entry(2, 3, 1, 1, 2, 1); + } + + #[test] + fn test_2vcpu_ht_on() { + // test update_deterministic_cache_entry + // test L1 + check_update_deterministic_cache_entry(2, 1, 1, 2, 2, 1); + // test L2 + check_update_deterministic_cache_entry(2, 2, 1, 2, 2, 1); + // test L3 + check_update_deterministic_cache_entry(2, 3, 1, 2, 2, 1); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/mod.rs b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/mod.rs new file mode 100644 index 000000000..58dac23ec --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/cpuid/transformer/mod.rs @@ -0,0 +1,172 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::brand_string::{BrandString, Reg as BsReg}; +use super::common::get_vendor_id; +use super::{CpuId, CpuIdEntry}; +use crate::VpmuFeatureLevel; + +pub mod amd; +pub mod common; +pub mod intel; + +/// Structure containing the specifications of the VM +pub struct VmSpec { + /// The vendor id of the CPU + cpu_vendor_id: [u8; 12], + /// The id of the current logical cpu in the range [0..cpu_count]. + cpu_id: u8, + /// The total number of logical cpus (includes cpus that could be hotplugged). 
+ cpu_count: u8, + /// The desired brand string for the guest. + brand_string: BrandString, + /// threads per core for cpu topology information + threads_per_core: u8, + /// cores per die for cpu topology information + cores_per_die: u8, + /// dies per socket for cpu topology information + dies_per_socket: u8, + /// if vpmu feature is Disabled, it means vpmu feature is off (by default) + /// if vpmu feature is LimitedlyEnabled, it means minimal vpmu counters are supported (cycles and instructions) + /// if vpmu feature is FullyEnabled, it means all vpmu counters are supported + vpmu_feature: VpmuFeatureLevel, +} + +impl VmSpec { + /// Creates a new instance of VmSpec with the specified parameters + /// The brand string is deduced from the vendor_id + pub fn new( + cpu_id: u8, + cpu_count: u8, + threads_per_core: u8, + cores_per_die: u8, + dies_per_socket: u8, + vpmu_feature: VpmuFeatureLevel, + ) -> Result { + let cpu_vendor_id = get_vendor_id().map_err(Error::InternalError)?; + let brand_string = + BrandString::from_vendor_id(&cpu_vendor_id).map_err(Error::BrandString)?; + + Ok(VmSpec { + cpu_vendor_id, + cpu_id, + cpu_count, + brand_string, + threads_per_core, + cores_per_die, + dies_per_socket, + vpmu_feature, + }) + } + + /// Returns an immutable reference to cpu_vendor_id + pub fn cpu_vendor_id(&self) -> &[u8; 12] { + &self.cpu_vendor_id + } +} + +/// Errors associated with processing the CPUID leaves. +#[derive(Debug, Clone)] +pub enum Error { + /// Failed to parse CPU brand string + BrandString(super::brand_string::Error), + /// The CPU architecture is not supported + CpuNotSupported, + /// A FamStructWrapper operation has failed + FamError(vmm_sys_util::fam::Error), + /// A call to an internal helper method failed + InternalError(super::common::Error), + /// The maximum number of addressable logical CPUs cannot be stored in an `u8`. 
+ VcpuCountOverflow, +} + +pub type EntryTransformerFn = fn(entry: &mut CpuIdEntry, vm_spec: &VmSpec) -> Result<(), Error>; + +/// Generic trait that provides methods for transforming the cpuid +pub trait CpuidTransformer { + /// Process the cpuid array and make the desired transformations. + fn process_cpuid(&self, cpuid: &mut CpuId, vm_spec: &VmSpec) -> Result<(), Error> { + self.process_entries(cpuid, vm_spec) + } + + /// Iterate through all the cpuid entries and calls the associated transformer for each one. + fn process_entries(&self, cpuid: &mut CpuId, vm_spec: &VmSpec) -> Result<(), Error> { + for entry in cpuid.as_mut_slice().iter_mut() { + let maybe_transformer_fn = self.entry_transformer_fn(entry); + + if let Some(transformer_fn) = maybe_transformer_fn { + transformer_fn(entry, vm_spec)?; + } + } + + Ok(()) + } + + /// Get the associated transformer for a cpuid entry + fn entry_transformer_fn(&self, _entry: &mut CpuIdEntry) -> Option { + None + } +} + +#[cfg(test)] +mod test { + use super::*; + use kvm_bindings::kvm_cpuid_entry2; + + const PROCESSED_FN: u32 = 1; + const EXPECTED_INDEX: u32 = 100; + + fn transform_entry(entry: &mut kvm_cpuid_entry2, _vm_spec: &VmSpec) -> Result<(), Error> { + entry.index = EXPECTED_INDEX; + + Ok(()) + } + + struct MockCpuidTransformer {} + + impl CpuidTransformer for MockCpuidTransformer { + fn entry_transformer_fn(&self, entry: &mut kvm_cpuid_entry2) -> Option { + match entry.function { + PROCESSED_FN => Some(transform_entry), + _ => None, + } + } + } + + #[test] + fn test_process_cpuid() { + let num_entries = 5; + + let mut cpuid = CpuId::new(num_entries).unwrap(); + let vm_spec = VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled); + cpuid.as_mut_slice()[0].function = PROCESSED_FN; + assert!(MockCpuidTransformer {} + .process_cpuid(&mut cpuid, &vm_spec.unwrap()) + .is_ok()); + + assert!(cpuid.as_mut_slice().len() == num_entries); + for entry in cpuid.as_mut_slice().iter() { + match entry.function { + PROCESSED_FN => 
{ + assert_eq!(entry.index, EXPECTED_INDEX); + } + _ => { + assert_ne!(entry.index, EXPECTED_INDEX); + } + } + } + } + + #[test] + fn test_invalid_cpu_architecture_cpuid() { + use crate::cpuid::process_cpuid; + let num_entries = 5; + + let mut cpuid = CpuId::new(num_entries).unwrap(); + let mut vm_spec = VmSpec::new(0, 1, 1, 1, 1, VpmuFeatureLevel::Disabled).unwrap(); + + vm_spec.cpu_vendor_id = [1; 12]; + assert!(process_cpuid(&mut cpuid, &vm_spec).is_err()); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/gdt.rs b/src/dragonball/src/dbs_arch/src/x86_64/gdt.rs new file mode 100644 index 000000000..dd8e9d095 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/gdt.rs @@ -0,0 +1,119 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +// For GDT details see arch/x86/include/asm/segment.h + +#![allow(missing_docs)] + +use kvm_bindings::kvm_segment; + +/// Constructor for a conventional segment GDT (or LDT) entry. Derived from the kernel's segment.h. 
+#[allow(unused_parens)] +pub fn gdt_entry(flags: u16, base: u32, limit: u32) -> u64 { + (((u64::from(base) & 0xff00_0000u64) << (56 - 24)) + | ((u64::from(flags) & 0x0000_f0ffu64) << 40) + | ((u64::from(limit) & 0x000f_0000u64) << (48 - 16)) + | ((u64::from(base) & 0x00ff_ffffu64) << 16) + | (u64::from(limit) & 0x0000_ffffu64)) +} + +#[allow(unused_parens)] +fn get_base(entry: u64) -> u64 { + ((((entry) & 0xFF00_0000_0000_0000) >> 32) + | (((entry) & 0x0000_00FF_0000_0000) >> 16) + | (((entry) & 0x0000_0000_FFFF_0000) >> 16)) +} + +fn get_limit(entry: u64) -> u32 { + ((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32 +} + +fn get_g(entry: u64) -> u8 { + ((entry & 0x0080_0000_0000_0000) >> 55) as u8 +} + +fn get_db(entry: u64) -> u8 { + ((entry & 0x0040_0000_0000_0000) >> 54) as u8 +} + +fn get_l(entry: u64) -> u8 { + ((entry & 0x0020_0000_0000_0000) >> 53) as u8 +} + +fn get_avl(entry: u64) -> u8 { + ((entry & 0x0010_0000_0000_0000) >> 52) as u8 +} + +fn get_p(entry: u64) -> u8 { + ((entry & 0x0000_8000_0000_0000) >> 47) as u8 +} + +fn get_dpl(entry: u64) -> u8 { + ((entry & 0x0000_6000_0000_0000) >> 45) as u8 +} + +fn get_s(entry: u64) -> u8 { + ((entry & 0x0000_1000_0000_0000) >> 44) as u8 +} + +fn get_type(entry: u64) -> u8 { + ((entry & 0x0000_0F00_0000_0000) >> 40) as u8 +} + +/// Automatically build the kvm struct for SET_SREGS from the kernel bit fields. +/// +/// # Arguments +/// +/// * `entry` - The gdt entry. +/// * `table_index` - Index of the entry in the gdt table. 
+pub fn kvm_segment_from_gdt(entry: u64, table_index: u8) -> kvm_segment { + kvm_segment { + base: get_base(entry), + limit: get_limit(entry), + selector: u16::from(table_index * 8), + type_: get_type(entry), + present: get_p(entry), + dpl: get_dpl(entry), + db: get_db(entry), + s: get_s(entry), + l: get_l(entry), + g: get_g(entry), + avl: get_avl(entry), + padding: 0, + unusable: match get_p(entry) { + 0 => 1, + _ => 0, + }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn field_parse() { + let gdt = gdt_entry(0xA09B, 0x10_0000, 0xfffff); + let seg = kvm_segment_from_gdt(gdt, 0); + // 0xA09B + // 'A' + assert_eq!(0x1, seg.g); + assert_eq!(0x0, seg.db); + assert_eq!(0x1, seg.l); + assert_eq!(0x0, seg.avl); + // '9' + assert_eq!(0x1, seg.present); + assert_eq!(0x0, seg.dpl); + assert_eq!(0x1, seg.s); + // 'B' + assert_eq!(0xB, seg.type_); + // base and limit + assert_eq!(0x10_0000, seg.base); + assert_eq!(0xfffff, seg.limit); + assert_eq!(0x0, seg.unusable); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/interrupts.rs b/src/dragonball/src/dbs_arch/src/x86_64/interrupts.rs new file mode 100644 index 000000000..8a7e3b6bd --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/interrupts.rs @@ -0,0 +1,136 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use kvm_bindings::kvm_lapic_state; +use kvm_ioctls::VcpuFd; + +/// Errors thrown while configuring the LAPIC. +#[derive(Debug)] +pub enum Error { + /// Failure in retrieving the LAPIC configuration. + GetLapic(kvm_ioctls::Error), + /// Failure in modifying the LAPIC configuration. + SetLapic(kvm_ioctls::Error), +} +type Result = std::result::Result; + +// Defines poached from apicdef.h kernel header. 
+const APIC_LVT0: usize = 0x350; +const APIC_LVT1: usize = 0x360; +const APIC_MODE_NMI: u32 = 0x4; +const APIC_MODE_EXTINT: u32 = 0x7; + +fn get_klapic_reg(klapic: &kvm_lapic_state, reg_offset: usize) -> u32 { + let range = reg_offset..reg_offset + 4; + let reg = klapic.regs.get(range).expect("get_klapic_reg range"); + + let mut reg_bytes = [0u8; 4]; + for (byte, read) in reg_bytes.iter_mut().zip(reg.iter().cloned()) { + *byte = read as u8; + } + + u32::from_le_bytes(reg_bytes) +} + +fn set_klapic_reg(klapic: &mut kvm_lapic_state, reg_offset: usize, value: u32) { + let range = reg_offset..reg_offset + 4; + let reg = klapic.regs.get_mut(range).expect("set_klapic_reg range"); + + let value = u32::to_le_bytes(value); + for (byte, read) in reg.iter_mut().zip(value.iter().cloned()) { + *byte = read as i8; + } +} + +#[allow(unused_parens)] +fn set_apic_delivery_mode(reg: u32, mode: u32) -> u32 { + (((reg) & !0x700) | ((mode) << 8)) +} + +/// Configures LAPICs. LAPIC0 is set for external interrupts, LAPIC1 is set for NMI. +/// +/// # Arguments +/// * `vcpu` - The VCPU object to configure. 
pub fn set_lint(vcpu: &VcpuFd) -> Result<()> {
    // Fails unless an in-kernel irqchip already exists for this VM.
    let mut klapic = vcpu.get_lapic().map_err(Error::GetLapic)?;

    // LVT0: keep the current register value but force EXTINT delivery mode.
    let lvt_lint0 = get_klapic_reg(&klapic, APIC_LVT0);
    set_klapic_reg(
        &mut klapic,
        APIC_LVT0,
        set_apic_delivery_mode(lvt_lint0, APIC_MODE_EXTINT),
    );
    // LVT1: keep the current register value but force NMI delivery mode.
    let lvt_lint1 = get_klapic_reg(&klapic, APIC_LVT1);
    set_klapic_reg(
        &mut klapic,
        APIC_LVT1,
        set_apic_delivery_mode(lvt_lint1, APIC_MODE_NMI),
    );

    // Write the modified register file back to the vCPU.
    vcpu.set_lapic(&klapic).map_err(Error::SetLapic)
}

#[cfg(test)]
mod tests {
    use super::*;
    use kvm_ioctls::Kvm;

    // Size in bytes of the LAPIC register file (kvm_lapic_state::regs).
    const KVM_APIC_REG_SIZE: usize = 0x400;

    #[test]
    fn test_set_and_get_klapic_reg() {
        let reg_offset = 0x340;
        let mut klapic = kvm_lapic_state::default();
        set_klapic_reg(&mut klapic, reg_offset, 3);
        let value = get_klapic_reg(&klapic, reg_offset);
        assert_eq!(value, 3);
    }

    #[test]
    #[should_panic]
    fn test_set_and_get_klapic_out_of_bounds() {
        // An offset past the end of `regs` must trip the `expect` in set_klapic_reg.
        let reg_offset = KVM_APIC_REG_SIZE + 10;
        let mut klapic = kvm_lapic_state::default();
        set_klapic_reg(&mut klapic, reg_offset, 3);
    }

    #[test]
    fn test_setlint() {
        let kvm = Kvm::new().unwrap();
        assert!(kvm.check_extension(kvm_ioctls::Cap::Irqchip));
        let vm = kvm.create_vm().unwrap();
        // The get_lapic ioctl will fail if there is no irqchip created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0).unwrap();
        let klapic_before: kvm_lapic_state = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0);
        let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: kvm_lapic_state = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0);
        let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setlint_fails() {
        let kvm = Kvm::new().unwrap();
        let vm = kvm.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0).unwrap();
        // 'get_lapic' ioctl triggered by the 'set_lint' function will fail if there is no
        // irqchip created beforehand.
        assert!(set_lint(&vcpu).is_err());
    }
}
diff --git a/src/dragonball/src/dbs_arch/src/x86_64/mod.rs b/src/dragonball/src/dbs_arch/src/x86_64/mod.rs new file mode 100644 index 000000000..6d39e5b58 --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/mod.rs @@ -0,0 +1,15 @@
// Copyright 2021 Alibaba Cloud. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//! CPU architecture specific constants and utilities for the `x86_64` architecture.

/// Definitions for x86 CPUID
pub mod cpuid;
/// Definitions for x86 Global Descriptor Table
pub mod gdt;
/// Definitions for x86 interrupts
pub mod interrupts;
/// Definitions for x86 Model Specific Registers(MSR).
pub mod msr;
/// Definitions for x86 Registers
pub mod regs;
diff --git a/src/dragonball/src/dbs_arch/src/x86_64/msr.rs b/src/dragonball/src/dbs_arch/src/x86_64/msr.rs new file mode 100644 index 000000000..fcdfb848b --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/msr.rs @@ -0,0 +1,778 @@
/* automatically generated by rust-bindgen */

// Copyright (C) 2019 Alibaba Cloud. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#![allow(missing_docs)]
#![allow(dead_code)]
#![allow(non_upper_case_globals)]

/// Model Specific Registers (MSRs) related functionality.
use std::result;

use kvm_bindings::MsrList;
use kvm_ioctls::Kvm;

#[derive(Debug)]
/// MSR related errors.
+pub enum Error { + /// Getting supported MSRs failed. + GetSupportedModelSpecificRegisters(kvm_ioctls::Error), + /// Setting up MSRs failed. + SetModelSpecificRegisters(kvm_ioctls::Error), + /// Failed to set all MSRs. + SetModelSpecificRegistersCount, + /// Msr error + Msr(vmm_sys_util::fam::Error), +} + +type Result = result::Result; + +/// MSR range +struct MsrRange { + /// Base MSR address + base: u32, + /// Number of MSRs + nmsrs: u32, +} + +impl MsrRange { + /// Returns whether `msr` is contained in this MSR range. + fn contains(&self, msr: u32) -> bool { + self.base <= msr && msr < self.base + self.nmsrs + } +} + +// Creates a MsrRange of one msr given as argument. +macro_rules! SINGLE_MSR { + ($msr:expr) => { + MsrRange { + base: $msr, + nmsrs: 1, + } + }; +} + +// Creates a MsrRange of with msr base and count given as arguments. +macro_rules! MSR_RANGE { + ($first:expr, $count:expr) => { + MsrRange { + base: $first, + nmsrs: $count, + } + }; +} + +// List of MSRs that can be serialized. List is sorted in ascending order of MSRs addresses. 
// Machine-generated whitelist; each entry is built by the SINGLE_MSR!/MSR_RANGE!
// macros defined above and checked via MsrRange::contains.
static WHITELISTED_MSR_RANGES: &[MsrRange] = &[
    SINGLE_MSR!(MSR_IA32_P5_MC_ADDR),
    SINGLE_MSR!(MSR_IA32_P5_MC_TYPE),
    SINGLE_MSR!(MSR_IA32_TSC),
    SINGLE_MSR!(MSR_IA32_PLATFORM_ID),
    SINGLE_MSR!(MSR_IA32_APICBASE),
    SINGLE_MSR!(MSR_IA32_EBL_CR_POWERON),
    SINGLE_MSR!(MSR_EBC_FREQUENCY_ID),
    SINGLE_MSR!(MSR_SMI_COUNT),
    SINGLE_MSR!(MSR_IA32_FEATURE_CONTROL),
    SINGLE_MSR!(MSR_IA32_TSC_ADJUST),
    SINGLE_MSR!(MSR_IA32_SPEC_CTRL),
    SINGLE_MSR!(MSR_IA32_PRED_CMD),
    SINGLE_MSR!(MSR_IA32_UCODE_WRITE),
    SINGLE_MSR!(MSR_IA32_UCODE_REV),
    SINGLE_MSR!(MSR_IA32_SMBASE),
    SINGLE_MSR!(MSR_FSB_FREQ),
    SINGLE_MSR!(MSR_PLATFORM_INFO),
    SINGLE_MSR!(MSR_PKG_CST_CONFIG_CONTROL),
    SINGLE_MSR!(MSR_IA32_MPERF),
    SINGLE_MSR!(MSR_IA32_APERF),
    SINGLE_MSR!(MSR_MTRRcap),
    SINGLE_MSR!(MSR_IA32_BBL_CR_CTL3),
    SINGLE_MSR!(MSR_IA32_SYSENTER_CS),
    SINGLE_MSR!(MSR_IA32_SYSENTER_ESP),
    SINGLE_MSR!(MSR_IA32_SYSENTER_EIP),
    SINGLE_MSR!(MSR_IA32_MCG_CAP),
    SINGLE_MSR!(MSR_IA32_MCG_STATUS),
    SINGLE_MSR!(MSR_IA32_MCG_CTL),
    SINGLE_MSR!(MSR_IA32_PERF_STATUS),
    SINGLE_MSR!(MSR_IA32_MISC_ENABLE),
    SINGLE_MSR!(MSR_MISC_FEATURE_CONTROL),
    SINGLE_MSR!(MSR_MISC_PWR_MGMT),
    SINGLE_MSR!(MSR_TURBO_RATIO_LIMIT),
    SINGLE_MSR!(MSR_TURBO_RATIO_LIMIT1),
    SINGLE_MSR!(MSR_IA32_DEBUGCTLMSR),
    SINGLE_MSR!(MSR_IA32_LASTBRANCHFROMIP),
    SINGLE_MSR!(MSR_IA32_LASTBRANCHTOIP),
    SINGLE_MSR!(MSR_IA32_LASTINTFROMIP),
    SINGLE_MSR!(MSR_IA32_LASTINTTOIP),
    SINGLE_MSR!(MSR_IA32_POWER_CTL),
    MSR_RANGE!(
        // IA32_MTRR_PHYSBASE0
        0x200, 0x100
    ),
    MSR_RANGE!(
        // MSR_CORE_C3_RESIDENCY
        // MSR_CORE_C6_RESIDENCY
        // MSR_CORE_C7_RESIDENCY
        MSR_CORE_C3_RESIDENCY,
        3
    ),
    MSR_RANGE!(MSR_IA32_MC0_CTL, 0x80),
    SINGLE_MSR!(MSR_RAPL_POWER_UNIT),
    MSR_RANGE!(
        // MSR_PKGC3_IRTL
        // MSR_PKGC6_IRTL
        // MSR_PKGC7_IRTL
        MSR_PKGC3_IRTL,
        3
    ),
    SINGLE_MSR!(MSR_PKG_POWER_LIMIT),
    SINGLE_MSR!(MSR_PKG_ENERGY_STATUS),
    SINGLE_MSR!(MSR_PKG_PERF_STATUS),
    SINGLE_MSR!(MSR_PKG_POWER_INFO),
    SINGLE_MSR!(MSR_DRAM_POWER_LIMIT),
    SINGLE_MSR!(MSR_DRAM_ENERGY_STATUS),
    SINGLE_MSR!(MSR_DRAM_PERF_STATUS),
    SINGLE_MSR!(MSR_DRAM_POWER_INFO),
    SINGLE_MSR!(MSR_CONFIG_TDP_NOMINAL),
    SINGLE_MSR!(MSR_CONFIG_TDP_LEVEL_1),
    SINGLE_MSR!(MSR_CONFIG_TDP_LEVEL_2),
    SINGLE_MSR!(MSR_CONFIG_TDP_CONTROL),
    SINGLE_MSR!(MSR_TURBO_ACTIVATION_RATIO),
    SINGLE_MSR!(MSR_IA32_TSCDEADLINE),
    MSR_RANGE!(APIC_BASE_MSR, APIC_MSR_INDEXES),
    SINGLE_MSR!(MSR_IA32_BNDCFGS),
    SINGLE_MSR!(MSR_KVM_WALL_CLOCK_NEW),
    SINGLE_MSR!(MSR_KVM_SYSTEM_TIME_NEW),
    SINGLE_MSR!(MSR_KVM_ASYNC_PF_EN),
    SINGLE_MSR!(MSR_KVM_STEAL_TIME),
    SINGLE_MSR!(MSR_KVM_PV_EOI_EN),
    SINGLE_MSR!(MSR_EFER),
    SINGLE_MSR!(MSR_STAR),
    SINGLE_MSR!(MSR_LSTAR),
    SINGLE_MSR!(MSR_CSTAR),
    SINGLE_MSR!(MSR_SYSCALL_MASK),
    SINGLE_MSR!(MSR_FS_BASE),
    SINGLE_MSR!(MSR_GS_BASE),
    SINGLE_MSR!(MSR_KERNEL_GS_BASE),
    SINGLE_MSR!(MSR_TSC_AUX),
];

/// Specifies whether a particular MSR should be included in vcpu serialization.
///
/// Returns `false` for the two explicitly blacklisted MSRs, and `true` only
/// when `index` falls inside one of the whitelisted ranges above.
///
/// # Arguments
///
/// * `index` - The index of the MSR that is checked whether it's needed for serialization.
pub fn msr_should_serialize(index: u32) -> bool {
    // Blacklisted MSRs not exported by Linux: IA32_FEATURE_CONTROL and IA32_MCG_CTL
    if index == MSR_IA32_FEATURE_CONTROL || index == MSR_IA32_MCG_CTL {
        return false;
    };
    WHITELISTED_MSR_RANGES
        .iter()
        .any(|range| range.contains(index))
}

/// Returns the list of supported, serializable MSRs.
///
/// # Arguments
///
/// * `kvm_fd` - Structure that holds the KVM's fd.
+pub fn supported_guest_msrs(kvm_fd: &Kvm) -> Result { + let mut msr_list = kvm_fd + .get_msr_index_list() + .map_err(Error::GetSupportedModelSpecificRegisters)?; + + msr_list.retain(|msr_index| msr_should_serialize(*msr_index)); + + Ok(msr_list) +} + +/// Base MSR for APIC +pub const APIC_BASE_MSR: u32 = 0x800; + +/// Number of APIC MSR indexes +pub const APIC_MSR_INDEXES: u32 = 0x400; + +/// Custom MSRs fall in the range 0x4b564d00-0x4b564dff +pub const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b56_4d00; +pub const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b56_4d01; +pub const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b56_4d02; +pub const MSR_KVM_STEAL_TIME: u32 = 0x4b56_4d03; +pub const MSR_KVM_PV_EOI_EN: u32 = 0x4b56_4d04; + +pub const MSR_EFER: u32 = 3221225600; +pub const MSR_STAR: u32 = 3221225601; +pub const MSR_LSTAR: u32 = 3221225602; +pub const MSR_CSTAR: u32 = 3221225603; +pub const MSR_SYSCALL_MASK: u32 = 3221225604; +pub const MSR_FS_BASE: u32 = 3221225728; +pub const MSR_GS_BASE: u32 = 3221225729; +pub const MSR_KERNEL_GS_BASE: u32 = 3221225730; +pub const MSR_TSC_AUX: u32 = 3221225731; +pub const _EFER_SCE: u32 = 0; +pub const _EFER_LME: u32 = 8; +pub const _EFER_LMA: u32 = 10; +pub const _EFER_NX: u32 = 11; +pub const _EFER_SVME: u32 = 12; +pub const _EFER_LMSLE: u32 = 13; +pub const _EFER_FFXSR: u32 = 14; +pub const EFER_SCE: u32 = 1; +pub const EFER_LME: u32 = 256; +pub const EFER_LMA: u32 = 1024; +pub const EFER_NX: u32 = 2048; +pub const EFER_SVME: u32 = 4096; +pub const EFER_LMSLE: u32 = 8192; +pub const EFER_FFXSR: u32 = 16384; +pub const MSR_IA32_SPEC_CTRL: u32 = 72; +pub const SPEC_CTRL_IBRS: u32 = 1; +pub const SPEC_CTRL_STIBP: u32 = 2; +pub const SPEC_CTRL_SSBD_SHIFT: u32 = 2; +pub const SPEC_CTRL_SSBD: u32 = 4; +pub const MSR_IA32_PRED_CMD: u32 = 73; +pub const PRED_CMD_IBPB: u32 = 1; +pub const MSR_IA32_PERFCTR0: u32 = 193; +pub const MSR_IA32_PERFCTR1: u32 = 194; +pub const MSR_FSB_FREQ: u32 = 205; +pub const MSR_PLATFORM_INFO: u32 = 206; +pub const 
MSR_NHM_SNB_PKG_CST_CFG_CTL: u32 = 226; +pub const NHM_C3_AUTO_DEMOTE: u32 = 33554432; +pub const NHM_C1_AUTO_DEMOTE: u32 = 67108864; +pub const ATM_LNC_C6_AUTO_DEMOTE: u32 = 33554432; +pub const SNB_C1_AUTO_UNDEMOTE: u32 = 134217728; +pub const SNB_C3_AUTO_UNDEMOTE: u32 = 268435456; +pub const MSR_MTRRcap: u32 = 254; +pub const MSR_IA32_ARCH_CAPABILITIES: u32 = 266; +pub const ARCH_CAP_RDCL_NO: u32 = 1; +pub const ARCH_CAP_IBRS_ALL: u32 = 2; +pub const ARCH_CAP_SKIP_VMENTRY_L1DFLUSH: u32 = 8; +pub const ARCH_CAP_SSB_NO: u32 = 16; +pub const MSR_IA32_FLUSH_CMD: u32 = 267; +pub const L1D_FLUSH: u32 = 1; +pub const MSR_PKG_CST_CONFIG_CONTROL: u32 = 226; +pub const MSR_IA32_BBL_CR_CTL: u32 = 281; +pub const MSR_IA32_BBL_CR_CTL3: u32 = 286; +pub const MSR_IA32_SYSENTER_CS: u32 = 372; +pub const MSR_IA32_SYSENTER_ESP: u32 = 373; +pub const MSR_IA32_SYSENTER_EIP: u32 = 374; +pub const MSR_IA32_MCG_CAP: u32 = 377; +pub const MSR_IA32_MCG_STATUS: u32 = 378; +pub const MSR_IA32_MCG_CTL: u32 = 379; +pub const MSR_IA32_MCG_EXT_CTL: u32 = 1232; +pub const MSR_OFFCORE_RSP_0: u32 = 422; +pub const MSR_OFFCORE_RSP_1: u32 = 423; +pub const MSR_TURBO_RATIO_LIMIT: u32 = 429; +pub const MSR_TURBO_RATIO_LIMIT1: u32 = 430; +pub const MSR_TURBO_RATIO_LIMIT2: u32 = 431; +pub const MSR_LBR_SELECT: u32 = 456; +pub const MSR_LBR_TOS: u32 = 457; +pub const MSR_LBR_NHM_FROM: u32 = 1664; +pub const MSR_LBR_NHM_TO: u32 = 1728; +pub const MSR_LBR_CORE_FROM: u32 = 64; +pub const MSR_LBR_CORE_TO: u32 = 96; +pub const MSR_LBR_INFO_0: u32 = 3520; +pub const LBR_INFO_CYCLES: u32 = 65535; +pub const MSR_IA32_PEBS_ENABLE: u32 = 1009; +pub const MSR_IA32_DS_AREA: u32 = 1536; +pub const MSR_IA32_PERF_CAPABILITIES: u32 = 837; +pub const MSR_PEBS_LD_LAT_THRESHOLD: u32 = 1014; +pub const MSR_IA32_RTIT_CTL: u32 = 1392; +pub const MSR_IA32_RTIT_STATUS: u32 = 1393; +pub const MSR_IA32_RTIT_ADDR0_A: u32 = 1408; +pub const MSR_IA32_RTIT_ADDR0_B: u32 = 1409; +pub const MSR_IA32_RTIT_ADDR1_A: u32 = 1410; +pub 
const MSR_IA32_RTIT_ADDR1_B: u32 = 1411; +pub const MSR_IA32_RTIT_ADDR2_A: u32 = 1412; +pub const MSR_IA32_RTIT_ADDR2_B: u32 = 1413; +pub const MSR_IA32_RTIT_ADDR3_A: u32 = 1414; +pub const MSR_IA32_RTIT_ADDR3_B: u32 = 1415; +pub const MSR_IA32_RTIT_CR3_MATCH: u32 = 1394; +pub const MSR_IA32_RTIT_OUTPUT_BASE: u32 = 1376; +pub const MSR_IA32_RTIT_OUTPUT_MASK: u32 = 1377; +pub const MSR_MTRRfix64K_00000: u32 = 592; +pub const MSR_MTRRfix16K_80000: u32 = 600; +pub const MSR_MTRRfix16K_A0000: u32 = 601; +pub const MSR_MTRRfix4K_C0000: u32 = 616; +pub const MSR_MTRRfix4K_C8000: u32 = 617; +pub const MSR_MTRRfix4K_D0000: u32 = 618; +pub const MSR_MTRRfix4K_D8000: u32 = 619; +pub const MSR_MTRRfix4K_E0000: u32 = 620; +pub const MSR_MTRRfix4K_E8000: u32 = 621; +pub const MSR_MTRRfix4K_F0000: u32 = 622; +pub const MSR_MTRRfix4K_F8000: u32 = 623; +pub const MSR_MTRRdefType: u32 = 767; +pub const MSR_IA32_CR_PAT: u32 = 631; +pub const MSR_IA32_DEBUGCTLMSR: u32 = 473; +pub const MSR_IA32_LASTBRANCHFROMIP: u32 = 475; +pub const MSR_IA32_LASTBRANCHTOIP: u32 = 476; +pub const MSR_IA32_LASTINTFROMIP: u32 = 477; +pub const MSR_IA32_LASTINTTOIP: u32 = 478; +pub const DEBUGCTLMSR_LBR: u32 = 1; +pub const DEBUGCTLMSR_BTF_SHIFT: u32 = 1; +pub const DEBUGCTLMSR_BTF: u32 = 2; +pub const DEBUGCTLMSR_TR: u32 = 64; +pub const DEBUGCTLMSR_BTS: u32 = 128; +pub const DEBUGCTLMSR_BTINT: u32 = 256; +pub const DEBUGCTLMSR_BTS_OFF_OS: u32 = 512; +pub const DEBUGCTLMSR_BTS_OFF_USR: u32 = 1024; +pub const DEBUGCTLMSR_FREEZE_LBRS_ON_PMI: u32 = 2048; +pub const MSR_PEBS_FRONTEND: u32 = 1015; +pub const MSR_IA32_POWER_CTL: u32 = 508; +pub const MSR_IA32_MC0_CTL: u32 = 1024; +pub const MSR_IA32_MC0_STATUS: u32 = 1025; +pub const MSR_IA32_MC0_ADDR: u32 = 1026; +pub const MSR_IA32_MC0_MISC: u32 = 1027; +pub const MSR_PKG_C3_RESIDENCY: u32 = 1016; +pub const MSR_PKG_C6_RESIDENCY: u32 = 1017; +pub const MSR_PKG_C7_RESIDENCY: u32 = 1018; +pub const MSR_CORE_C3_RESIDENCY: u32 = 1020; +pub const 
MSR_CORE_C6_RESIDENCY: u32 = 1021; +pub const MSR_CORE_C7_RESIDENCY: u32 = 1022; +pub const MSR_KNL_CORE_C6_RESIDENCY: u32 = 1023; +pub const MSR_PKG_C2_RESIDENCY: u32 = 1549; +pub const MSR_PKG_C8_RESIDENCY: u32 = 1584; +pub const MSR_PKG_C9_RESIDENCY: u32 = 1585; +pub const MSR_PKG_C10_RESIDENCY: u32 = 1586; +pub const MSR_PKGC3_IRTL: u32 = 1546; +pub const MSR_PKGC6_IRTL: u32 = 1547; +pub const MSR_PKGC7_IRTL: u32 = 1548; +pub const MSR_PKGC8_IRTL: u32 = 1587; +pub const MSR_PKGC9_IRTL: u32 = 1588; +pub const MSR_PKGC10_IRTL: u32 = 1589; +pub const MSR_RAPL_POWER_UNIT: u32 = 1542; +pub const MSR_PKG_POWER_LIMIT: u32 = 1552; +pub const MSR_PKG_ENERGY_STATUS: u32 = 1553; +pub const MSR_PKG_PERF_STATUS: u32 = 1555; +pub const MSR_PKG_POWER_INFO: u32 = 1556; +pub const MSR_DRAM_POWER_LIMIT: u32 = 1560; +pub const MSR_DRAM_ENERGY_STATUS: u32 = 1561; +pub const MSR_DRAM_PERF_STATUS: u32 = 1563; +pub const MSR_DRAM_POWER_INFO: u32 = 1564; +pub const MSR_PP0_POWER_LIMIT: u32 = 1592; +pub const MSR_PP0_ENERGY_STATUS: u32 = 1593; +pub const MSR_PP0_POLICY: u32 = 1594; +pub const MSR_PP0_PERF_STATUS: u32 = 1595; +pub const MSR_PP1_POWER_LIMIT: u32 = 1600; +pub const MSR_PP1_ENERGY_STATUS: u32 = 1601; +pub const MSR_PP1_POLICY: u32 = 1602; +pub const MSR_CONFIG_TDP_NOMINAL: u32 = 1608; +pub const MSR_CONFIG_TDP_LEVEL_1: u32 = 1609; +pub const MSR_CONFIG_TDP_LEVEL_2: u32 = 1610; +pub const MSR_CONFIG_TDP_CONTROL: u32 = 1611; +pub const MSR_TURBO_ACTIVATION_RATIO: u32 = 1612; +pub const MSR_PLATFORM_ENERGY_STATUS: u32 = 1613; +pub const MSR_PKG_WEIGHTED_CORE_C0_RES: u32 = 1624; +pub const MSR_PKG_ANY_CORE_C0_RES: u32 = 1625; +pub const MSR_PKG_ANY_GFXE_C0_RES: u32 = 1626; +pub const MSR_PKG_BOTH_CORE_GFXE_C0_RES: u32 = 1627; +pub const MSR_CORE_C1_RES: u32 = 1632; +pub const MSR_CC6_DEMOTION_POLICY_CONFIG: u32 = 1640; +pub const MSR_MC6_DEMOTION_POLICY_CONFIG: u32 = 1641; +pub const MSR_CORE_PERF_LIMIT_REASONS: u32 = 1680; +pub const MSR_GFX_PERF_LIMIT_REASONS: u32 = 1712; 
+pub const MSR_RING_PERF_LIMIT_REASONS: u32 = 1713; +pub const MSR_PPERF: u32 = 1614; +pub const MSR_PERF_LIMIT_REASONS: u32 = 1615; +pub const MSR_PM_ENABLE: u32 = 1904; +pub const MSR_HWP_CAPABILITIES: u32 = 1905; +pub const MSR_HWP_REQUEST_PKG: u32 = 1906; +pub const MSR_HWP_INTERRUPT: u32 = 1907; +pub const MSR_HWP_REQUEST: u32 = 1908; +pub const MSR_HWP_STATUS: u32 = 1911; +pub const HWP_BASE_BIT: u32 = 128; +pub const HWP_NOTIFICATIONS_BIT: u32 = 256; +pub const HWP_ACTIVITY_WINDOW_BIT: u32 = 512; +pub const HWP_ENERGY_PERF_PREFERENCE_BIT: u32 = 1024; +pub const HWP_PACKAGE_LEVEL_REQUEST_BIT: u32 = 2048; +pub const MSR_AMD64_MC0_MASK: u32 = 3221291076; +pub const MSR_IA32_MC0_CTL2: u32 = 640; +pub const MSR_P6_PERFCTR0: u32 = 193; +pub const MSR_P6_PERFCTR1: u32 = 194; +pub const MSR_P6_EVNTSEL0: u32 = 390; +pub const MSR_P6_EVNTSEL1: u32 = 391; +pub const MSR_KNC_PERFCTR0: u32 = 32; +pub const MSR_KNC_PERFCTR1: u32 = 33; +pub const MSR_KNC_EVNTSEL0: u32 = 40; +pub const MSR_KNC_EVNTSEL1: u32 = 41; +pub const MSR_IA32_PMC0: u32 = 1217; +pub const MSR_AMD64_PATCH_LEVEL: u32 = 139; +pub const MSR_AMD64_TSC_RATIO: u32 = 3221225732; +pub const MSR_AMD64_NB_CFG: u32 = 3221291039; +pub const MSR_AMD64_PATCH_LOADER: u32 = 3221291040; +pub const MSR_AMD64_OSVW_ID_LENGTH: u32 = 3221291328; +pub const MSR_AMD64_OSVW_STATUS: u32 = 3221291329; +pub const MSR_AMD64_LS_CFG: u32 = 3221295136; +pub const MSR_AMD64_DC_CFG: u32 = 3221295138; +pub const MSR_AMD64_BU_CFG2: u32 = 3221295146; +pub const MSR_AMD64_IBSFETCHCTL: u32 = 3221295152; +pub const MSR_AMD64_IBSFETCHLINAD: u32 = 3221295153; +pub const MSR_AMD64_IBSFETCHPHYSAD: u32 = 3221295154; +pub const MSR_AMD64_IBSFETCH_REG_COUNT: u32 = 3; +pub const MSR_AMD64_IBSFETCH_REG_MASK: u32 = 7; +pub const MSR_AMD64_IBSOPCTL: u32 = 3221295155; +pub const MSR_AMD64_IBSOPRIP: u32 = 3221295156; +pub const MSR_AMD64_IBSOPDATA: u32 = 3221295157; +pub const MSR_AMD64_IBSOPDATA2: u32 = 3221295158; +pub const MSR_AMD64_IBSOPDATA3: u32 = 
3221295159; +pub const MSR_AMD64_IBSDCLINAD: u32 = 3221295160; +pub const MSR_AMD64_IBSDCPHYSAD: u32 = 3221295161; +pub const MSR_AMD64_IBSOP_REG_COUNT: u32 = 7; +pub const MSR_AMD64_IBSOP_REG_MASK: u32 = 127; +pub const MSR_AMD64_IBSCTL: u32 = 3221295162; +pub const MSR_AMD64_IBSBRTARGET: u32 = 3221295163; +pub const MSR_AMD64_IBSOPDATA4: u32 = 3221295165; +pub const MSR_AMD64_IBS_REG_COUNT_MAX: u32 = 8; +pub const MSR_AMD64_VIRT_SPEC_CTRL: u32 = 3221291295; +pub const MSR_F17H_IRPERF: u32 = 3221225705; +pub const MSR_F16H_L2I_PERF_CTL: u32 = 3221291568; +pub const MSR_F16H_L2I_PERF_CTR: u32 = 3221291569; +pub const MSR_F16H_DR1_ADDR_MASK: u32 = 3221295129; +pub const MSR_F16H_DR2_ADDR_MASK: u32 = 3221295130; +pub const MSR_F16H_DR3_ADDR_MASK: u32 = 3221295131; +pub const MSR_F16H_DR0_ADDR_MASK: u32 = 3221295143; +pub const MSR_F15H_PERF_CTL: u32 = 3221291520; +pub const MSR_F15H_PERF_CTR: u32 = 3221291521; +pub const MSR_F15H_NB_PERF_CTL: u32 = 3221291584; +pub const MSR_F15H_NB_PERF_CTR: u32 = 3221291585; +pub const MSR_F15H_PTSC: u32 = 3221291648; +pub const MSR_F15H_IC_CFG: u32 = 3221295137; +pub const MSR_FAM10H_MMIO_CONF_BASE: u32 = 3221291096; +pub const FAM10H_MMIO_CONF_ENABLE: u32 = 1; +pub const FAM10H_MMIO_CONF_BUSRANGE_MASK: u32 = 15; +pub const FAM10H_MMIO_CONF_BUSRANGE_SHIFT: u32 = 2; +pub const FAM10H_MMIO_CONF_BASE_MASK: u32 = 268435455; +pub const FAM10H_MMIO_CONF_BASE_SHIFT: u32 = 20; +pub const MSR_FAM10H_NODE_ID: u32 = 3221295116; +pub const MSR_F10H_DECFG: u32 = 3221295145; +pub const MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT: u32 = 1; +pub const MSR_K8_TOP_MEM1: u32 = 3221291034; +pub const MSR_K8_TOP_MEM2: u32 = 3221291037; +pub const MSR_K8_SYSCFG: u32 = 3221291024; +pub const MSR_K8_INT_PENDING_MSG: u32 = 3221291093; +pub const K8_INTP_C1E_ACTIVE_MASK: u32 = 402653184; +pub const MSR_K8_TSEG_ADDR: u32 = 3221291282; +pub const MSR_K8_TSEG_MASK: u32 = 3221291283; +pub const K8_MTRRFIXRANGE_DRAM_ENABLE: u32 = 262144; +pub const 
K8_MTRRFIXRANGE_DRAM_MODIFY: u32 = 524288; +pub const K8_MTRR_RDMEM_WRMEM_MASK: u32 = 404232216; +pub const MSR_K7_EVNTSEL0: u32 = 3221291008; +pub const MSR_K7_PERFCTR0: u32 = 3221291012; +pub const MSR_K7_EVNTSEL1: u32 = 3221291009; +pub const MSR_K7_PERFCTR1: u32 = 3221291013; +pub const MSR_K7_EVNTSEL2: u32 = 3221291010; +pub const MSR_K7_PERFCTR2: u32 = 3221291014; +pub const MSR_K7_EVNTSEL3: u32 = 3221291011; +pub const MSR_K7_PERFCTR3: u32 = 3221291015; +pub const MSR_K7_CLK_CTL: u32 = 3221291035; +pub const MSR_K7_HWCR: u32 = 3221291029; +pub const MSR_K7_FID_VID_CTL: u32 = 3221291073; +pub const MSR_K7_FID_VID_STATUS: u32 = 3221291074; +pub const MSR_K6_WHCR: u32 = 3221225602; +pub const MSR_K6_UWCCR: u32 = 3221225605; +pub const MSR_K6_EPMR: u32 = 3221225606; +pub const MSR_K6_PSOR: u32 = 3221225607; +pub const MSR_K6_PFIR: u32 = 3221225608; +pub const MSR_IDT_FCR1: u32 = 263; +pub const MSR_IDT_FCR2: u32 = 264; +pub const MSR_IDT_FCR3: u32 = 265; +pub const MSR_IDT_FCR4: u32 = 266; +pub const MSR_IDT_MCR0: u32 = 272; +pub const MSR_IDT_MCR1: u32 = 273; +pub const MSR_IDT_MCR2: u32 = 274; +pub const MSR_IDT_MCR3: u32 = 275; +pub const MSR_IDT_MCR4: u32 = 276; +pub const MSR_IDT_MCR5: u32 = 277; +pub const MSR_IDT_MCR6: u32 = 278; +pub const MSR_IDT_MCR7: u32 = 279; +pub const MSR_IDT_MCR_CTRL: u32 = 288; +pub const MSR_VIA_FCR: u32 = 4359; +pub const MSR_VIA_LONGHAUL: u32 = 4362; +pub const MSR_VIA_RNG: u32 = 4363; +pub const MSR_VIA_BCR2: u32 = 4423; +pub const MSR_TMTA_LONGRUN_CTRL: u32 = 2156298256; +pub const MSR_TMTA_LONGRUN_FLAGS: u32 = 2156298257; +pub const MSR_TMTA_LRTI_READOUT: u32 = 2156298264; +pub const MSR_TMTA_LRTI_VOLT_MHZ: u32 = 2156298266; +pub const MSR_IA32_P5_MC_ADDR: u32 = 0; +pub const MSR_IA32_P5_MC_TYPE: u32 = 1; +pub const MSR_IA32_TSC: u32 = 16; +pub const MSR_IA32_PLATFORM_ID: u32 = 23; +pub const MSR_IA32_EBL_CR_POWERON: u32 = 42; +pub const MSR_EBC_FREQUENCY_ID: u32 = 44; +pub const MSR_SMI_COUNT: u32 = 52; +pub const 
MSR_IA32_FEATURE_CONTROL: u32 = 58; +pub const MSR_IA32_TSC_ADJUST: u32 = 59; +pub const MSR_IA32_BNDCFGS: u32 = 3472; +pub const MSR_IA32_BNDCFGS_RSVD: u32 = 4092; +pub const MSR_IA32_XSS: u32 = 3488; +pub const FEATURE_CONTROL_LOCKED: u32 = 1; +pub const FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX: u32 = 2; +pub const FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX: u32 = 4; +pub const FEATURE_CONTROL_LMCE: u32 = 1048576; +pub const MSR_IA32_APICBASE: u32 = 27; +pub const MSR_IA32_APICBASE_BSP: u32 = 256; +pub const MSR_IA32_APICBASE_ENABLE: u32 = 2048; +pub const MSR_IA32_APICBASE_BASE: u32 = 4294963200; +pub const MSR_IA32_TSCDEADLINE: u32 = 1760; +pub const MSR_IA32_UCODE_WRITE: u32 = 121; +pub const MSR_IA32_UCODE_REV: u32 = 139; +pub const MSR_IA32_SMM_MONITOR_CTL: u32 = 155; +pub const MSR_IA32_SMBASE: u32 = 158; +pub const MSR_IA32_PERF_STATUS: u32 = 408; +pub const MSR_IA32_PERF_CTL: u32 = 409; +pub const INTEL_PERF_CTL_MASK: u32 = 65535; +pub const MSR_AMD_PSTATE_DEF_BASE: u32 = 3221291108; +pub const MSR_AMD_PERF_STATUS: u32 = 3221291107; +pub const MSR_AMD_PERF_CTL: u32 = 3221291106; +pub const MSR_IA32_MPERF: u32 = 231; +pub const MSR_IA32_APERF: u32 = 232; +pub const MSR_IA32_THERM_CONTROL: u32 = 410; +pub const MSR_IA32_THERM_INTERRUPT: u32 = 411; +pub const THERM_INT_HIGH_ENABLE: u32 = 1; +pub const THERM_INT_LOW_ENABLE: u32 = 2; +pub const THERM_INT_PLN_ENABLE: u32 = 16777216; +pub const MSR_IA32_THERM_STATUS: u32 = 412; +pub const THERM_STATUS_PROCHOT: u32 = 1; +pub const THERM_STATUS_POWER_LIMIT: u32 = 1024; +pub const MSR_THERM2_CTL: u32 = 413; +pub const MSR_THERM2_CTL_TM_SELECT: u32 = 65536; +pub const MSR_IA32_MISC_ENABLE: u32 = 416; +pub const MSR_IA32_TEMPERATURE_TARGET: u32 = 418; +pub const MSR_MISC_FEATURE_CONTROL: u32 = 420; +pub const MSR_MISC_PWR_MGMT: u32 = 426; +pub const MSR_IA32_ENERGY_PERF_BIAS: u32 = 432; +pub const ENERGY_PERF_BIAS_PERFORMANCE: u32 = 0; +pub const ENERGY_PERF_BIAS_NORMAL: u32 = 6; +pub const ENERGY_PERF_BIAS_POWERSAVE: 
u32 = 15; +pub const MSR_IA32_PACKAGE_THERM_STATUS: u32 = 433; +pub const PACKAGE_THERM_STATUS_PROCHOT: u32 = 1; +pub const PACKAGE_THERM_STATUS_POWER_LIMIT: u32 = 1024; +pub const MSR_IA32_PACKAGE_THERM_INTERRUPT: u32 = 434; +pub const PACKAGE_THERM_INT_HIGH_ENABLE: u32 = 1; +pub const PACKAGE_THERM_INT_LOW_ENABLE: u32 = 2; +pub const PACKAGE_THERM_INT_PLN_ENABLE: u32 = 16777216; +pub const THERM_INT_THRESHOLD0_ENABLE: u32 = 32768; +pub const THERM_SHIFT_THRESHOLD0: u32 = 8; +pub const THERM_MASK_THRESHOLD0: u32 = 32512; +pub const THERM_INT_THRESHOLD1_ENABLE: u32 = 8388608; +pub const THERM_SHIFT_THRESHOLD1: u32 = 16; +pub const THERM_MASK_THRESHOLD1: u32 = 8323072; +pub const THERM_STATUS_THRESHOLD0: u32 = 64; +pub const THERM_LOG_THRESHOLD0: u32 = 128; +pub const THERM_STATUS_THRESHOLD1: u32 = 256; +pub const THERM_LOG_THRESHOLD1: u32 = 512; +pub const MSR_IA32_MISC_ENABLE_FAST_STRING_BIT: u32 = 0; +pub const MSR_IA32_MISC_ENABLE_FAST_STRING: u32 = 1; +pub const MSR_IA32_MISC_ENABLE_TCC_BIT: u32 = 1; +pub const MSR_IA32_MISC_ENABLE_TCC: u32 = 2; +pub const MSR_IA32_MISC_ENABLE_EMON_BIT: u32 = 7; +pub const MSR_IA32_MISC_ENABLE_EMON: u32 = 128; +pub const MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT: u32 = 11; +pub const MSR_IA32_MISC_ENABLE_BTS_UNAVAIL: u32 = 2048; +pub const MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT: u32 = 12; +pub const MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL: u32 = 4096; +pub const MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT: u32 = 16; +pub const MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP: u32 = 65536; +pub const MSR_IA32_MISC_ENABLE_MWAIT_BIT: u32 = 18; +pub const MSR_IA32_MISC_ENABLE_MWAIT: u32 = 262144; +pub const MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT: u32 = 22; +pub const MSR_IA32_MISC_ENABLE_LIMIT_CPUID: u32 = 4194304; +pub const MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT: u32 = 23; +pub const MSR_IA32_MISC_ENABLE_XTPR_DISABLE: u32 = 8388608; +pub const MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT: u32 = 34; +pub const MSR_IA32_MISC_ENABLE_XD_DISABLE: u64 = 17179869184; 
+pub const MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT: u32 = 2; +pub const MSR_IA32_MISC_ENABLE_X87_COMPAT: u32 = 4; +pub const MSR_IA32_MISC_ENABLE_TM1_BIT: u32 = 3; +pub const MSR_IA32_MISC_ENABLE_TM1: u32 = 8; +pub const MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT: u32 = 4; +pub const MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE: u32 = 16; +pub const MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT: u32 = 6; +pub const MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE: u32 = 64; +pub const MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT: u32 = 8; +pub const MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK: u32 = 256; +pub const MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT: u32 = 9; +pub const MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE: u32 = 512; +pub const MSR_IA32_MISC_ENABLE_FERR_BIT: u32 = 10; +pub const MSR_IA32_MISC_ENABLE_FERR: u32 = 1024; +pub const MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT: u32 = 10; +pub const MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX: u32 = 1024; +pub const MSR_IA32_MISC_ENABLE_TM2_BIT: u32 = 13; +pub const MSR_IA32_MISC_ENABLE_TM2: u32 = 8192; +pub const MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT: u32 = 19; +pub const MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE: u32 = 524288; +pub const MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT: u32 = 20; +pub const MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK: u32 = 1048576; +pub const MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT: u32 = 24; +pub const MSR_IA32_MISC_ENABLE_L1D_CONTEXT: u32 = 16777216; +pub const MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT: u32 = 37; +pub const MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE: u64 = 137438953472; +pub const MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT: u32 = 38; +pub const MSR_IA32_MISC_ENABLE_TURBO_DISABLE: u64 = 274877906944; +pub const MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT: u32 = 39; +pub const MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE: u64 = 549755813888; +pub const MSR_IA32_TSC_DEADLINE: u32 = 1760; +pub const MSR_TSX_FORCE_ABORT: u32 = 271; +pub const MSR_TFA_RTM_FORCE_ABORT_BIT: u32 = 0; +pub const MSR_IA32_MCG_EAX: u32 = 384; +pub const MSR_IA32_MCG_EBX: u32 = 
385; +pub const MSR_IA32_MCG_ECX: u32 = 386; +pub const MSR_IA32_MCG_EDX: u32 = 387; +pub const MSR_IA32_MCG_ESI: u32 = 388; +pub const MSR_IA32_MCG_EDI: u32 = 389; +pub const MSR_IA32_MCG_EBP: u32 = 390; +pub const MSR_IA32_MCG_ESP: u32 = 391; +pub const MSR_IA32_MCG_EFLAGS: u32 = 392; +pub const MSR_IA32_MCG_EIP: u32 = 393; +pub const MSR_IA32_MCG_RESERVED: u32 = 394; +pub const MSR_P4_BPU_PERFCTR0: u32 = 768; +pub const MSR_P4_BPU_PERFCTR1: u32 = 769; +pub const MSR_P4_BPU_PERFCTR2: u32 = 770; +pub const MSR_P4_BPU_PERFCTR3: u32 = 771; +pub const MSR_P4_MS_PERFCTR0: u32 = 772; +pub const MSR_P4_MS_PERFCTR1: u32 = 773; +pub const MSR_P4_MS_PERFCTR2: u32 = 774; +pub const MSR_P4_MS_PERFCTR3: u32 = 775; +pub const MSR_P4_FLAME_PERFCTR0: u32 = 776; +pub const MSR_P4_FLAME_PERFCTR1: u32 = 777; +pub const MSR_P4_FLAME_PERFCTR2: u32 = 778; +pub const MSR_P4_FLAME_PERFCTR3: u32 = 779; +pub const MSR_P4_IQ_PERFCTR0: u32 = 780; +pub const MSR_P4_IQ_PERFCTR1: u32 = 781; +pub const MSR_P4_IQ_PERFCTR2: u32 = 782; +pub const MSR_P4_IQ_PERFCTR3: u32 = 783; +pub const MSR_P4_IQ_PERFCTR4: u32 = 784; +pub const MSR_P4_IQ_PERFCTR5: u32 = 785; +pub const MSR_P4_BPU_CCCR0: u32 = 864; +pub const MSR_P4_BPU_CCCR1: u32 = 865; +pub const MSR_P4_BPU_CCCR2: u32 = 866; +pub const MSR_P4_BPU_CCCR3: u32 = 867; +pub const MSR_P4_MS_CCCR0: u32 = 868; +pub const MSR_P4_MS_CCCR1: u32 = 869; +pub const MSR_P4_MS_CCCR2: u32 = 870; +pub const MSR_P4_MS_CCCR3: u32 = 871; +pub const MSR_P4_FLAME_CCCR0: u32 = 872; +pub const MSR_P4_FLAME_CCCR1: u32 = 873; +pub const MSR_P4_FLAME_CCCR2: u32 = 874; +pub const MSR_P4_FLAME_CCCR3: u32 = 875; +pub const MSR_P4_IQ_CCCR0: u32 = 876; +pub const MSR_P4_IQ_CCCR1: u32 = 877; +pub const MSR_P4_IQ_CCCR2: u32 = 878; +pub const MSR_P4_IQ_CCCR3: u32 = 879; +pub const MSR_P4_IQ_CCCR4: u32 = 880; +pub const MSR_P4_IQ_CCCR5: u32 = 881; +pub const MSR_P4_ALF_ESCR0: u32 = 970; +pub const MSR_P4_ALF_ESCR1: u32 = 971; +pub const MSR_P4_BPU_ESCR0: u32 = 946; +pub const 
MSR_P4_BPU_ESCR1: u32 = 947; +pub const MSR_P4_BSU_ESCR0: u32 = 928; +pub const MSR_P4_BSU_ESCR1: u32 = 929; +pub const MSR_P4_CRU_ESCR0: u32 = 952; +pub const MSR_P4_CRU_ESCR1: u32 = 953; +pub const MSR_P4_CRU_ESCR2: u32 = 972; +pub const MSR_P4_CRU_ESCR3: u32 = 973; +pub const MSR_P4_CRU_ESCR4: u32 = 992; +pub const MSR_P4_CRU_ESCR5: u32 = 993; +pub const MSR_P4_DAC_ESCR0: u32 = 936; +pub const MSR_P4_DAC_ESCR1: u32 = 937; +pub const MSR_P4_FIRM_ESCR0: u32 = 932; +pub const MSR_P4_FIRM_ESCR1: u32 = 933; +pub const MSR_P4_FLAME_ESCR0: u32 = 934; +pub const MSR_P4_FLAME_ESCR1: u32 = 935; +pub const MSR_P4_FSB_ESCR0: u32 = 930; +pub const MSR_P4_FSB_ESCR1: u32 = 931; +pub const MSR_P4_IQ_ESCR0: u32 = 954; +pub const MSR_P4_IQ_ESCR1: u32 = 955; +pub const MSR_P4_IS_ESCR0: u32 = 948; +pub const MSR_P4_IS_ESCR1: u32 = 949; +pub const MSR_P4_ITLB_ESCR0: u32 = 950; +pub const MSR_P4_ITLB_ESCR1: u32 = 951; +pub const MSR_P4_IX_ESCR0: u32 = 968; +pub const MSR_P4_IX_ESCR1: u32 = 969; +pub const MSR_P4_MOB_ESCR0: u32 = 938; +pub const MSR_P4_MOB_ESCR1: u32 = 939; +pub const MSR_P4_MS_ESCR0: u32 = 960; +pub const MSR_P4_MS_ESCR1: u32 = 961; +pub const MSR_P4_PMH_ESCR0: u32 = 940; +pub const MSR_P4_PMH_ESCR1: u32 = 941; +pub const MSR_P4_RAT_ESCR0: u32 = 956; +pub const MSR_P4_RAT_ESCR1: u32 = 957; +pub const MSR_P4_SAAT_ESCR0: u32 = 942; +pub const MSR_P4_SAAT_ESCR1: u32 = 943; +pub const MSR_P4_SSU_ESCR0: u32 = 958; +pub const MSR_P4_SSU_ESCR1: u32 = 959; +pub const MSR_P4_TBPU_ESCR0: u32 = 962; +pub const MSR_P4_TBPU_ESCR1: u32 = 963; +pub const MSR_P4_TC_ESCR0: u32 = 964; +pub const MSR_P4_TC_ESCR1: u32 = 965; +pub const MSR_P4_U2L_ESCR0: u32 = 944; +pub const MSR_P4_U2L_ESCR1: u32 = 945; +pub const MSR_P4_PEBS_MATRIX_VERT: u32 = 1010; +pub const MSR_CORE_PERF_FIXED_CTR0: u32 = 777; +pub const MSR_CORE_PERF_FIXED_CTR1: u32 = 778; +pub const MSR_CORE_PERF_FIXED_CTR2: u32 = 779; +pub const MSR_CORE_PERF_FIXED_CTR_CTRL: u32 = 909; +pub const MSR_CORE_PERF_GLOBAL_STATUS: u32 
= 910; +pub const MSR_CORE_PERF_GLOBAL_CTRL: u32 = 911; +pub const MSR_CORE_PERF_GLOBAL_OVF_CTRL: u32 = 912; +pub const MSR_GEODE_BUSCONT_CONF0: u32 = 6400; +pub const MSR_IA32_VMX_BASIC: u32 = 1152; +pub const MSR_IA32_VMX_PINBASED_CTLS: u32 = 1153; +pub const MSR_IA32_VMX_PROCBASED_CTLS: u32 = 1154; +pub const MSR_IA32_VMX_EXIT_CTLS: u32 = 1155; +pub const MSR_IA32_VMX_ENTRY_CTLS: u32 = 1156; +pub const MSR_IA32_VMX_MISC: u32 = 1157; +pub const MSR_IA32_VMX_CR0_FIXED0: u32 = 1158; +pub const MSR_IA32_VMX_CR0_FIXED1: u32 = 1159; +pub const MSR_IA32_VMX_CR4_FIXED0: u32 = 1160; +pub const MSR_IA32_VMX_CR4_FIXED1: u32 = 1161; +pub const MSR_IA32_VMX_VMCS_ENUM: u32 = 1162; +pub const MSR_IA32_VMX_PROCBASED_CTLS2: u32 = 1163; +pub const MSR_IA32_VMX_EPT_VPID_CAP: u32 = 1164; +pub const MSR_IA32_VMX_TRUE_PINBASED_CTLS: u32 = 1165; +pub const MSR_IA32_VMX_TRUE_PROCBASED_CTLS: u32 = 1166; +pub const MSR_IA32_VMX_TRUE_EXIT_CTLS: u32 = 1167; +pub const MSR_IA32_VMX_TRUE_ENTRY_CTLS: u32 = 1168; +pub const MSR_IA32_VMX_VMFUNC: u32 = 1169; +pub const VMX_BASIC_VMCS_SIZE_SHIFT: u32 = 32; +pub const VMX_BASIC_TRUE_CTLS: u64 = 36028797018963968; +pub const VMX_BASIC_64: u64 = 281474976710656; +pub const VMX_BASIC_MEM_TYPE_SHIFT: u32 = 50; +pub const VMX_BASIC_MEM_TYPE_MASK: u64 = 16888498602639360; +pub const VMX_BASIC_MEM_TYPE_WB: u32 = 6; +pub const VMX_BASIC_INOUT: u64 = 18014398509481984; +pub const MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS: u32 = 536870912; +pub const MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE: u32 = 31; +pub const MSR_VM_CR: u32 = 3221291284; +pub const MSR_VM_IGNNE: u32 = 3221291285; +pub const MSR_VM_HSAVE_PA: u32 = 3221291287; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_msr_whitelist() { + for range in WHITELISTED_MSR_RANGES.iter() { + for msr in range.base..(range.base + range.nmsrs) { + let should = !matches!(msr, MSR_IA32_FEATURE_CONTROL | MSR_IA32_MCG_CTL); + assert_eq!(msr_should_serialize(msr), should); + } + } + } + + #[test] 
+ fn test_msr_contains() { + let msr_range_a = MSR_RANGE!(0xEA, 9); + let msr_a = 0x8888; + assert!(!msr_range_a.contains(msr_a)); + + let msr_range_b = MSR_RANGE!(0xCCCC, 5); + let msr_b = 0xCCCD; + assert!(msr_range_b.contains(msr_b)); + } + + fn test_supported_msrs() { + let kvm = Kvm::new().unwrap(); + assert!(supported_guest_msrs(&kvm).is_ok()); + } +} diff --git a/src/dragonball/src/dbs_arch/src/x86_64/regs.rs b/src/dragonball/src/dbs_arch/src/x86_64/regs.rs new file mode 100644 index 000000000..ca04e887e --- /dev/null +++ b/src/dragonball/src/dbs_arch/src/x86_64/regs.rs @@ -0,0 +1,402 @@ +// Copyright 2021-2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Constants and utilities for x86 CPU generic, system and model specific registers. + +use std::mem; + +use kvm_bindings::{kvm_fpu, kvm_msr_entry, kvm_regs, kvm_sregs, Msrs}; +use kvm_ioctls::VcpuFd; +use vm_memory::{Address, Bytes, GuestAddress, GuestMemory}; + +use super::gdt::kvm_segment_from_gdt; +use super::msr; + +/// Non-Executable bit in EFER MSR. +pub const EFER_NX: u64 = 0x800; +/// Long-mode active bit in EFER MSR. +pub const EFER_LMA: u64 = 0x400; +/// Long-mode enable bit in EFER MSR. +pub const EFER_LME: u64 = 0x100; + +/// Protection mode enable bit in CR0. +pub const X86_CR0_PE: u64 = 0x1; +/// Paging enable bit in CR0. +pub const X86_CR0_PG: u64 = 0x8000_0000; +/// Physical Address Extension bit in CR4. +pub const X86_CR4_PAE: u64 = 0x20; + +/// Errors thrown while setting up x86_64 registers. +#[derive(Debug)] +pub enum Error { + /// Failed to get SREGs for this CPU. + GetStatusRegisters(kvm_ioctls::Error), + /// Failed to set base registers for this CPU. 
+ SetBaseRegisters(kvm_ioctls::Error), + /// Failed to configure the FPU. + SetFPURegisters(kvm_ioctls::Error), + /// Setting up MSRs failed. + SetModelSpecificRegisters(kvm_ioctls::Error), + /// Failed to set all MSRs. + SetModelSpecificRegistersCount, + /// Failed to set SREGs for this CPU. + SetStatusRegisters(kvm_ioctls::Error), + /// Writing the GDT to RAM failed. + WriteGDT, + /// Writing the IDT to RAM failed. + WriteIDT, +} + +type Result = std::result::Result; + +/// Configure Floating-Point Unit (FPU) registers for a given CPU. +/// +/// # Arguments +/// +/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +pub fn setup_fpu(vcpu: &VcpuFd) -> Result<()> { + let fpu: kvm_fpu = kvm_fpu { + fcw: 0x37f, + mxcsr: 0x1f80, + ..Default::default() + }; + + vcpu.set_fpu(&fpu).map_err(Error::SetFPURegisters) +} + +/// Configure Model Specific Registers (MSRs) for a given CPU. +/// +/// # Arguments +/// +/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +pub fn setup_msrs(vcpu: &VcpuFd) -> Result<()> { + let entry_vec = create_msr_entries(); + let kvm_msrs = + Msrs::from_entries(&entry_vec).map_err(|_| Error::SetModelSpecificRegistersCount)?; + + vcpu.set_msrs(&kvm_msrs) + .map_err(Error::SetModelSpecificRegisters) + .and_then(|msrs_written| { + if msrs_written as u32 != kvm_msrs.as_fam_struct_ref().nmsrs { + Err(Error::SetModelSpecificRegistersCount) + } else { + Ok(msrs_written) + } + })?; + Ok(()) +} + +/// Configure base registers for a given CPU. +/// +/// # Arguments +/// +/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +/// * `boot_ip` - Starting instruction pointer. 
+/// * `rsp` - Value for RSP register +/// * `rbp` - Value for RBP register +/// * `rsi` - Value for RSI register +pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64, rsp: u64, rbp: u64, rsi: u64) -> Result<()> { + let regs: kvm_regs = kvm_regs { + rflags: 0x0000_0000_0000_0002u64, + rip: boot_ip, + rsp, + rbp, + rsi, + ..Default::default() + }; + + vcpu.set_regs(®s).map_err(Error::SetBaseRegisters) +} + +/// Configures the segment registers for a given CPU. +/// +/// # Arguments +/// +/// * `mem` - The memory that will be passed to the guest. +/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. +/// * `pgtable_addr` - Address of the vcpu pgtable. +/// * `gdt_table` - Content of the global descriptor table. +/// * `gdt_addr` - Address of the global descriptor table. +/// * `idt_addr` - Address of the interrupt descriptor table. +pub fn setup_sregs( + mem: &M, + vcpu: &VcpuFd, + pgtable_addr: GuestAddress, + gdt_table: &[u64], + gdt_addr: u64, + idt_addr: u64, +) -> Result<()> { + let mut sregs: kvm_sregs = vcpu.get_sregs().map_err(Error::GetStatusRegisters)?; + configure_segments_and_sregs(mem, &mut sregs, pgtable_addr, gdt_table, gdt_addr, idt_addr)?; + vcpu.set_sregs(&sregs).map_err(Error::SetStatusRegisters) +} + +fn configure_segments_and_sregs( + mem: &M, + sregs: &mut kvm_sregs, + pgtable_addr: GuestAddress, + gdt_table: &[u64], + gdt_addr: u64, + idt_addr: u64, +) -> Result<()> { + assert!(gdt_table.len() >= 4); + let code_seg = kvm_segment_from_gdt(gdt_table[1], 1); + let data_seg = kvm_segment_from_gdt(gdt_table[2], 2); + let tss_seg = kvm_segment_from_gdt(gdt_table[3], 3); + + // Write segments + write_gdt_table(gdt_table, gdt_addr, mem)?; + sregs.gdt.base = gdt_addr; + sregs.gdt.limit = std::mem::size_of_val(gdt_table) as u16 - 1; + + write_idt_value(0, idt_addr, mem)?; + sregs.idt.base = idt_addr; + sregs.idt.limit = mem::size_of::() as u16 - 1; + + sregs.cs = code_seg; + sregs.ds = data_seg; + sregs.es = data_seg; + sregs.fs = data_seg; + 
sregs.gs = data_seg; + sregs.ss = data_seg; + sregs.tr = tss_seg; + + /* 64-bit protected mode */ + sregs.cr0 |= X86_CR0_PE; + sregs.cr3 = pgtable_addr.raw_value(); + sregs.cr4 |= X86_CR4_PAE; + sregs.cr0 |= X86_CR0_PG; + sregs.efer |= EFER_LME | EFER_LMA; + + Ok(()) +} + +fn write_gdt_table(gdt_table: &[u64], gdt_addr: u64, guest_mem: &M) -> Result<()> { + let boot_gdt_addr = GuestAddress(gdt_addr); + for (index, entry) in gdt_table.iter().enumerate() { + let addr = guest_mem + .checked_offset(boot_gdt_addr, index * mem::size_of::()) + .ok_or(Error::WriteGDT)?; + guest_mem + .write_obj(*entry, addr) + .map_err(|_| Error::WriteGDT)?; + } + Ok(()) +} + +fn write_idt_value(idt_table: u64, idt_addr: u64, guest_mem: &M) -> Result<()> { + let boot_idt_addr = GuestAddress(idt_addr); + guest_mem + .write_obj(idt_table, boot_idt_addr) + .map_err(|_| Error::WriteIDT) +} + +#[allow(clippy::vec_init_then_push)] +fn create_msr_entries() -> Vec { + let mut entries = Vec::::new(); + + entries.push(kvm_msr_entry { + index: msr::MSR_IA32_SYSENTER_CS, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_IA32_SYSENTER_ESP, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_IA32_SYSENTER_EIP, + data: 0x0, + ..Default::default() + }); + // x86_64 specific msrs, we only run on x86_64 not x86. 
+ entries.push(kvm_msr_entry { + index: msr::MSR_STAR, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_CSTAR, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_KERNEL_GS_BASE, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_SYSCALL_MASK, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_LSTAR, + data: 0x0, + ..Default::default() + }); + // end of x86_64 specific code + entries.push(kvm_msr_entry { + index: msr::MSR_IA32_TSC, + data: 0x0, + ..Default::default() + }); + entries.push(kvm_msr_entry { + index: msr::MSR_IA32_MISC_ENABLE, + data: u64::from(msr::MSR_IA32_MISC_ENABLE_FAST_STRING), + ..Default::default() + }); + + entries +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::x86_64::gdt::gdt_entry; + use kvm_ioctls::Kvm; + use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap}; + + const BOOT_GDT_OFFSET: u64 = 0x500; + const BOOT_IDT_OFFSET: u64 = 0x520; + const BOOT_STACK_POINTER: u64 = 0x100_0000; + const ZERO_PAGE_START: u64 = 0x7_C000; + const BOOT_GDT_MAX: usize = 4; + const PML4_START: u64 = 0x9000; + + fn create_guest_mem() -> GuestMemoryMmap { + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap() + } + + fn read_u64(gm: &GuestMemoryMmap, offset: u64) -> u64 { + let read_addr = GuestAddress(offset); + gm.read_obj(read_addr).unwrap() + } + + fn validate_segments_and_sregs(gm: &GuestMemoryMmap, sregs: &kvm_sregs) { + assert_eq!(0x0, read_u64(gm, BOOT_GDT_OFFSET)); + assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8)); + assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16)); + assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24)); + assert_eq!(0x0, read_u64(gm, BOOT_IDT_OFFSET)); + + assert_eq!(0, sregs.cs.base); + assert_eq!(0xfffff, sregs.ds.limit); + assert_eq!(0x10, sregs.es.selector); + assert_eq!(1, sregs.fs.present); 
+ assert_eq!(1, sregs.gs.g); + assert_eq!(0, sregs.ss.avl); + assert_eq!(0, sregs.tr.base); + assert_eq!(0xfffff, sregs.tr.limit); + assert_eq!(0, sregs.tr.avl); + assert!(sregs.cr0 & X86_CR0_PE != 0); + assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0); + } + + #[test] + fn test_configure_segments_and_sregs() { + let mut sregs: kvm_sregs = Default::default(); + let gm = create_guest_mem(); + let gdt_table: [u64; BOOT_GDT_MAX] = [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ]; + configure_segments_and_sregs( + &gm, + &mut sregs, + GuestAddress(PML4_START), + &gdt_table, + BOOT_GDT_OFFSET, + BOOT_IDT_OFFSET, + ) + .unwrap(); + + validate_segments_and_sregs(&gm, &sregs); + } + + #[test] + fn test_setup_fpu() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + setup_fpu(&vcpu).unwrap(); + + let expected_fpu: kvm_fpu = kvm_fpu { + fcw: 0x37f, + mxcsr: 0x1f80, + ..Default::default() + }; + let actual_fpu: kvm_fpu = vcpu.get_fpu().unwrap(); + assert_eq!(expected_fpu.fcw, actual_fpu.fcw); + // Setting the mxcsr register from kvm_fpu inside setup_fpu does not influence anything. + // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c. + // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should + // remove it at all. + // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr); + } + + #[test] + #[allow(clippy::cast_ptr_alignment)] + fn test_setup_msrs() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + setup_msrs(&vcpu).unwrap(); + + // This test will check against the last MSR entry configured (the tenth one). + // See create_msr_entries() for details. 
+ let test_kvm_msrs_entry = [kvm_msr_entry { + index: msr::MSR_IA32_MISC_ENABLE, + ..Default::default() + }]; + let mut kvm_msrs = Msrs::from_entries(&test_kvm_msrs_entry).unwrap(); + + // kvm_ioctls::get_msrs() returns the number of msrs that it succeeded in reading. + // We only want to read one in this test case scenario. + let read_nmsrs = vcpu.get_msrs(&mut kvm_msrs).unwrap(); + // Validate it only read one. + assert_eq!(read_nmsrs, 1); + + // Official entries that were setup when we did setup_msrs. We need to assert that the + // tenth one (i.e the one with index msr_index::MSR_IA32_MISC_ENABLE has the data we + // expect. + let entry_vec = create_msr_entries(); + assert_eq!(entry_vec[9], kvm_msrs.as_slice()[0]); + } + + #[test] + fn test_setup_regs() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + + let expected_regs: kvm_regs = kvm_regs { + rflags: 0x0000_0000_0000_0002u64, + rip: 1, + rsp: BOOT_STACK_POINTER, + rbp: BOOT_STACK_POINTER, + rsi: ZERO_PAGE_START, + ..Default::default() + }; + + setup_regs( + &vcpu, + expected_regs.rip, + BOOT_STACK_POINTER, + BOOT_STACK_POINTER, + ZERO_PAGE_START, + ) + .unwrap(); + + let actual_regs: kvm_regs = vcpu.get_regs().unwrap(); + assert_eq!(actual_regs, expected_regs); + } +} diff --git a/src/dragonball/src/dbs_boot/Cargo.toml b/src/dragonball/src/dbs_boot/Cargo.toml new file mode 100644 index 000000000..7216795d5 --- /dev/null +++ b/src/dragonball/src/dbs_boot/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "dbs-boot" +version = "0.4.0" +authors = ["Alibaba Dragonball Team"] +description = "Traits and structs for booting sandbox" +license = "Apache-2.0 AND BSD-3-Clause" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "boot", "VMM"] +readme = "README.md" + +[dependencies] +dbs-arch = { path = "../dbs_arch" } +kvm-bindings = { 
version = "0.6.0", features = ["fam-wrappers"] } +kvm-ioctls = "0.12.0" +lazy_static = "1" +libc = "0.2.39" +thiserror = "1" +vm-memory = "0.9.0" +vm-fdt = "0.2.0" + +[dev-dependencies] +vm-memory = { version = "0.9.0", features = ["backend-mmap"] } +device_tree = ">=1.1.0" +dbs-device = { path = "../dbs_device" } diff --git a/src/dragonball/src/dbs_boot/LICENSE b/src/dragonball/src/dbs_boot/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_boot/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_boot/README.md b/src/dragonball/src/dbs_boot/README.md new file mode 100644 index 000000000..a55842d18 --- /dev/null +++ b/src/dragonball/src/dbs_boot/README.md @@ -0,0 +1,24 @@ +# dbs-boot + +## Design + +The `dbs-boot` crate is a collection of constants, structs and utilities used to boot virtual machines. + +## Submodule List + +This repository contains the following submodules: +| Name | Arch| Description | +| --- | --- | --- | +| [`bootparam`](src/x86_64/bootparam.rs) | x86_64 | Magic addresses externally used to lay out x86_64 VMs | +| [fdt](src/aarch64/fdt.rs) | aarch64| Create FDT for Aarch64 systems | +| [layout](src/x86_64/layout.rs) | x86_64 | x86_64 layout constants | +| [layout](src/aarch64/layout.rs/) | aarch64 | aarch64 layout constants | +| [mptable](src/x86_64/mptable.rs) | x86_64 | MP Table configurations used for defining VM boot status | + +## Acknowledgement + +Part of the code is derived from the [Firecracker](https://github.com/firecracker-microvm/firecracker) project. + +## License + +This project is licensed under [Apache License](http://www.apache.org/licenses/LICENSE-2.0), Version 2.0. 
diff --git a/src/dragonball/src/dbs_boot/THIRD-PARTY b/src/dragonball/src/dbs_boot/THIRD-PARTY new file mode 120000 index 000000000..301d0a498 --- /dev/null +++ b/src/dragonball/src/dbs_boot/THIRD-PARTY @@ -0,0 +1 @@ +../../THIRD-PARTY \ No newline at end of file diff --git a/src/dragonball/src/dbs_boot/src/aarch64/fdt.rs b/src/dragonball/src/dbs_boot/src/aarch64/fdt.rs new file mode 100644 index 000000000..6d6eeaaf3 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/aarch64/fdt.rs @@ -0,0 +1,608 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Create Flatten Device Tree (FDT) for ARM64 systems. + +use std::collections::HashMap; +use std::fmt::Debug; + +use dbs_arch::gic::its::ItsType::{self, PciMsiIts, PlatformMsiIts}; +use dbs_arch::gic::GICDevice; +use dbs_arch::{pmu::VIRTUAL_PMU_IRQ, VpmuFeatureLevel}; +use dbs_arch::{DeviceInfoForFDT, DeviceType}; + +use vm_fdt::FdtWriter; +use vm_memory::GuestMemoryRegion; +use vm_memory::{Address, Bytes, GuestAddress, GuestMemory}; + +use super::fdt_utils::*; +use super::Error; +use crate::Result; + +// This is a value for uniquely identifying the FDT node declaring the interrupt controller. +const GIC_PHANDLE: u32 = 1; +// This is a value for uniquely identifying the FDT node containing the clock definition. +const CLOCK_PHANDLE: u32 = 2; +// This is a value for uniquely identifying the FDT node containing the plaform msi ITS definition. +const GIC_PLATFORM_MSI_ITS_PHANDLE: u32 = 3; +// This is a value for uniquely identifying the FDT node containing the pci msi ITS definition. +const GIC_PCI_MSI_ITS_PHANDLE: u32 = 4; +// According to the arm, gic-v3.txt document, ITS' #msi-cells is fixed at 1. 
+const GIC_PLATFORM_MSI_ITS_CELLS_SIZE: u32 = 1; + +// Read the documentation specified when appending the root node to the FDT. +const ADDRESS_CELLS: u32 = 0x2; +const SIZE_CELLS: u32 = 0x2; + +// As per kvm tool and +// https://www.kernel.org/doc/Documentation/devicetree/bindings/interrupt-controller/arm%2Cgic.txt +// Look for "The 1st cell..." +const GIC_FDT_IRQ_TYPE_SPI: u32 = 0; +const GIC_FDT_IRQ_TYPE_PPI: u32 = 1; + +// From https://elixir.bootlin.com/linux/v4.9.62/source/include/dt-bindings/interrupt-controller/irq.h#L17 +const IRQ_TYPE_EDGE_RISING: u32 = 1; +const IRQ_TYPE_LEVEL_HI: u32 = 4; + +/// Creates the flattened device tree for this aarch64 microVM. +pub fn create_fdt( + fdt_vm_info: FdtVmInfo, + _fdt_numa_info: FdtNumaInfo, + fdt_device_info: FdtDeviceInfo, +) -> Result> +where + T: DeviceInfoForFDT + Clone + Debug, +{ + let mut fdt = FdtWriter::new()?; + + // For an explanation why these nodes were introduced in the blob take a look at + // https://github.com/torvalds/linux/blob/master/Documentation/devicetree/booting-without-of.txt#L845 + // Look for "Required nodes and properties". + + // Header or the root node as per above mentioned documentation. + let root_node = fdt.begin_node("")?; + fdt.property_string("compatible", "linux,dummy-virt")?; + // For info on #address-cells and size-cells read "Note about cells and address representation" + // from the above mentioned txt file. + fdt.property_u32("#address-cells", ADDRESS_CELLS)?; + fdt.property_u32("#size-cells", SIZE_CELLS)?; + // This is not mandatory but we use it to point the root node to the node + // containing description of the interrupt controller for this VM. 
+ fdt.property_u32("interrupt-parent", GIC_PHANDLE)?; + create_cpu_nodes(&mut fdt, &fdt_vm_info)?; + create_memory_node(&mut fdt, fdt_vm_info.get_guest_memory())?; + create_chosen_node(&mut fdt, &fdt_vm_info)?; + create_gic_node(&mut fdt, fdt_device_info.get_irqchip())?; + create_timer_node(&mut fdt)?; + create_clock_node(&mut fdt)?; + create_psci_node(&mut fdt)?; + fdt_device_info + .get_mmio_device_info() + .map_or(Ok(()), |v| create_devices_node(&mut fdt, v))?; + create_pmu_node(&mut fdt, fdt_vm_info.get_vpmu_feature())?; + + // End Header node. + fdt.end_node(root_node)?; + + // Allocate another buffer so we can format and then write fdt to guest. + let fdt_final = fdt.finish()?; + + // Write FDT to memory. + let fdt_address = GuestAddress(super::get_fdt_addr(fdt_vm_info.get_guest_memory())); + fdt_vm_info + .get_guest_memory() + .write_slice(fdt_final.as_slice(), fdt_address)?; + Ok(fdt_final) +} + +// Following are the auxiliary function for creating the different nodes that we append to our FDT. +fn create_cpu_nodes(fdt: &mut FdtWriter, fdt_vm_info: &FdtVmInfo) -> Result<()> { + // See https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/arm/cpus.yaml. + let cpus_node = fdt.begin_node("cpus")?; + // As per documentation, on ARM v8 64-bit systems value should be set to 2. + fdt.property_u32("#address-cells", 0x02)?; + fdt.property_u32("#size-cells", 0x0)?; + let vcpu_mpidr = fdt_vm_info.get_vcpu_mpidr(); + let vcpu_boot_onlined = fdt_vm_info.get_boot_onlined(); + let num_cpus = vcpu_mpidr.len(); + + for (cpu_index, mpidr) in vcpu_mpidr.iter().enumerate().take(num_cpus) { + let cpu_name = format!("cpu@{cpu_index:x}"); + let cpu_node = fdt.begin_node(&cpu_name)?; + fdt.property_string("device_type", "cpu")?; + fdt.property_string("compatible", "arm,arm-v8")?; + if num_cpus > 1 { + // This is required on armv8 64-bit. See aforementioned documentation. 
+ fdt.property_string("enable-method", "psci")?; + } + // boot-onlined attribute is used to indicate whether this cpu should be onlined at boot. + // 0 means offline, 1 means online. + fdt.property_u32("boot-onlined", vcpu_boot_onlined[cpu_index])?; + // Set the field to first 24 bits of the MPIDR - Multiprocessor Affinity Register. + // See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0488c/BABHBJCI.html. + fdt.property_u64("reg", mpidr & 0x7FFFFF)?; + fdt.end_node(cpu_node)?; + } + fdt.end_node(cpus_node)?; + Ok(()) +} + +fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &M) -> Result<()> { + // See https://github.com/torvalds/linux/blob/v5.9/Documentation/devicetree/booting-without-of.rst + for region in guest_mem.iter() { + let memory_name = format!("memory@{:x}", region.start_addr().raw_value()); + let mem_reg_prop = &[region.start_addr().raw_value(), region.len()]; + let memory_node = fdt.begin_node(&memory_name)?; + fdt.property_string("device_type", "memory")?; + fdt.property_array_u64("reg", mem_reg_prop)?; + fdt.end_node(memory_node)?; + } + Ok(()) +} + +fn create_chosen_node(fdt: &mut FdtWriter, fdt_vm_info: &FdtVmInfo) -> Result<()> { + let chosen_node = fdt.begin_node("chosen")?; + fdt.property_string("bootargs", fdt_vm_info.get_cmdline())?; + + if let Some(initrd_config) = fdt_vm_info.get_initrd_config() { + fdt.property_u64("linux,initrd-start", initrd_config.address.raw_value())?; + fdt.property_u64( + "linux,initrd-end", + initrd_config.address.raw_value() + initrd_config.size as u64, + )?; + } + + fdt.end_node(chosen_node)?; + + Ok(()) +} + +fn append_its_common_property(fdt: &mut FdtWriter, registers_prop: &[u64]) -> Result<()> { + fdt.property_string("compatible", "arm,gic-v3-its")?; + fdt.property_null("msi-controller")?; + fdt.property_array_u64("reg", registers_prop)?; + Ok(()) +} + +fn create_its_node( + fdt: &mut FdtWriter, + gic_device: &dyn GICDevice, + its_type: ItsType, +) -> Result<()> { + let reg = 
gic_device.get_its_reg_range(&its_type); + if let Some(registers) = reg { + // There are two types of its, pci_msi_its and platform_msi_its. + // If this is pci_msi_its, the fdt node of its is required to have no + // #msi-cells attribute. If this is platform_msi_its, the #msi-cells + // attribute of its fdt node is required, and the value is 1. + match its_type { + PlatformMsiIts => { + let its_node = fdt.begin_node("gic-platform-its")?; + append_its_common_property(fdt, ®isters)?; + fdt.property_u32("phandle", GIC_PLATFORM_MSI_ITS_PHANDLE)?; + fdt.property_u32("#msi-cells", GIC_PLATFORM_MSI_ITS_CELLS_SIZE)?; + fdt.end_node(its_node)?; + } + PciMsiIts => { + let its_node = fdt.begin_node("gic-pci-its")?; + append_its_common_property(fdt, ®isters)?; + fdt.property_u32("phandle", GIC_PCI_MSI_ITS_PHANDLE)?; + fdt.end_node(its_node)?; + } + } + } + Ok(()) +} + +fn create_gic_node(fdt: &mut FdtWriter, gic_device: &dyn GICDevice) -> Result<()> { + let gic_reg_prop = gic_device.device_properties(); + + let intc_node = fdt.begin_node("intc")?; + fdt.property_string("compatible", gic_device.fdt_compatibility())?; + fdt.property_null("interrupt-controller")?; + // "interrupt-cells" field specifies the number of cells needed to encode an + // interrupt source. The type shall be a and the value shall be 3 if no PPI affinity description + // is required. 
+ fdt.property_u32("#interrupt-cells", 3)?; + fdt.property_array_u64("reg", gic_reg_prop)?; + fdt.property_u32("phandle", GIC_PHANDLE)?; + fdt.property_u32("#address-cells", 2)?; + fdt.property_u32("#size-cells", 2)?; + fdt.property_null("ranges")?; + let gic_intr_prop = &[ + GIC_FDT_IRQ_TYPE_PPI, + gic_device.fdt_maint_irq(), + IRQ_TYPE_LEVEL_HI, + ]; + + fdt.property_array_u32("interrupts", gic_intr_prop)?; + create_its_node(fdt, gic_device, PlatformMsiIts)?; + create_its_node(fdt, gic_device, PciMsiIts)?; + fdt.end_node(intc_node)?; + + Ok(()) +} + +fn create_clock_node(fdt: &mut FdtWriter) -> Result<()> { + // The Advanced Peripheral Bus (APB) is part of the Advanced Microcontroller Bus Architecture + // (AMBA) protocol family. It defines a low-cost interface that is optimized for minimal power + // consumption and reduced interface complexity. + // PCLK is the clock source and this node defines exactly the clock for the APB. + let clock_node = fdt.begin_node("apb-pclk")?; + fdt.property_string("compatible", "fixed-clock")?; + fdt.property_u32("#clock-cells", 0x0)?; + fdt.property_u32("clock-frequency", 24000000)?; + fdt.property_string("clock-output-names", "clk24mhz")?; + fdt.property_u32("phandle", CLOCK_PHANDLE)?; + fdt.end_node(clock_node)?; + + Ok(()) +} + +fn create_timer_node(fdt: &mut FdtWriter) -> Result<()> { + // See + // https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/interrupt-controller/arch_timer.txt + // These are fixed interrupt numbers for the timer device. 
+ let irqs = [13, 14, 11, 10]; + let compatible = "arm,armv8-timer"; + + let mut timer_reg_cells: Vec = Vec::new(); + for &irq in irqs.iter() { + timer_reg_cells.push(GIC_FDT_IRQ_TYPE_PPI); + timer_reg_cells.push(irq); + timer_reg_cells.push(IRQ_TYPE_LEVEL_HI); + } + + let timer_node = fdt.begin_node("timer")?; + fdt.property_string("compatible", compatible)?; + fdt.property_null("always-on")?; + fdt.property_array_u32("interrupts", &timer_reg_cells)?; + fdt.end_node(timer_node)?; + + Ok(()) +} + +fn create_psci_node(fdt: &mut FdtWriter) -> Result<()> { + let compatible = "arm,psci-0.2"; + let psci_node = fdt.begin_node("psci")?; + fdt.property_string("compatible", compatible)?; + // Two methods available: hvc and smc. + // As per documentation, PSCI calls between a guest and hypervisor may use the HVC conduit instead of SMC. + // So, since we are using kvm, we need to use hvc. + fdt.property_string("method", "hvc")?; + fdt.end_node(psci_node)?; + + Ok(()) +} + +fn create_virtio_node( + fdt: &mut FdtWriter, + dev_info: &T, +) -> Result<()> { + let device_reg_prop = &[dev_info.addr(), dev_info.length()]; + let irq_number = dev_info.irq().map_err(|_| Error::InvalidArguments)?; + let irq_property = &[GIC_FDT_IRQ_TYPE_SPI, irq_number, IRQ_TYPE_EDGE_RISING]; + + let virtio_mmio_node = fdt.begin_node(&format!("virtio_mmio@{:x}", dev_info.addr()))?; + fdt.property_string("compatible", "virtio,mmio")?; + fdt.property_array_u64("reg", device_reg_prop)?; + fdt.property_array_u32("interrupts", irq_property)?; + fdt.property_u32("interrupt-parent", GIC_PHANDLE)?; + fdt.end_node(virtio_mmio_node)?; + + Ok(()) +} + +fn create_serial_node( + fdt: &mut FdtWriter, + dev_info: &T, +) -> Result<()> { + let serial_reg_prop = &[dev_info.addr(), dev_info.length()]; + let irq_number = dev_info.irq().map_err(|_| Error::InvalidArguments)?; + let irq_property = &[GIC_FDT_IRQ_TYPE_SPI, irq_number, IRQ_TYPE_EDGE_RISING]; + + let uart_node = fdt.begin_node(&format!("uart@{:x}", 
dev_info.addr()))?; + fdt.property_string("compatible", "ns16550a")?; + fdt.property_array_u64("reg", serial_reg_prop)?; + fdt.property_u32("clocks", CLOCK_PHANDLE)?; + fdt.property_string("clock-names", "apb_pclk")?; + fdt.property_array_u32("interrupts", irq_property)?; + fdt.end_node(uart_node)?; + + Ok(()) +} + +fn create_rtc_node( + fdt: &mut FdtWriter, + dev_info: &T, +) -> Result<()> { + let compatible = b"arm,pl031\0arm,primecell\0"; + let rtc_reg_prop = &[dev_info.addr(), dev_info.length()]; + let irq_number = dev_info.irq().map_err(|_| Error::InvalidArguments)?; + let irq_property = &[GIC_FDT_IRQ_TYPE_SPI, irq_number, IRQ_TYPE_LEVEL_HI]; + + let rtc_node = fdt.begin_node(&format!("rtc@{:x}", dev_info.addr()))?; + fdt.property("compatible", compatible)?; + fdt.property_array_u64("reg", rtc_reg_prop)?; + fdt.property_array_u32("interrupts", irq_property)?; + fdt.property_u32("clocks", CLOCK_PHANDLE)?; + fdt.property_string("clock-names", "apb_pclk")?; + fdt.end_node(rtc_node)?; + + Ok(()) +} + +fn create_devices_node( + fdt: &mut FdtWriter, + dev_info: &HashMap<(DeviceType, String), T>, +) -> Result<()> { + // Serial devices need to be registered in order + let mut ordered_serial_device: Vec<&T> = Vec::new(); + // Create one temp Vec to store all virtio devices + let mut ordered_virtio_device: Vec<&T> = Vec::new(); + + for ((device_type, _device_id), info) in dev_info { + match device_type { + DeviceType::RTC => create_rtc_node(fdt, info)?, + DeviceType::Serial => { + ordered_serial_device.push(info); + } + DeviceType::Virtio(_) => { + ordered_virtio_device.push(info); + } + } + } + + // Sort out serial devices by address from low to high and insert them into fdt table. + ordered_serial_device.sort_by_key(|a| a.addr()); + for serial_device_info in ordered_serial_device.drain(..) { + create_serial_node(fdt, serial_device_info)?; + } + // Sort out virtio devices by address from low to high and insert them into fdt table. 
+ ordered_virtio_device.sort_by_key(|a| a.addr()); + for ordered_device_info in ordered_virtio_device.drain(..) { + create_virtio_node(fdt, ordered_device_info)?; + } + + Ok(()) +} + +fn create_pmu_node(fdt: &mut FdtWriter, vpmu_feature: VpmuFeatureLevel) -> Result<()> { + if vpmu_feature == VpmuFeatureLevel::Disabled { + return Ok(()); + }; + + let pmu_node = fdt.begin_node("pmu")?; + fdt.property_string("compatible", "arm,armv8-pmuv3")?; + let pmu_intr_prop = [GIC_FDT_IRQ_TYPE_PPI, VIRTUAL_PMU_IRQ, IRQ_TYPE_LEVEL_HI]; + fdt.property_array_u32("interrupts", &pmu_intr_prop)?; + fdt.end_node(pmu_node)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::cmp::min; + use std::collections::HashMap; + use std::env; + use std::fs::OpenOptions; + use std::io::Write; + use std::path::PathBuf; + + use dbs_arch::{gic::create_gic, pmu::initialize_pmu}; + use device_tree::DeviceTree; + use kvm_bindings::{kvm_vcpu_init, KVM_ARM_VCPU_PMU_V3, KVM_ARM_VCPU_PSCI_0_2}; + use kvm_ioctls::{Kvm, VcpuFd, VmFd}; + use vm_memory::GuestMemoryMmap; + + use super::super::tests::MMIODeviceInfo; + use super::*; + use crate::layout::{DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FDT_MAX_SIZE}; + use crate::InitrdConfig; + + const LEN: u64 = 4096; + + fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { + let dram_size = min(size as u64, DRAM_MEM_MAX_SIZE) as usize; + vec![(GuestAddress(DRAM_MEM_START), dram_size)] + } + + // The `load` function from the `device_tree` will mistakenly check the actual size + // of the buffer with the allocated size. This works around that. 
+ fn set_size(buf: &mut [u8], pos: usize, val: usize) { + buf[pos] = ((val >> 24) & 0xff) as u8; + buf[pos + 1] = ((val >> 16) & 0xff) as u8; + buf[pos + 2] = ((val >> 8) & 0xff) as u8; + buf[pos + 3] = (val & 0xff) as u8; + } + + // Initialize vcpu for pmu test + fn initialize_vcpu_with_pmu(vm: &VmFd, vcpu: &VcpuFd) -> Result<()> { + let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default(); + vm.get_preferred_target(&mut kvi) + .expect("Cannot get preferred target"); + kvi.features[0] = 1 << KVM_ARM_VCPU_PSCI_0_2 | 1 << KVM_ARM_VCPU_PMU_V3; + vcpu.vcpu_init(&kvi).map_err(|_| Error::InvalidArguments)?; + initialize_pmu(vm, vcpu).map_err(|_| Error::InvalidArguments)?; + + Ok(()) + } + + // Create fdt dtb file + fn create_dtb_file(name: &str, dtb: &[u8]) { + // Control whether to create new dtb files for unit test. + // Usage: FDT_CREATE_DTB=1 cargo test + if env::var("FDT_CREATE_DTB").is_err() { + return; + } + + // Use this code when wanting to generate a new DTB sample. + // Do manually check dtb files with dtc + // See https://git.kernel.org/pub/scm/utils/dtc/dtc.git/plain/Documentation/manual.txt + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let mut output = OpenOptions::new() + .write(true) + .create(true) + .open(path.join(format!("src/aarch64/test/{name}"))) + .unwrap(); + output + .set_len(FDT_MAX_SIZE as u64) + .map_err(|_| Error::InvalidArguments) + .unwrap(); + output.write_all(dtb).unwrap(); + } + + #[test] + fn test_create_fdt_with_devices() { + let regions = arch_memory_regions(FDT_MAX_SIZE + 0x1000); + let mem = GuestMemoryMmap::<()>::from_ranges(®ions).expect("Cannot initialize memory"); + let dev_info: HashMap<(DeviceType, String), MMIODeviceInfo> = [ + ( + (DeviceType::Serial, DeviceType::Serial.to_string()), + MMIODeviceInfo::new(0, 1), + ), + ( + (DeviceType::Virtio(1), "virtio".to_string()), + MMIODeviceInfo::new(LEN, 2), + ), + ( + (DeviceType::RTC, "rtc".to_string()), + MMIODeviceInfo::new(2 * LEN, 3), + ), + ] + .iter() + .cloned() + 
.collect(); + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let gic = create_gic(&vm, 1).unwrap(); + let vpmu_feature = VpmuFeatureLevel::Disabled; + assert!(create_fdt( + FdtVmInfo::new( + &mem, + "console=tty0", + None, + FdtVcpuInfo::new(vec![0], vec![1], vpmu_feature, false) + ), + FdtNumaInfo::default(), + FdtDeviceInfo::new(Some(&dev_info), gic.as_ref()) + ) + .is_ok()) + } + + #[test] + fn test_create_fdt() { + let regions = arch_memory_regions(FDT_MAX_SIZE + 0x1000); + let mem = GuestMemoryMmap::<()>::from_ranges(®ions).expect("Cannot initialize memory"); + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let gic = create_gic(&vm, 1).unwrap(); + let vpmu_feature = VpmuFeatureLevel::Disabled; + let dtb = create_fdt( + FdtVmInfo::new( + &mem, + "console=tty0", + None, + FdtVcpuInfo::new(vec![0], vec![1], vpmu_feature, false), + ), + FdtNumaInfo::default(), + FdtDeviceInfo::::new(None, gic.as_ref()), + ) + .unwrap(); + + create_dtb_file("output.dtb", &dtb); + + let bytes = include_bytes!("test/output.dtb"); + let pos = 4; + let val = FDT_MAX_SIZE; + let mut buf = vec![]; + buf.extend_from_slice(bytes); + set_size(&mut buf, pos, val); + + let original_fdt = DeviceTree::load(&buf).unwrap(); + let generated_fdt = DeviceTree::load(&dtb).unwrap(); + assert_eq!(format!("{original_fdt:?}"), format!("{generated_fdt:?}")); + } + + #[test] + fn test_create_fdt_with_initrd() { + let regions = arch_memory_regions(FDT_MAX_SIZE + 0x1000); + let mem = GuestMemoryMmap::<()>::from_ranges(®ions).expect("Cannot initialize memory"); + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let gic = create_gic(&vm, 1).unwrap(); + let initrd = InitrdConfig { + address: GuestAddress(0x10000000), + size: 0x1000, + }; + let vpmu_feature = VpmuFeatureLevel::Disabled; + let dtb = create_fdt( + FdtVmInfo::new( + &mem, + "console=tty0", + Some(&initrd), + FdtVcpuInfo::new(vec![0], vec![1], vpmu_feature, false), + ), + 
FdtNumaInfo::default(), + FdtDeviceInfo::::new(None, gic.as_ref()), + ) + .unwrap(); + + create_dtb_file("output_with_initrd.dtb", &dtb); + + let bytes = include_bytes!("test/output_with_initrd.dtb"); + let pos = 4; + let val = FDT_MAX_SIZE; + let mut buf = vec![]; + buf.extend_from_slice(bytes); + set_size(&mut buf, pos, val); + + let original_fdt = DeviceTree::load(&buf).unwrap(); + let generated_fdt = DeviceTree::load(&dtb).unwrap(); + assert_eq!(format!("{original_fdt:?}"), format!("{generated_fdt:?}")); + } + + #[test] + fn test_create_fdt_with_pmu() { + let regions = arch_memory_regions(FDT_MAX_SIZE + 0x1000); + let mem = GuestMemoryMmap::<()>::from_ranges(®ions).expect("Cannot initialize memory"); + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + let gic = create_gic(&vm, 1).unwrap(); + + assert!(initialize_vcpu_with_pmu(&vm, &vcpu).is_ok()); + + let vpmu_feature = VpmuFeatureLevel::FullyEnabled; + let dtb = create_fdt( + FdtVmInfo::new( + &mem, + "console=tty0", + None, + FdtVcpuInfo::new(vec![0], vec![1], vpmu_feature, false), + ), + FdtNumaInfo::default(), + FdtDeviceInfo::::new(None, gic.as_ref()), + ) + .unwrap(); + + create_dtb_file("output_with_pmu.dtb", &dtb); + + let bytes = include_bytes!("test/output_with_pmu.dtb"); + let pos = 4; + let val = FDT_MAX_SIZE; + let mut buf = vec![]; + buf.extend_from_slice(bytes); + set_size(&mut buf, pos, val); + + let original_fdt = DeviceTree::load(&buf).unwrap(); + let generated_fdt = DeviceTree::load(&dtb).unwrap(); + assert_eq!(format!("{original_fdt:?}"), format!("{generated_fdt:?}")); + } +} diff --git a/src/dragonball/src/dbs_boot/src/aarch64/fdt_utils.rs b/src/dragonball/src/dbs_boot/src/aarch64/fdt_utils.rs new file mode 100644 index 000000000..ceb6d8e79 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/aarch64/fdt_utils.rs @@ -0,0 +1,373 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! 
This module abstract some structs for constructing fdt. Instead of using +//! multiple parameters. + +use std::collections::HashMap; + +use dbs_arch::{gic::GICDevice, DeviceInfoForFDT, DeviceType, VpmuFeatureLevel}; +use vm_memory::mmap::GuestMemoryMmap; + +use crate::InitrdConfig; + +/// Struct to save vcpu information +pub struct FdtVcpuInfo { + /// vcpu mpidrs + vcpu_mpidr: Vec, + /// vcpu boot-onlined + vcpu_boot_onlined: Vec, + /// vpmu feature + vpmu_feature: VpmuFeatureLevel, + // TODO: #274 cache passthrough + /// cache passthrough + cache_passthrough_enabled: bool, +} + +impl FdtVcpuInfo { + /// Generate FdtVcpuInfo + pub fn new( + vcpu_mpidr: Vec, + vcpu_boot_onlined: Vec, + vpmu_feature: VpmuFeatureLevel, + cache_passthrough_enabled: bool, + ) -> Self { + FdtVcpuInfo { + vcpu_mpidr, + vcpu_boot_onlined, + vpmu_feature, + cache_passthrough_enabled, + } + } +} + +/// Struct to save vm information. +pub struct FdtVmInfo<'a> { + /// guest meory + guest_memory: &'a GuestMemoryMmap, + /// command line + cmdline: &'a str, + /// initrd config + initrd_config: Option<&'a InitrdConfig>, + /// vcpu information + vcpu_info: FdtVcpuInfo, +} + +impl FdtVmInfo<'_> { + /// Generate FdtVmInfo. + pub fn new<'a>( + guest_memory: &'a GuestMemoryMmap, + cmdline: &'a str, + initrd_config: Option<&'a InitrdConfig>, + vcpu_info: FdtVcpuInfo, + ) -> FdtVmInfo<'a> { + FdtVmInfo { + guest_memory, + cmdline, + initrd_config, + vcpu_info, + } + } + + /// Get guest_memory. + pub fn get_guest_memory(&self) -> &GuestMemoryMmap { + self.guest_memory + } + + /// Get cmdline. + pub fn get_cmdline(&self) -> &str { + self.cmdline + } + + /// Get initrd_config. + pub fn get_initrd_config(&self) -> Option<&InitrdConfig> { + self.initrd_config + } + + /// Get vcpu_mpidr. + pub fn get_vcpu_mpidr(&self) -> &[u64] { + self.vcpu_info.vcpu_mpidr.as_slice() + } + + /// Get vpmu_feature. 
+ pub fn get_boot_onlined(&self) -> &[u32] { + self.vcpu_info.vcpu_boot_onlined.as_slice() + } + + /// Get vpmu_feature. + pub fn get_vpmu_feature(&self) -> VpmuFeatureLevel { + self.vcpu_info.vpmu_feature + } + + /// Get cache_passthrough_enabled. + pub fn get_cache_passthrough_enabled(&self) -> bool { + self.vcpu_info.cache_passthrough_enabled + } +} + +// This struct is used for cache passthrough and numa passthrough +// TODO: #274 cache passthrough +// TODO: #275 numa passthrough +/// Struct to save numa information. +#[derive(Default)] +pub struct FdtNumaInfo { + /// vcpu -> pcpu maps + cpu_maps: Option>, + /// numa id map vector for memory + memory_numa_id_map: Option>, + /// numa id map vector for vcpu + vcpu_numa_id_map: Option>, +} + +impl FdtNumaInfo { + /// Generate FdtNumaInfo. + pub fn new( + cpu_maps: Option>, + memory_numa_id_map: Option>, + vcpu_numa_id_map: Option>, + ) -> Self { + FdtNumaInfo { + cpu_maps, + memory_numa_id_map, + vcpu_numa_id_map, + } + } + + /// Get cpu_maps struct. + pub fn get_cpu_maps(&self) -> Option> { + self.cpu_maps.clone() + } + + /// Get memory_numa_id_map struct. + pub fn get_memory_numa_id_map(&self) -> Option<&Vec> { + self.memory_numa_id_map.as_ref() + } + + /// Get vcpu_numa_id_map struct. + pub fn get_vcpu_numa_id_map(&self) -> Option<&Vec> { + self.vcpu_numa_id_map.as_ref() + } +} + +/// Struct to save device information. +pub struct FdtDeviceInfo<'a, T: DeviceInfoForFDT> { + /// mmio device information + mmio_device_info: Option<&'a HashMap<(DeviceType, String), T>>, + /// interrupt controller + irq_chip: &'a dyn GICDevice, +} + +impl FdtDeviceInfo<'_, T> { + /// Generate FdtDeviceInfo. + pub fn new<'a>( + mmio_device_info: Option<&'a HashMap<(DeviceType, String), T>>, + irq_chip: &'a dyn GICDevice, + ) -> FdtDeviceInfo<'a, T> { + FdtDeviceInfo { + mmio_device_info, + irq_chip, + } + } + + /// Get mmio device information. 
+ pub fn get_mmio_device_info(&self) -> Option<&HashMap<(DeviceType, String), T>> { + self.mmio_device_info + } + + /// Get interrupt controller. + pub fn get_irqchip(&self) -> &dyn GICDevice { + self.irq_chip + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use dbs_arch::gic::create_gic; + use vm_memory::{GuestAddress, GuestMemory}; + + const CMDLINE: &str = "console=tty0"; + const INITRD_CONFIG: InitrdConfig = InitrdConfig { + address: GuestAddress(0x10000000), + size: 0x1000, + }; + const VCPU_MPIDR: [u64; 1] = [0]; + const VCPU_BOOT_ONLINED: [u32; 1] = [1]; + const VPMU_FEATURE: VpmuFeatureLevel = VpmuFeatureLevel::Disabled; + const CACHE_PASSTHROUGH_ENABLED: bool = false; + + #[inline] + fn helper_generate_fdt_vm_info(guest_memory: &GuestMemoryMmap) -> FdtVmInfo<'_> { + FdtVmInfo::new( + guest_memory, + CMDLINE, + Some(&INITRD_CONFIG), + FdtVcpuInfo::new( + VCPU_MPIDR.to_vec(), + VCPU_BOOT_ONLINED.to_vec(), + VPMU_FEATURE, + CACHE_PASSTHROUGH_ENABLED, + ), + ) + } + + #[test] + fn test_fdtutils_fdt_vm_info() { + let ranges = vec![(GuestAddress(0x80000000), 0x40000)]; + let guest_memory: GuestMemoryMmap<()> = + GuestMemoryMmap::<()>::from_ranges(ranges.as_slice()) + .expect("Cannot initialize memory"); + let vm_info = helper_generate_fdt_vm_info(&guest_memory); + + assert_eq!( + guest_memory.check_address(GuestAddress(0x80001000)), + Some(GuestAddress(0x80001000)) + ); + assert_eq!(guest_memory.check_address(GuestAddress(0x80050000)), None); + assert!(guest_memory.check_range(GuestAddress(0x80000000), 0x40000)); + assert_eq!(vm_info.get_cmdline(), CMDLINE); + assert_eq!( + vm_info.get_initrd_config().unwrap().address, + INITRD_CONFIG.address + ); + assert_eq!( + vm_info.get_initrd_config().unwrap().size, + INITRD_CONFIG.size + ); + assert_eq!(vm_info.get_vcpu_mpidr(), VCPU_MPIDR.as_slice()); + assert_eq!(vm_info.get_boot_onlined(), VCPU_BOOT_ONLINED.as_slice()); + assert_eq!(vm_info.get_vpmu_feature(), VPMU_FEATURE); + assert_eq!( + 
vm_info.get_cache_passthrough_enabled(), + CACHE_PASSTHROUGH_ENABLED + ); + } + + const CPU_MAPS: [u8; 5] = [1, 2, 3, 4, 5]; + const MEMORY_VEC: [u32; 2] = [0, 1]; + const CPU_VEC: [u32; 5] = [0, 0, 0, 1, 1]; + + #[inline] + fn helper_generate_fdt_numa_info() -> FdtNumaInfo { + FdtNumaInfo::new( + Some(CPU_MAPS.to_vec()), + Some(MEMORY_VEC.to_vec()), + Some(CPU_VEC.to_vec()), + ) + } + + #[test] + fn test_fdtutils_fdt_numa_info() { + // test default + let numa_info = FdtNumaInfo::default(); + assert_eq!(numa_info.get_cpu_maps(), None); + assert_eq!(numa_info.get_memory_numa_id_map(), None); + assert_eq!(numa_info.get_vcpu_numa_id_map(), None); + + let numa_info = helper_generate_fdt_numa_info(); + assert_eq!( + numa_info.get_cpu_maps().unwrap().as_slice(), + CPU_MAPS.as_slice() + ); + assert_eq!( + numa_info.get_memory_numa_id_map().unwrap().as_slice(), + MEMORY_VEC.as_slice() + ); + assert_eq!( + numa_info.get_vcpu_numa_id_map().unwrap().as_slice(), + CPU_VEC.as_slice() + ); + } + + use dbs_arch::gic::its::ItsType; + use dbs_device::resources::{DeviceResources, Resource}; + use kvm_ioctls::Kvm; + + use super::super::tests::MMIODeviceInfo; + + const MEMORY_SIZE: u64 = 4096; + const ECAM_SPACE: [Resource; 1] = [Resource::MmioAddressRange { + base: 0x40000000, + size: 0x1000, + }]; + const BAR_SPACE: [Resource; 2] = [ + Resource::MmioAddressRange { + base: 0x40001000, + size: 0x1000, + }, + Resource::MmioAddressRange { + base: 0x40002000, + size: 0x1000, + }, + ]; + + #[test] + fn test_fdtutils_fdt_device_info() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let gic = create_gic(&vm, 0).unwrap(); + let mmio_device_info: Option> = Some( + [ + ( + (DeviceType::Serial, DeviceType::Serial.to_string()), + MMIODeviceInfo::new(0, 1), + ), + ( + (DeviceType::Virtio(1), "virtio".to_string()), + MMIODeviceInfo::new(MEMORY_SIZE, 2), + ), + ( + (DeviceType::RTC, "rtc".to_string()), + MMIODeviceInfo::new(2 * MEMORY_SIZE, 3), + ), + ] + .iter() + 
.cloned() + .collect(), + ); + let mut ecam_space = DeviceResources::new(); + ecam_space.append(ECAM_SPACE.as_slice()[0].clone()); + + let mut bar_space = DeviceResources::new(); + bar_space.append(BAR_SPACE.as_slice()[0].clone()); + bar_space.append(BAR_SPACE.as_slice()[1].clone()); + + let its_type1 = ItsType::PciMsiIts; + let its_type2 = ItsType::PlatformMsiIts; + + let device_info = FdtDeviceInfo::new(mmio_device_info.as_ref(), gic.as_ref()); + assert_eq!( + device_info.get_mmio_device_info(), + mmio_device_info.as_ref() + ); + assert_eq!( + format!("{:?}", device_info.get_irqchip().device_fd()), + format!("{:?}", gic.as_ref().device_fd()) + ); + assert_eq!( + device_info.get_irqchip().device_properties(), + gic.as_ref().device_properties() + ); + assert_eq!( + device_info.get_irqchip().fdt_compatibility(), + gic.as_ref().fdt_compatibility() + ); + assert_eq!( + device_info.get_irqchip().fdt_maint_irq(), + gic.as_ref().fdt_maint_irq() + ); + assert_eq!( + device_info.get_irqchip().vcpu_count(), + gic.as_ref().vcpu_count() + ); + assert_eq!( + device_info.get_irqchip().get_its_reg_range(&its_type1), + gic.as_ref().get_its_reg_range(&its_type1) + ); + assert_eq!( + device_info.get_irqchip().get_its_reg_range(&its_type2), + gic.as_ref().get_its_reg_range(&its_type2) + ); + } +} diff --git a/src/dragonball/src/dbs_boot/src/aarch64/layout.rs b/src/dragonball/src/dbs_boot/src/aarch64/layout.rs new file mode 100644 index 000000000..6bc98d55f --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/aarch64/layout.rs @@ -0,0 +1,94 @@ +// Copyright 2021-2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 +// ==== Address map in use in ARM development systems today ==== +// +// - 32-bit - - 36-bit - - 40-bit - +//1024GB + + +-------------------+ <- 40-bit +// | | DRAM | +// ~ ~ ~ ~ +// | | | +// | | | +// | | | +// | | | +//544GB + + +-------------------+ +// | | Hole or DRAM | +// | | | +//512GB + + +-------------------+ +// | | Mapped | +// | | I/O | +// ~ ~ ~ ~ +// | | | +//256GB + + +-------------------+ +// | | Reserved | +// ~ ~ ~ ~ +// | | | +//64GB + +-----------------------+-------------------+ <- 36-bit +// | | DRAM | +// ~ ~ ~ ~ +// | | | +// | | | +//34GB + +-----------------------+-------------------+ +// | | Hole or DRAM | +//32GB + +-----------------------+-------------------+ +// | | Mapped I/O | +// ~ ~ ~ ~ +// | | | +//16GB + +-----------------------+-------------------+ +// | | Reserved | +// ~ ~ ~ ~ +//4GB +-------------------+-----------------------+-------------------+ <- 32-bit +// | 2GB of DRAM | +// | | +//2GB +-------------------+-----------------------+-------------------+ +// | Mapped I/O | +//1GB +-------------------+-----------------------+-------------------+ +// | ROM & RAM & I/O | +//0GB +-------------------+-----------------------+-------------------+ 0 +// - 32-bit - - 36-bit - - 40-bit - +// +// Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). + +/// Start of RAM on 64 bit ARM. +pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. +/// The maximum addressable RAM address. +pub const DRAM_MEM_END: u64 = 0x00F8_0000_0000; // 1024 - 32 = 992 GB. +/// The maximum RAM size. +pub const DRAM_MEM_MAX_SIZE: u64 = DRAM_MEM_END - DRAM_MEM_START; + +/// Kernel command line maximum size. +/// As per `arch/arm64/include/uapi/asm/setup.h`. +pub const CMDLINE_MAX_SIZE: usize = 2048; + +/// Maximum size of the device tree blob as specified in https://www.kernel.org/doc/Documentation/arm64/booting.txt. 
+pub const FDT_MAX_SIZE: usize = 0x20_0000; + +// As per virt/kvm/arm/vgic/vgic-kvm-device.c we need +// the number of interrupts our GIC will support to be: +// * bigger than 32 +// * less than 1023 and +// * a multiple of 32. +// We are setting up our interrupt controller to support a maximum of 128 interrupts. +/// First usable interrupt on aarch64. +pub const IRQ_BASE: u32 = dbs_arch::gic::IRQ_BASE; + +/// Last usable interrupt on aarch64. +pub const IRQ_MAX: u32 = dbs_arch::gic::IRQ_MAX; + +/// Below this address will reside the GIC, above this address will reside the MMIO devices. +pub const MAPPED_IO_START: u64 = dbs_arch::gic::GIC_REG_END_ADDRESS; // 1 GB +/// End address (inclusive) of the MMIO window. +pub const MAPPED_IO_END: u64 = (2 << 30) - 1; // 1 GB + +/// Maximum guest physical address supported. +pub static GUEST_PHYS_END: &u64 = &((1u64 << 40) - 1); +/// Upper bound of guest memory. +pub static GUEST_MEM_END: &u64 = &(DRAM_MEM_END - 1); +/// Lower bound of guest memory. +pub const GUEST_MEM_START: u64 = DRAM_MEM_START; +/// Start address of the lower MMIO window. +pub const MMIO_LOW_START: u64 = MAPPED_IO_START; +/// End address (inclusive) of the lower MMIO window. +pub const MMIO_LOW_END: u64 = MAPPED_IO_END; +/// Size of memory below MMIO hole. +pub const GUEST_MEM_LOW_SIZE: u64 = 0u64; diff --git a/src/dragonball/src/dbs_boot/src/aarch64/mod.rs b/src/dragonball/src/dbs_boot/src/aarch64/mod.rs new file mode 100644 index 000000000..c9aa5fdf6 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/aarch64/mod.rs @@ -0,0 +1,103 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! VM boot related constants and utilities for `aarch64` architecture. + +use vm_fdt::Error as VmFdtError; +use vm_memory::{Address, GuestAddress, GuestMemory, GuestMemoryError}; + +/// Magic addresses externally used to lay out aarch64 VMs. 
+pub mod layout; + +/// FDT is used to inform the guest kernel of device tree information. +pub mod fdt; + +/// Helper structs for constructing fdt. +pub mod fdt_utils; + +/// Default (smallest) memory page size for the supported architectures. +pub const PAGE_SIZE: usize = 4096; + +/// Errors thrown while configuring the Flattened Device Tree for aarch64. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Failure in creating FDT + #[error("create fdt fail: {0}")] + CreateFdt(#[from] VmFdtError), + /// Failure in writing FDT in memory. + #[error("write fdt to memory fail: {0}")] + WriteFDTToMemory(#[from] GuestMemoryError), + /// Failed to compute the initrd address. + #[error("Failed to compute the initrd address.")] + InitrdAddress, + /// Invalid arguments + #[error("invalid arguments")] + InvalidArguments, +} + +/// Returns the memory address where the kernel could be loaded. +pub fn get_kernel_start() -> u64 { + layout::DRAM_MEM_START +} + +/// Auxiliary function to get the address where the device tree blob is loaded. +pub fn get_fdt_addr(mem: &M) -> u64 { + // If the memory allocated is smaller than the size allocated for the FDT, + // we return the start of the DRAM so that + // we allow the code to try and load the FDT. + if let Some(offset) = mem.last_addr().checked_sub(layout::FDT_MAX_SIZE as u64 - 1) { + if mem.address_in_range(offset) { + return offset.raw_value(); + } + } + layout::DRAM_MEM_START +} + +/// Returns the memory address where the initrd could be loaded. 
+pub fn initrd_load_addr(guest_mem: &M, initrd_size: u64) -> super::Result { + let round_to_pagesize = |size| (size + (PAGE_SIZE as u64 - 1)) & !(PAGE_SIZE as u64 - 1); + match GuestAddress(get_fdt_addr(guest_mem)).checked_sub(round_to_pagesize(initrd_size)) { + Some(offset) => { + if guest_mem.address_in_range(offset) { + Ok(offset.raw_value()) + } else { + Err(Error::InitrdAddress) + } + } + None => Err(Error::InitrdAddress), + } +} + +#[cfg(test)] +pub mod tests { + use dbs_arch::{DeviceInfoForFDT, Error as ArchError}; + + const LEN: u64 = 4096; + + #[derive(Clone, Debug, PartialEq)] + pub struct MMIODeviceInfo { + addr: u64, + irq: u32, + } + + impl MMIODeviceInfo { + pub fn new(addr: u64, irq: u32) -> Self { + MMIODeviceInfo { addr, irq } + } + } + + impl DeviceInfoForFDT for MMIODeviceInfo { + fn addr(&self) -> u64 { + self.addr + } + fn irq(&self) -> std::result::Result { + Ok(self.irq) + } + fn length(&self) -> u64 { + LEN + } + fn get_device_id(&self) -> Option { + None + } + } +} diff --git a/src/dragonball/src/dbs_boot/src/aarch64/test/output.dtb b/src/dragonball/src/dbs_boot/src/aarch64/test/output.dtb new file mode 100644 index 000000000..8329528f3 Binary files /dev/null and b/src/dragonball/src/dbs_boot/src/aarch64/test/output.dtb differ diff --git a/src/dragonball/src/dbs_boot/src/aarch64/test/output_with_initrd.dtb b/src/dragonball/src/dbs_boot/src/aarch64/test/output_with_initrd.dtb new file mode 100644 index 000000000..6fe25cde1 Binary files /dev/null and b/src/dragonball/src/dbs_boot/src/aarch64/test/output_with_initrd.dtb differ diff --git a/src/dragonball/src/dbs_boot/src/aarch64/test/output_with_pmu.dtb b/src/dragonball/src/dbs_boot/src/aarch64/test/output_with_pmu.dtb new file mode 100644 index 000000000..16c554821 Binary files /dev/null and b/src/dragonball/src/dbs_boot/src/aarch64/test/output_with_pmu.dtb differ diff --git a/src/dragonball/src/dbs_boot/src/lib.rs b/src/dragonball/src/dbs_boot/src/lib.rs new file mode 100644 index 
000000000..e281b8d3c --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/lib.rs @@ -0,0 +1,27 @@ +// Copyright 2021-2022 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(missing_docs)] + +//! Constants, Structs and Utilities to setup boot environment for virtual machines. + +#[cfg(target_arch = "x86_64")] +mod x86_64; +#[cfg(target_arch = "x86_64")] +pub use x86_64::*; + +#[cfg(target_arch = "aarch64")] +mod aarch64; +#[cfg(target_arch = "aarch64")] +pub use aarch64::*; + +/// Specialized [std::result::Result] for boot related operations. +pub type Result = std::result::Result; + +/// Type for passing information about the initrd in the guest memory. +pub struct InitrdConfig { + /// Load address of initrd in guest memory + pub address: vm_memory::GuestAddress, + /// Size of initrd in guest memory + pub size: usize, +} diff --git a/src/dragonball/src/dbs_boot/src/vendor/bootparam.rs b/src/dragonball/src/dbs_boot/src/vendor/bootparam.rs new file mode 100644 index 000000000..b093ef842 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/vendor/bootparam.rs @@ -0,0 +1,41 @@ +// Copyright (C) 2023 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use arch_gen::x86::bootparam::{__u32, __u64}; +use vm_memory::bytes::Bytes; +use vm_memory::guest_memory::GuestAddress; +use vm_memory::{ByteValued, GuestMemory}; + +use super::layout; + +/// With reference to the x86_hardware_subarch enumeration type of the +/// kernel, we newly added the X86_SUBARCH_DRAGONBALL type and defined +/// it as 0xdbdbdb01 to mark this as a guest kernel. +#[allow(dead_code)] +pub enum X86HardwareSubarch { + X86SubarchPC = 0, + X86SubarchLGUEST = 1, + X86SubarchXEN = 2, + X86SubarchIntelMID = 3, + X86SubarchCE4100 = 4, + X86SubarchDragonball = 0xdbdbdb01, +} + +/// Recorded in subarch_data, used to verify the validity of dragonball subarch_data. 
+pub const DB_BOOT_PARAM_SIGNATURE: u64 = 0xdbdbb007700bbdbd; + +#[derive(Debug, PartialEq, thiserror::Error)] +pub enum Error { + /// Error dragonball boot parameter length + #[error("dragonball boot param exceeds max size")] + DragonballBootParamPastMaxSize, + + /// Error dragonball boot parameter location + #[error("dragonball boot param past ram end")] + DragonballBootParamPastRamEnd, + + /// Error writing dragonball boot parameter + #[error("dragonball boot param setup fail")] + WriteDragonballBootParam, +} + diff --git a/src/dragonball/src/dbs_boot/src/x86_64/bootparam.rs b/src/dragonball/src/dbs_boot/src/x86_64/bootparam.rs new file mode 100644 index 000000000..db5d3d550 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/x86_64/bootparam.rs @@ -0,0 +1,4628 @@ +// Copyright 2021-2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +/* + * automatically generated by rust-bindgen + * From upstream linux arch/x86/include/uapi/asm/bootparam.h at commit: + * 806276b7f07a39a1cc3f38bb1ef5c573d4594a38 + */ +#![allow(unused)] +#![allow(non_snake_case)] +#![allow(non_camel_case_types)] +#![allow(missing_docs)] +#![allow(deref_nullptr)] + +/* automatically generated by rust-bindgen 0.59.2 */ + +#[repr(C)] +#[derive(Default)] +pub struct __IncompleteArrayField(::std::marker::PhantomData, [T; 0]); +impl __IncompleteArrayField { + #[inline] + pub const fn new() -> Self { + __IncompleteArrayField(::std::marker::PhantomData, []) + } + #[inline] + pub fn as_ptr(&self) -> *const T { + self as *const _ as *const T + } + #[inline] + pub fn as_mut_ptr(&mut self) -> *mut T { + self as *mut _ as *mut T + } + #[allow(clippy::missing_safety_doc)] + #[inline] + pub unsafe fn as_slice(&self, len: usize) -> &[T] { + ::std::slice::from_raw_parts(self.as_ptr(), len) + } + #[allow(clippy::missing_safety_doc)] + #[inline] + pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { + ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) + } +} +impl ::std::fmt::Debug for __IncompleteArrayField { + fn fmt(&self, fmt: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { + fmt.write_str("__IncompleteArrayField") + } +} +pub const SETUP_NONE: u32 = 0; +pub const SETUP_E820_EXT: u32 = 1; +pub const SETUP_DTB: u32 = 2; +pub const SETUP_PCI: u32 = 3; +pub const SETUP_EFI: u32 = 4; +pub const SETUP_APPLE_PROPERTIES: u32 = 5; +pub const SETUP_JAILHOUSE: u32 = 6; +pub const SETUP_INDIRECT: u32 = 2147483648; +pub const SETUP_TYPE_MAX: u32 = 2147483654; +pub const RAMDISK_IMAGE_START_MASK: u32 = 2047; +pub const RAMDISK_PROMPT_FLAG: u32 = 32768; +pub const RAMDISK_LOAD_FLAG: u32 = 16384; +pub const LOADED_HIGH: u32 = 1; +pub const KASLR_FLAG: u32 = 2; +pub const QUIET_FLAG: u32 = 32; +pub const KEEP_SEGMENTS: u32 = 64; +pub const CAN_USE_HEAP: u32 = 128; +pub const XLF_KERNEL_64: u32 = 1; +pub const 
XLF_CAN_BE_LOADED_ABOVE_4G: u32 = 2; +pub const XLF_EFI_HANDOVER_32: u32 = 4; +pub const XLF_EFI_HANDOVER_64: u32 = 8; +pub const XLF_EFI_KEXEC: u32 = 16; +pub const XLF_5LEVEL: u32 = 32; +pub const XLF_5LEVEL_ENABLED: u32 = 64; +pub const __BITS_PER_LONG: u32 = 64; +pub const __FD_SETSIZE: u32 = 1024; +pub const VIDEO_TYPE_MDA: u32 = 16; +pub const VIDEO_TYPE_CGA: u32 = 17; +pub const VIDEO_TYPE_EGAM: u32 = 32; +pub const VIDEO_TYPE_EGAC: u32 = 33; +pub const VIDEO_TYPE_VGAC: u32 = 34; +pub const VIDEO_TYPE_VLFB: u32 = 35; +pub const VIDEO_TYPE_PICA_S3: u32 = 48; +pub const VIDEO_TYPE_MIPS_G364: u32 = 49; +pub const VIDEO_TYPE_SGI: u32 = 51; +pub const VIDEO_TYPE_TGAC: u32 = 64; +pub const VIDEO_TYPE_SUN: u32 = 80; +pub const VIDEO_TYPE_SUNPCI: u32 = 81; +pub const VIDEO_TYPE_PMAC: u32 = 96; +pub const VIDEO_TYPE_EFI: u32 = 112; +pub const VIDEO_FLAGS_NOCURSOR: u32 = 1; +pub const VIDEO_CAPABILITY_SKIP_QUIRKS: u32 = 1; +pub const VIDEO_CAPABILITY_64BIT_BASE: u32 = 2; +pub const APM_STATE_READY: u32 = 0; +pub const APM_STATE_STANDBY: u32 = 1; +pub const APM_STATE_SUSPEND: u32 = 2; +pub const APM_STATE_OFF: u32 = 3; +pub const APM_STATE_BUSY: u32 = 4; +pub const APM_STATE_REJECT: u32 = 5; +pub const APM_STATE_OEM_SYS: u32 = 32; +pub const APM_STATE_OEM_DEV: u32 = 64; +pub const APM_STATE_DISABLE: u32 = 0; +pub const APM_STATE_ENABLE: u32 = 1; +pub const APM_STATE_DISENGAGE: u32 = 0; +pub const APM_STATE_ENGAGE: u32 = 1; +pub const APM_SYS_STANDBY: u32 = 1; +pub const APM_SYS_SUSPEND: u32 = 2; +pub const APM_NORMAL_RESUME: u32 = 3; +pub const APM_CRITICAL_RESUME: u32 = 4; +pub const APM_LOW_BATTERY: u32 = 5; +pub const APM_POWER_STATUS_CHANGE: u32 = 6; +pub const APM_UPDATE_TIME: u32 = 7; +pub const APM_CRITICAL_SUSPEND: u32 = 8; +pub const APM_USER_STANDBY: u32 = 9; +pub const APM_USER_SUSPEND: u32 = 10; +pub const APM_STANDBY_RESUME: u32 = 11; +pub const APM_CAPABILITY_CHANGE: u32 = 12; +pub const APM_USER_HIBERNATION: u32 = 13; +pub const APM_HIBERNATION_RESUME: 
u32 = 14; +pub const APM_SUCCESS: u32 = 0; +pub const APM_DISABLED: u32 = 1; +pub const APM_CONNECTED: u32 = 2; +pub const APM_NOT_CONNECTED: u32 = 3; +pub const APM_16_CONNECTED: u32 = 5; +pub const APM_16_UNSUPPORTED: u32 = 6; +pub const APM_32_CONNECTED: u32 = 7; +pub const APM_32_UNSUPPORTED: u32 = 8; +pub const APM_BAD_DEVICE: u32 = 9; +pub const APM_BAD_PARAM: u32 = 10; +pub const APM_NOT_ENGAGED: u32 = 11; +pub const APM_BAD_FUNCTION: u32 = 12; +pub const APM_RESUME_DISABLED: u32 = 13; +pub const APM_NO_ERROR: u32 = 83; +pub const APM_BAD_STATE: u32 = 96; +pub const APM_NO_EVENTS: u32 = 128; +pub const APM_NOT_PRESENT: u32 = 134; +pub const APM_DEVICE_BIOS: u32 = 0; +pub const APM_DEVICE_ALL: u32 = 1; +pub const APM_DEVICE_DISPLAY: u32 = 256; +pub const APM_DEVICE_STORAGE: u32 = 512; +pub const APM_DEVICE_PARALLEL: u32 = 768; +pub const APM_DEVICE_SERIAL: u32 = 1024; +pub const APM_DEVICE_NETWORK: u32 = 1280; +pub const APM_DEVICE_PCMCIA: u32 = 1536; +pub const APM_DEVICE_BATTERY: u32 = 32768; +pub const APM_DEVICE_OEM: u32 = 57344; +pub const APM_DEVICE_OLD_ALL: u32 = 65535; +pub const APM_DEVICE_CLASS: u32 = 255; +pub const APM_DEVICE_MASK: u32 = 65280; +pub const APM_MAX_BATTERIES: u32 = 2; +pub const APM_CAP_GLOBAL_STANDBY: u32 = 1; +pub const APM_CAP_GLOBAL_SUSPEND: u32 = 2; +pub const APM_CAP_RESUME_STANDBY_TIMER: u32 = 4; +pub const APM_CAP_RESUME_SUSPEND_TIMER: u32 = 8; +pub const APM_CAP_RESUME_STANDBY_RING: u32 = 16; +pub const APM_CAP_RESUME_SUSPEND_RING: u32 = 32; +pub const APM_CAP_RESUME_STANDBY_PCMCIA: u32 = 64; +pub const APM_CAP_RESUME_SUSPEND_PCMCIA: u32 = 128; +pub const _IOC_NRBITS: u32 = 8; +pub const _IOC_TYPEBITS: u32 = 8; +pub const _IOC_SIZEBITS: u32 = 14; +pub const _IOC_DIRBITS: u32 = 2; +pub const _IOC_NRMASK: u32 = 255; +pub const _IOC_TYPEMASK: u32 = 255; +pub const _IOC_SIZEMASK: u32 = 16383; +pub const _IOC_DIRMASK: u32 = 3; +pub const _IOC_NRSHIFT: u32 = 0; +pub const _IOC_TYPESHIFT: u32 = 8; +pub const _IOC_SIZESHIFT: u32 = 
16; +pub const _IOC_DIRSHIFT: u32 = 30; +pub const _IOC_NONE: u32 = 0; +pub const _IOC_WRITE: u32 = 1; +pub const _IOC_READ: u32 = 2; +pub const IOC_IN: u32 = 1073741824; +pub const IOC_OUT: u32 = 2147483648; +pub const IOC_INOUT: u32 = 3221225472; +pub const IOCSIZE_MASK: u32 = 1073676288; +pub const IOCSIZE_SHIFT: u32 = 16; +pub const EDDNR: u32 = 489; +pub const EDDBUF: u32 = 3328; +pub const EDDMAXNR: u32 = 6; +pub const EDDEXTSIZE: u32 = 8; +pub const EDDPARMSIZE: u32 = 74; +pub const CHECKEXTENSIONSPRESENT: u32 = 65; +pub const GETDEVICEPARAMETERS: u32 = 72; +pub const LEGACYGETDEVICEPARAMETERS: u32 = 8; +pub const EDDMAGIC1: u32 = 21930; +pub const EDDMAGIC2: u32 = 43605; +pub const READ_SECTORS: u32 = 2; +pub const EDD_MBR_SIG_OFFSET: u32 = 440; +pub const EDD_MBR_SIG_BUF: u32 = 656; +pub const EDD_MBR_SIG_MAX: u32 = 16; +pub const EDD_MBR_SIG_NR_BUF: u32 = 490; +pub const EDD_EXT_FIXED_DISK_ACCESS: u32 = 1; +pub const EDD_EXT_DEVICE_LOCKING_AND_EJECTING: u32 = 2; +pub const EDD_EXT_ENHANCED_DISK_DRIVE_SUPPORT: u32 = 4; +pub const EDD_EXT_64BIT_EXTENSIONS: u32 = 8; +pub const EDD_INFO_DMA_BOUNDARY_ERROR_TRANSPARENT: u32 = 1; +pub const EDD_INFO_GEOMETRY_VALID: u32 = 2; +pub const EDD_INFO_REMOVABLE: u32 = 4; +pub const EDD_INFO_WRITE_VERIFY: u32 = 8; +pub const EDD_INFO_MEDIA_CHANGE_NOTIFICATION: u32 = 16; +pub const EDD_INFO_LOCKABLE: u32 = 32; +pub const EDD_INFO_NO_MEDIA_PRESENT: u32 = 64; +pub const EDD_INFO_USE_INT13_FN50: u32 = 128; +pub const E820_MAX_ENTRIES_ZEROPAGE: u32 = 128; +pub const JAILHOUSE_SETUP_REQUIRED_VERSION: u32 = 1; +pub const E820MAP: ::std::os::raw::c_uint = 720; +pub const E820MAX: ::std::os::raw::c_uint = 128; +pub const E820_X_MAX: ::std::os::raw::c_uint = 128; +pub const E820NR: ::std::os::raw::c_uint = 488; +pub const E820_RAM: ::std::os::raw::c_uint = 1; +pub const E820_RESERVED: ::std::os::raw::c_uint = 2; +pub const E820_ACPI: ::std::os::raw::c_uint = 3; +pub const E820_NVS: ::std::os::raw::c_uint = 4; +pub const 
E820_UNUSABLE: ::std::os::raw::c_uint = 5; +pub const E820_RESERVED_KERN: ::std::os::raw::c_uint = 128; +pub type __s8 = ::std::os::raw::c_schar; +pub type __u8 = ::std::os::raw::c_uchar; +pub type __s16 = ::std::os::raw::c_short; +pub type __u16 = ::std::os::raw::c_ushort; +pub type __s32 = ::std::os::raw::c_int; +pub type __u32 = ::std::os::raw::c_uint; +pub type __s64 = ::std::os::raw::c_longlong; +pub type __u64 = ::std::os::raw::c_ulonglong; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct __kernel_fd_set { + pub fds_bits: [::std::os::raw::c_ulong; 16usize], +} +#[test] +fn bindgen_test_layout___kernel_fd_set() { + assert_eq!( + ::std::mem::size_of::<__kernel_fd_set>(), + 128usize, + concat!("Size of: ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fd_set>(), + 8usize, + concat!("Alignment of ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::<__kernel_fd_set>())).fds_bits) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(__kernel_fd_set), + "::", + stringify!(fds_bits) + ) + ); +} +pub type __kernel_sighandler_t = + ::std::option::Option; +pub type __kernel_key_t = ::std::os::raw::c_int; +pub type __kernel_mqd_t = ::std::os::raw::c_int; +pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; +pub type __kernel_long_t = ::std::os::raw::c_long; +pub type __kernel_ulong_t = ::std::os::raw::c_ulong; +pub type __kernel_ino_t = __kernel_ulong_t; +pub type __kernel_mode_t = ::std::os::raw::c_uint; +pub type __kernel_pid_t = ::std::os::raw::c_int; +pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; +pub type __kernel_uid_t = ::std::os::raw::c_uint; +pub type __kernel_gid_t = ::std::os::raw::c_uint; +pub type __kernel_suseconds_t = __kernel_long_t; +pub type __kernel_daddr_t = ::std::os::raw::c_int; +pub type 
__kernel_uid32_t = ::std::os::raw::c_uint; +pub type __kernel_gid32_t = ::std::os::raw::c_uint; +pub type __kernel_size_t = __kernel_ulong_t; +pub type __kernel_ssize_t = __kernel_long_t; +pub type __kernel_ptrdiff_t = __kernel_long_t; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct __kernel_fsid_t { + pub val: [::std::os::raw::c_int; 2usize], +} +#[test] +fn bindgen_test_layout___kernel_fsid_t() { + assert_eq!( + ::std::mem::size_of::<__kernel_fsid_t>(), + 8usize, + concat!("Size of: ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fsid_t>(), + 4usize, + concat!("Alignment of ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::<__kernel_fsid_t>())).val) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(__kernel_fsid_t), + "::", + stringify!(val) + ) + ); +} +pub type __kernel_off_t = __kernel_long_t; +pub type __kernel_loff_t = ::std::os::raw::c_longlong; +pub type __kernel_old_time_t = __kernel_long_t; +pub type __kernel_time_t = __kernel_long_t; +pub type __kernel_time64_t = ::std::os::raw::c_longlong; +pub type __kernel_clock_t = __kernel_long_t; +pub type __kernel_timer_t = ::std::os::raw::c_int; +pub type __kernel_clockid_t = ::std::os::raw::c_int; +pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; +pub type __kernel_uid16_t = ::std::os::raw::c_ushort; +pub type __kernel_gid16_t = ::std::os::raw::c_ushort; +pub type __le16 = __u16; +pub type __be16 = __u16; +pub type __le32 = __u32; +pub type __be32 = __u32; +pub type __le64 = __u64; +pub type __be64 = __u64; +pub type __sum16 = __u16; +pub type __wsum = __u32; +pub type __poll_t = ::std::os::raw::c_uint; +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub struct screen_info { + pub orig_x: __u8, + pub orig_y: __u8, + pub ext_mem_k: __u16, + pub orig_video_page: __u16, + pub orig_video_mode: __u8, + pub orig_video_cols: __u8, + pub flags: __u8, + pub unused2: 
__u8, + pub orig_video_ega_bx: __u16, + pub unused3: __u16, + pub orig_video_lines: __u8, + pub orig_video_isVGA: __u8, + pub orig_video_points: __u16, + pub lfb_width: __u16, + pub lfb_height: __u16, + pub lfb_depth: __u16, + pub lfb_base: __u32, + pub lfb_size: __u32, + pub cl_magic: __u16, + pub cl_offset: __u16, + pub lfb_linelength: __u16, + pub red_size: __u8, + pub red_pos: __u8, + pub green_size: __u8, + pub green_pos: __u8, + pub blue_size: __u8, + pub blue_pos: __u8, + pub rsvd_size: __u8, + pub rsvd_pos: __u8, + pub vesapm_seg: __u16, + pub vesapm_off: __u16, + pub pages: __u16, + pub vesa_attributes: __u16, + pub capabilities: __u32, + pub ext_lfb_base: __u32, + pub _reserved: [__u8; 2usize], +} +#[test] +fn bindgen_test_layout_screen_info() { + assert_eq!( + ::std::mem::size_of::(), + 64usize, + concat!("Size of: ", stringify!(screen_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(screen_info)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_x) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_x) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_y) as *const _ as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_y) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_mem_k) as *const _ + as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(ext_mem_k) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_page) as *const _ + as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_page) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_mode) as *const _ + as usize + }, + 6usize, + 
concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_mode) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_cols) as *const _ + as usize + }, + 7usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_cols) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).flags) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(flags) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).unused2) as *const _ as usize + }, + 9usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(unused2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_ega_bx) as *const _ + as usize + }, + 10usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_ega_bx) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).unused3) as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(unused3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_lines) as *const _ + as usize + }, + 14usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_lines) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_isVGA) as *const _ + as usize + }, + 15usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_isVGA) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).orig_video_points) as *const _ + as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(orig_video_points) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!((*(::std::ptr::null::())).lfb_width) as *const _ + as usize + }, + 18usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(lfb_width) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).lfb_height) as *const _ + as usize + }, + 20usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(lfb_height) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).lfb_depth) as *const _ + as usize + }, + 22usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(lfb_depth) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).lfb_base) as *const _ as usize + }, + 24usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(lfb_base) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).lfb_size) as *const _ as usize + }, + 28usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(lfb_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cl_magic) as *const _ as usize + }, + 32usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(cl_magic) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cl_offset) as *const _ + as usize + }, + 34usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(cl_offset) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).lfb_linelength) as *const _ + as usize + }, + 36usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(lfb_linelength) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).red_size) as *const _ as usize + }, + 38usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(red_size) + ) + ); + assert_eq!( 
+ unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).red_pos) as *const _ as usize + }, + 39usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(red_pos) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).green_size) as *const _ + as usize + }, + 40usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(green_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).green_pos) as *const _ + as usize + }, + 41usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(green_pos) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).blue_size) as *const _ + as usize + }, + 42usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(blue_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).blue_pos) as *const _ as usize + }, + 43usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(blue_pos) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).rsvd_size) as *const _ + as usize + }, + 44usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(rsvd_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).rsvd_pos) as *const _ as usize + }, + 45usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(rsvd_pos) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).vesapm_seg) as *const _ + as usize + }, + 46usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(vesapm_seg) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).vesapm_off) as *const _ + as usize + }, + 48usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(vesapm_off) + ) + ); + 
assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).pages) as *const _ as usize + }, + 50usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(pages) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).vesa_attributes) as *const _ + as usize + }, + 52usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(vesa_attributes) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).capabilities) as *const _ + as usize + }, + 54usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(capabilities) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_lfb_base) as *const _ + as usize + }, + 58usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(ext_lfb_base) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._reserved) as *const _ + as usize + }, + 62usize, + concat!( + "Offset of field: ", + stringify!(screen_info), + "::", + stringify!(_reserved) + ) + ); +} +pub type apm_event_t = ::std::os::raw::c_ushort; +pub type apm_eventinfo_t = ::std::os::raw::c_ushort; +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct apm_bios_info { + pub version: __u16, + pub cseg: __u16, + pub offset: __u32, + pub cseg_16: __u16, + pub dseg: __u16, + pub flags: __u16, + pub cseg_len: __u16, + pub cseg_16_len: __u16, + pub dseg_len: __u16, +} +#[test] +fn bindgen_test_layout_apm_bios_info() { + assert_eq!( + ::std::mem::size_of::(), + 20usize, + concat!("Size of: ", stringify!(apm_bios_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(apm_bios_info)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).version) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(version) + 
) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cseg) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(cseg) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).offset) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(offset) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cseg_16) as *const _ + as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(cseg_16) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).dseg) as *const _ as usize + }, + 10usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(dseg) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).flags) as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(flags) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cseg_len) as *const _ + as usize + }, + 14usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(cseg_len) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cseg_16_len) as *const _ + as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(cseg_16_len) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).dseg_len) as *const _ + as usize + }, + 18usize, + concat!( + "Offset of field: ", + stringify!(apm_bios_info), + "::", + stringify!(dseg_len) + ) + ); +} +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct edd_device_params { + pub length: __u16, + pub info_flags: __u16, + pub num_default_cylinders: __u32, + pub num_default_heads: __u32, + pub sectors_per_track: __u32, + pub 
number_of_sectors: __u64, + pub bytes_per_sector: __u16, + pub dpte_ptr: __u32, + pub key: __u16, + pub device_path_info_length: __u8, + pub reserved2: __u8, + pub reserved3: __u16, + pub host_bus_type: [__u8; 4usize], + pub interface_type: [__u8; 8usize], + pub interface_path: edd_device_params__bindgen_ty_1, + pub device_path: edd_device_params__bindgen_ty_2, + pub reserved4: __u8, + pub checksum: __u8, +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union edd_device_params__bindgen_ty_1 { + pub isa: edd_device_params__bindgen_ty_1__bindgen_ty_1, + pub pci: edd_device_params__bindgen_ty_1__bindgen_ty_2, + pub ibnd: edd_device_params__bindgen_ty_1__bindgen_ty_3, + pub xprs: edd_device_params__bindgen_ty_1__bindgen_ty_4, + pub htpt: edd_device_params__bindgen_ty_1__bindgen_ty_5, + pub unknown: edd_device_params__bindgen_ty_1__bindgen_ty_6, +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_1__bindgen_ty_1 { + pub base_address: __u16, + pub reserved1: __u16, + pub reserved2: __u32, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_1) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())) + .base_address + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_1), + "::", + stringify!(base_address) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_1), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + 
(*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_1), + "::", + stringify!(reserved2) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_1__bindgen_ty_2 { + pub bus: __u8, + pub slot: __u8, + pub function: __u8, + pub channel: __u8, + pub reserved: __u32, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1__bindgen_ty_2() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).bus + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2), + "::", + stringify!(bus) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).slot + ) as *const _ as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2), + "::", + stringify!(slot) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).function + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2), + "::", + stringify!(function) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).channel + ) as *const _ as usize + }, + 3usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2), + "::", + stringify!(channel) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + 
stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_2), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_1__bindgen_ty_3 { + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1__bindgen_ty_3() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_3) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_3), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_1__bindgen_ty_4 { + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1__bindgen_ty_4() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_4) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_4) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_4), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_1__bindgen_ty_5 { + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1__bindgen_ty_5() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!( + "Size of: ", + 
stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_5) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_5) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_5), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_1__bindgen_ty_6 { + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1__bindgen_ty_6() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_6) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_6) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1__bindgen_ty_6), + "::", + stringify!(reserved) + ) + ); +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(edd_device_params__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(edd_device_params__bindgen_ty_1)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).isa) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1), + "::", + stringify!(isa) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).pci) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + 
stringify!(edd_device_params__bindgen_ty_1), + "::", + stringify!(pci) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ibnd) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1), + "::", + stringify!(ibnd) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).xprs) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1), + "::", + stringify!(xprs) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).htpt) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1), + "::", + stringify!(htpt) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).unknown) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_1), + "::", + stringify!(unknown) + ) + ); +} +#[repr(C)] +#[derive(Copy, Clone)] +pub union edd_device_params__bindgen_ty_2 { + pub ata: edd_device_params__bindgen_ty_2__bindgen_ty_1, + pub atapi: edd_device_params__bindgen_ty_2__bindgen_ty_2, + pub scsi: edd_device_params__bindgen_ty_2__bindgen_ty_3, + pub usb: edd_device_params__bindgen_ty_2__bindgen_ty_4, + pub i1394: edd_device_params__bindgen_ty_2__bindgen_ty_5, + pub fibre: edd_device_params__bindgen_ty_2__bindgen_ty_6, + pub i2o: edd_device_params__bindgen_ty_2__bindgen_ty_7, + pub raid: edd_device_params__bindgen_ty_2__bindgen_ty_8, + pub sata: edd_device_params__bindgen_ty_2__bindgen_ty_9, + pub unknown: edd_device_params__bindgen_ty_2__bindgen_ty_10, +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_1 { + pub device: __u8, + pub reserved1: __u8, + pub reserved2: __u16, + pub reserved3: __u32, + pub reserved4: __u64, +} +#[test] +fn 
bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).device + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1), + "::", + stringify!(device) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1), + "::", + stringify!(reserved2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved3 + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1), + "::", + stringify!(reserved3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved4 + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_1), + "::", + stringify!(reserved4) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_2 { + pub device: __u8, + pub lun: __u8, + pub reserved1: __u8, + pub reserved2: __u8, + pub reserved3: __u32, + pub reserved4: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_2() { + 
assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).device + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2), + "::", + stringify!(device) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).lun + ) as *const _ as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2), + "::", + stringify!(lun) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 3usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2), + "::", + stringify!(reserved2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved3 + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2), + "::", + stringify!(reserved3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved4 + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_2), + "::", + stringify!(reserved4) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_3 { + pub id: __u16, + pub lun: __u64, + pub 
reserved1: __u16, + pub reserved2: __u32, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_3() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_3) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).id + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_3), + "::", + stringify!(id) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).lun + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_3), + "::", + stringify!(lun) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 10usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_3), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_3), + "::", + stringify!(reserved2) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_4 { + pub serial_number: __u64, + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_4() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_4) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_4) + ) + ); + 
assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())) + .serial_number + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_4), + "::", + stringify!(serial_number) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_4), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_5 { + pub eui: __u64, + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_5() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_5) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_5) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).eui + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_5), + "::", + stringify!(eui) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_5), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_6 { + pub wwid: __u64, + pub lun: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_6() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_6) + ) + ); + assert_eq!( + 
::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_6) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).wwid + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_6), + "::", + stringify!(wwid) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).lun + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_6), + "::", + stringify!(lun) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_7 { + pub identity_tag: __u64, + pub reserved: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_7() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_7) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_7) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())) + .identity_tag + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_7), + "::", + stringify!(identity_tag) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_7), + "::", + stringify!(reserved) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_8 { + pub array_number: __u32, + pub reserved1: __u32, + pub reserved2: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_8() { + 
assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_8) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_8) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())) + .array_number + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_8), + "::", + stringify!(array_number) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_8), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_8), + "::", + stringify!(reserved2) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_9 { + pub device: __u8, + pub reserved1: __u8, + pub reserved2: __u16, + pub reserved3: __u32, + pub reserved4: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_9() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).device + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9), + "::", + stringify!(device) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9), + "::", + stringify!(reserved2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved3 + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9), + "::", + stringify!(reserved3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved4 + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_9), + "::", + stringify!(reserved4) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct edd_device_params__bindgen_ty_2__bindgen_ty_10 { + pub reserved1: __u64, + pub reserved2: __u64, +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2__bindgen_ty_10() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!( + "Size of: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_10) + ) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_10) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved1 + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_10), + "::", + stringify!(reserved1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).reserved2 + ) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + 
stringify!(edd_device_params__bindgen_ty_2__bindgen_ty_10), + "::", + stringify!(reserved2) + ) + ); +} +#[test] +fn bindgen_test_layout_edd_device_params__bindgen_ty_2() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(edd_device_params__bindgen_ty_2)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(edd_device_params__bindgen_ty_2)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ata) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(ata) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).atapi) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(atapi) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).scsi) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(scsi) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).usb) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(usb) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).i1394) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(i1394) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).fibre) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(fibre) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).i2o) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + 
stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(i2o) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).raid) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(raid) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).sata) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(sata) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).unknown) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params__bindgen_ty_2), + "::", + stringify!(unknown) + ) + ); +} +#[test] +fn bindgen_test_layout_edd_device_params() { + assert_eq!( + ::std::mem::size_of::(), + 74usize, + concat!("Size of: ", stringify!(edd_device_params)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(edd_device_params)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).length) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(length) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).info_flags) as *const _ + as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(info_flags) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).num_default_cylinders) + as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(num_default_cylinders) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).num_default_heads) + as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + 
stringify!(num_default_heads) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).sectors_per_track) + as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(sectors_per_track) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).number_of_sectors) + as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(number_of_sectors) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).bytes_per_sector) + as *const _ as usize + }, + 24usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(bytes_per_sector) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).dpte_ptr) as *const _ + as usize + }, + 26usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(dpte_ptr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).key) as *const _ + as usize + }, + 30usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(key) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).device_path_info_length) + as *const _ as usize + }, + 32usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(device_path_info_length) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).reserved2) as *const _ + as usize + }, + 33usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(reserved2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).reserved3) as *const _ + as usize + }, + 34usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(reserved3) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!((*(::std::ptr::null::())).host_bus_type) + as *const _ as usize + }, + 36usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(host_bus_type) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).interface_type) + as *const _ as usize + }, + 40usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(interface_type) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).interface_path) + as *const _ as usize + }, + 48usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(interface_path) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).device_path) as *const _ + as usize + }, + 56usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(device_path) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).reserved4) as *const _ + as usize + }, + 72usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(reserved4) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).checksum) as *const _ + as usize + }, + 73usize, + concat!( + "Offset of field: ", + stringify!(edd_device_params), + "::", + stringify!(checksum) + ) + ); +} +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct edd_info { + pub device: __u8, + pub version: __u8, + pub interface_support: __u16, + pub legacy_max_cylinder: __u16, + pub legacy_max_head: __u8, + pub legacy_sectors_per_track: __u8, + pub params: edd_device_params, +} +#[test] +fn bindgen_test_layout_edd_info() { + assert_eq!( + ::std::mem::size_of::(), + 82usize, + concat!("Size of: ", stringify!(edd_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(edd_info)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).device) as 
*const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(device) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).version) as *const _ as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(version) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).interface_support) as *const _ + as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(interface_support) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).legacy_max_cylinder) as *const _ + as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(legacy_max_cylinder) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).legacy_max_head) as *const _ + as usize + }, + 6usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(legacy_max_head) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).legacy_sectors_per_track) + as *const _ as usize + }, + 7usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(legacy_sectors_per_track) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).params) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(edd_info), + "::", + stringify!(params) + ) + ); +} +#[repr(C)] +#[derive(Copy, Clone)] +pub struct edd { + pub mbr_signature: [::std::os::raw::c_uint; 16usize], + pub edd_info: [edd_info; 6usize], + pub mbr_signature_nr: ::std::os::raw::c_uchar, + pub edd_info_nr: ::std::os::raw::c_uchar, +} +#[test] +fn bindgen_test_layout_edd() { + assert_eq!( + ::std::mem::size_of::(), + 560usize, + concat!("Size of: ", stringify!(edd)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(edd)) + ); + 
assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).mbr_signature) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edd), + "::", + stringify!(mbr_signature) + ) + ); + assert_eq!( + unsafe { std::ptr::addr_of!((*(::std::ptr::null::())).edd_info) as *const _ as usize }, + 64usize, + concat!( + "Offset of field: ", + stringify!(edd), + "::", + stringify!(edd_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).mbr_signature_nr) as *const _ as usize + }, + 556usize, + concat!( + "Offset of field: ", + stringify!(edd), + "::", + stringify!(mbr_signature_nr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).edd_info_nr) as *const _ as usize + }, + 557usize, + concat!( + "Offset of field: ", + stringify!(edd), + "::", + stringify!(edd_info_nr) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct ist_info { + pub signature: __u32, + pub command: __u32, + pub event: __u32, + pub perf_level: __u32, +} +#[test] +fn bindgen_test_layout_ist_info() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(ist_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ist_info)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).signature) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ist_info), + "::", + stringify!(signature) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).command) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(ist_info), + "::", + stringify!(command) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).event) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ist_info), + "::", + stringify!(event) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!((*(::std::ptr::null::())).perf_level) as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(ist_info), + "::", + stringify!(perf_level) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct edid_info { + pub dummy: [::std::os::raw::c_uchar; 128usize], +} +#[test] +fn bindgen_test_layout_edid_info() { + assert_eq!( + ::std::mem::size_of::(), + 128usize, + concat!("Size of: ", stringify!(edid_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(edid_info)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).dummy) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(edid_info), + "::", + stringify!(dummy) + ) + ); +} +#[repr(C)] +#[derive(Debug)] +pub struct setup_data { + pub next: __u64, + pub type_: __u32, + pub len: __u32, + pub data: __IncompleteArrayField<__u8>, +} +#[test] +fn bindgen_test_layout_setup_data() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(setup_data)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(setup_data)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).next) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(setup_data), + "::", + stringify!(next) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).type_) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(setup_data), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).len) as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(setup_data), + "::", + stringify!(len) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).data) as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + 
stringify!(setup_data), + "::", + stringify!(data) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct setup_indirect { + pub type_: __u32, + pub reserved: __u32, + pub len: __u64, + pub addr: __u64, +} +#[test] +fn bindgen_test_layout_setup_indirect() { + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(setup_indirect)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(setup_indirect)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).type_) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(setup_indirect), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).reserved) as *const _ + as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(setup_indirect), + "::", + stringify!(reserved) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).len) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(setup_indirect), + "::", + stringify!(len) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).addr) as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(setup_indirect), + "::", + stringify!(addr) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub struct setup_header { + pub setup_sects: __u8, + pub root_flags: __u16, + pub syssize: __u32, + pub ram_size: __u16, + pub vid_mode: __u16, + pub root_dev: __u16, + pub boot_flag: __u16, + pub jump: __u16, + pub header: __u32, + pub version: __u16, + pub realmode_swtch: __u32, + pub start_sys_seg: __u16, + pub kernel_version: __u16, + pub type_of_loader: __u8, + pub loadflags: __u8, + pub setup_move_size: __u16, + pub code32_start: __u32, + pub ramdisk_image: __u32, + pub ramdisk_size: __u32, + pub bootsect_kludge: __u32, + pub heap_end_ptr: __u16, + pub 
ext_loader_ver: __u8, + pub ext_loader_type: __u8, + pub cmd_line_ptr: __u32, + pub initrd_addr_max: __u32, + pub kernel_alignment: __u32, + pub relocatable_kernel: __u8, + pub min_alignment: __u8, + pub xloadflags: __u16, + pub cmdline_size: __u32, + pub hardware_subarch: __u32, + pub hardware_subarch_data: __u64, + pub payload_offset: __u32, + pub payload_length: __u32, + pub setup_data: __u64, + pub pref_address: __u64, + pub init_size: __u32, + pub handover_offset: __u32, + pub kernel_info_offset: __u32, +} +#[test] +fn bindgen_test_layout_setup_header() { + assert_eq!( + ::std::mem::size_of::(), + 123usize, + concat!("Size of: ", stringify!(setup_header)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(setup_header)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).setup_sects) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(setup_sects) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).root_flags) as *const _ + as usize + }, + 1usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(root_flags) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).syssize) as *const _ as usize + }, + 3usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(syssize) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ram_size) as *const _ + as usize + }, + 7usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(ram_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).vid_mode) as *const _ + as usize + }, + 9usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(vid_mode) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).root_dev) as *const _ + as 
usize + }, + 11usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(root_dev) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).boot_flag) as *const _ + as usize + }, + 13usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(boot_flag) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).jump) as *const _ as usize + }, + 15usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(jump) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).header) as *const _ as usize + }, + 17usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(header) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).version) as *const _ as usize + }, + 21usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(version) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).realmode_swtch) as *const _ + as usize + }, + 23usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(realmode_swtch) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).start_sys_seg) as *const _ + as usize + }, + 27usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(start_sys_seg) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).kernel_version) as *const _ + as usize + }, + 29usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(kernel_version) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).type_of_loader) as *const _ + as usize + }, + 31usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(type_of_loader) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!((*(::std::ptr::null::())).loadflags) as *const _ + as usize + }, + 32usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(loadflags) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).setup_move_size) as *const _ + as usize + }, + 33usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(setup_move_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).code32_start) as *const _ + as usize + }, + 35usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(code32_start) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ramdisk_image) as *const _ + as usize + }, + 39usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(ramdisk_image) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ramdisk_size) as *const _ + as usize + }, + 43usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(ramdisk_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).bootsect_kludge) as *const _ + as usize + }, + 47usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(bootsect_kludge) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).heap_end_ptr) as *const _ + as usize + }, + 51usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(heap_end_ptr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_loader_ver) as *const _ + as usize + }, + 53usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(ext_loader_ver) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_loader_type) as *const _ + as usize + }, + 54usize, + concat!( + "Offset of field: ", + 
stringify!(setup_header), + "::", + stringify!(ext_loader_type) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cmd_line_ptr) as *const _ + as usize + }, + 55usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(cmd_line_ptr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).initrd_addr_max) as *const _ + as usize + }, + 59usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(initrd_addr_max) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).kernel_alignment) as *const _ + as usize + }, + 63usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(kernel_alignment) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).relocatable_kernel) + as *const _ as usize + }, + 67usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(relocatable_kernel) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).min_alignment) as *const _ + as usize + }, + 68usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(min_alignment) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).xloadflags) as *const _ + as usize + }, + 69usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(xloadflags) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cmdline_size) as *const _ + as usize + }, + 71usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(cmdline_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).hardware_subarch) as *const _ + as usize + }, + 75usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(hardware_subarch) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!((*(::std::ptr::null::())).hardware_subarch_data) + as *const _ as usize + }, + 79usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(hardware_subarch_data) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).payload_offset) as *const _ + as usize + }, + 87usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(payload_offset) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).payload_length) as *const _ + as usize + }, + 91usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(payload_length) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).setup_data) as *const _ + as usize + }, + 95usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(setup_data) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).pref_address) as *const _ + as usize + }, + 103usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(pref_address) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).init_size) as *const _ + as usize + }, + 111usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(init_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).handover_offset) as *const _ + as usize + }, + 115usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(handover_offset) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).kernel_info_offset) + as *const _ as usize + }, + 119usize, + concat!( + "Offset of field: ", + stringify!(setup_header), + "::", + stringify!(kernel_info_offset) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct sys_desc_table { + pub length: __u16, + pub table: [__u8; 14usize], +} 
+#[test] +fn bindgen_test_layout_sys_desc_table() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(sys_desc_table)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(sys_desc_table)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).length) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(sys_desc_table), + "::", + stringify!(length) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).table) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(sys_desc_table), + "::", + stringify!(table) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub struct olpc_ofw_header { + pub ofw_magic: __u32, + pub ofw_version: __u32, + pub cif_handler: __u32, + pub irq_desc_table: __u32, +} +#[test] +fn bindgen_test_layout_olpc_ofw_header() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(olpc_ofw_header)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(olpc_ofw_header)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ofw_magic) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(olpc_ofw_header), + "::", + stringify!(ofw_magic) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ofw_version) as *const _ + as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(olpc_ofw_header), + "::", + stringify!(ofw_version) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).cif_handler) as *const _ + as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(olpc_ofw_header), + "::", + stringify!(cif_handler) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).irq_desc_table) + as *const _ as usize + }, + 12usize, 
+ concat!( + "Offset of field: ", + stringify!(olpc_ofw_header), + "::", + stringify!(irq_desc_table) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone, Default)] +pub struct efi_info { + pub efi_loader_signature: __u32, + pub efi_systab: __u32, + pub efi_memdesc_size: __u32, + pub efi_memdesc_version: __u32, + pub efi_memmap: __u32, + pub efi_memmap_size: __u32, + pub efi_systab_hi: __u32, + pub efi_memmap_hi: __u32, +} +#[test] +fn bindgen_test_layout_efi_info() { + assert_eq!( + ::std::mem::size_of::(), + 32usize, + concat!("Size of: ", stringify!(efi_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(efi_info)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_loader_signature) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_loader_signature) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_systab) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_systab) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_memdesc_size) as *const _ + as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_memdesc_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_memdesc_version) as *const _ + as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_memdesc_version) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_memmap) as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_memmap) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_memmap_size) as *const _ + as usize + }, + 20usize, + concat!( + "Offset of field: 
", + stringify!(efi_info), + "::", + stringify!(efi_memmap_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_systab_hi) as *const _ + as usize + }, + 24usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_systab_hi) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).efi_memmap_hi) as *const _ + as usize + }, + 28usize, + concat!( + "Offset of field: ", + stringify!(efi_info), + "::", + stringify!(efi_memmap_hi) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone, Default)] +pub struct boot_e820_entry { + pub addr: __u64, + pub size: __u64, + pub type_: __u32, +} +#[test] +fn bindgen_test_layout_boot_e820_entry() { + assert_eq!( + ::std::mem::size_of::(), + 20usize, + concat!("Size of: ", stringify!(boot_e820_entry)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(boot_e820_entry)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).addr) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(boot_e820_entry), + "::", + stringify!(addr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).size) as *const _ as usize + }, + 8usize, + concat!( + "Offset of field: ", + stringify!(boot_e820_entry), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).type_) as *const _ + as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(boot_e820_entry), + "::", + stringify!(type_) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct jailhouse_setup_data { + pub hdr: jailhouse_setup_data__bindgen_ty_1, + pub v1: jailhouse_setup_data__bindgen_ty_2, + pub v2: jailhouse_setup_data__bindgen_ty_3, +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct jailhouse_setup_data__bindgen_ty_1 { + pub version: __u16, + pub compatible_version: __u16, +} 
+#[test] +fn bindgen_test_layout_jailhouse_setup_data__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(jailhouse_setup_data__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(jailhouse_setup_data__bindgen_ty_1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).version + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_1), + "::", + stringify!(version) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).compatible_version + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_1), + "::", + stringify!(compatible_version) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct jailhouse_setup_data__bindgen_ty_2 { + pub pm_timer_address: __u16, + pub num_cpus: __u16, + pub pci_mmconfig_base: __u64, + pub tsc_khz: __u32, + pub apic_khz: __u32, + pub standard_ioapic: __u8, + pub cpu_ids: [__u8; 255usize], +} +#[test] +fn bindgen_test_layout_jailhouse_setup_data__bindgen_ty_2() { + assert_eq!( + ::std::mem::size_of::(), + 276usize, + concat!("Size of: ", stringify!(jailhouse_setup_data__bindgen_ty_2)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(jailhouse_setup_data__bindgen_ty_2) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).pm_timer_address + ) as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(pm_timer_address) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).num_cpus + ) as *const _ as usize + }, + 2usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(num_cpus) + ) 
+ ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).pci_mmconfig_base + ) as *const _ as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(pci_mmconfig_base) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).tsc_khz + ) as *const _ as usize + }, + 12usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(tsc_khz) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).apic_khz + ) as *const _ as usize + }, + 16usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(apic_khz) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).standard_ioapic + ) as *const _ as usize + }, + 20usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(standard_ioapic) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!( + (*(::std::ptr::null::())).cpu_ids + ) as *const _ as usize + }, + 21usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_2), + "::", + stringify!(cpu_ids) + ) + ); +} +#[repr(C, packed)] +#[derive(Debug, Copy, Clone)] +pub struct jailhouse_setup_data__bindgen_ty_3 { + pub flags: __u32, +} +#[test] +fn bindgen_test_layout_jailhouse_setup_data__bindgen_ty_3() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(jailhouse_setup_data__bindgen_ty_3)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!( + "Alignment of ", + stringify!(jailhouse_setup_data__bindgen_ty_3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).flags) + as *const _ as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data__bindgen_ty_3), + "::", + stringify!(flags) + ) + ); +} +#[test] +fn 
bindgen_test_layout_jailhouse_setup_data() { + assert_eq!( + ::std::mem::size_of::(), + 284usize, + concat!("Size of: ", stringify!(jailhouse_setup_data)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(jailhouse_setup_data)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).hdr) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data), + "::", + stringify!(hdr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).v1) as *const _ + as usize + }, + 4usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data), + "::", + stringify!(v1) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).v2) as *const _ + as usize + }, + 280usize, + concat!( + "Offset of field: ", + stringify!(jailhouse_setup_data), + "::", + stringify!(v2) + ) + ); +} +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct boot_params { + pub screen_info: screen_info, + pub apm_bios_info: apm_bios_info, + pub _pad2: [__u8; 4usize], + pub tboot_addr: __u64, + pub ist_info: ist_info, + pub acpi_rsdp_addr: __u64, + pub _pad3: [__u8; 8usize], + pub hd0_info: [__u8; 16usize], + pub hd1_info: [__u8; 16usize], + pub sys_desc_table: sys_desc_table, + pub olpc_ofw_header: olpc_ofw_header, + pub ext_ramdisk_image: __u32, + pub ext_ramdisk_size: __u32, + pub ext_cmd_line_ptr: __u32, + pub _pad4: [__u8; 116usize], + pub edid_info: edid_info, + pub efi_info: efi_info, + pub alt_mem_k: __u32, + pub scratch: __u32, + pub e820_entries: __u8, + pub eddbuf_entries: __u8, + pub edd_mbr_sig_buf_entries: __u8, + pub kbd_status: __u8, + pub secure_boot: __u8, + pub _pad5: [__u8; 2usize], + pub sentinel: __u8, + pub _pad6: [__u8; 1usize], + pub hdr: setup_header, + pub _pad7: [__u8; 36usize], + pub edd_mbr_sig_buffer: [__u32; 16usize], + pub e820_table: [boot_e820_entry; 128usize], + pub _pad8: [__u8; 48usize], + pub eddbuf: 
[edd_info; 6usize], + pub _pad9: [__u8; 276usize], +} +#[test] +fn bindgen_test_layout_boot_params() { + assert_eq!( + ::std::mem::size_of::(), + 4096usize, + concat!("Size of: ", stringify!(boot_params)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(boot_params)) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).screen_info) as *const _ + as usize + }, + 0usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(screen_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).apm_bios_info) as *const _ + as usize + }, + 64usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(apm_bios_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad2) as *const _ as usize + }, + 84usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(_pad2) + ) + ); + + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).tboot_addr) as *const _ + as usize + }, + 88usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(tboot_addr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ist_info) as *const _ as usize + }, + 96usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(ist_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).acpi_rsdp_addr) as *const _ + as usize + }, + 112usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(acpi_rsdp_addr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad3) as *const _ as usize + }, + 120usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(_pad3) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).hd0_info) as *const _ as usize + }, + 
128usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(hd0_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).hd1_info) as *const _ as usize + }, + 144usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(hd1_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).sys_desc_table) as *const _ + as usize + }, + 160usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(sys_desc_table) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).olpc_ofw_header) as *const _ + as usize + }, + 176usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(olpc_ofw_header) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_ramdisk_image) as *const _ + as usize + }, + 192usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(ext_ramdisk_image) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_ramdisk_size) as *const _ + as usize + }, + 196usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(ext_ramdisk_size) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).ext_cmd_line_ptr) as *const _ + as usize + }, + 200usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(ext_cmd_line_ptr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad4) as *const _ as usize + }, + 204usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(_pad4) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).edid_info) as *const _ + as usize + }, + 320usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(edid_info) + ) + ); + assert_eq!( + unsafe { + 
std::ptr::addr_of!((*(::std::ptr::null::())).efi_info) as *const _ as usize + }, + 448usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(efi_info) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).alt_mem_k) as *const _ + as usize + }, + 480usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(alt_mem_k) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).scratch) as *const _ as usize + }, + 484usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(scratch) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).e820_entries) as *const _ + as usize + }, + 488usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(e820_entries) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).eddbuf_entries) as *const _ + as usize + }, + 489usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(eddbuf_entries) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).edd_mbr_sig_buf_entries) + as *const _ as usize + }, + 490usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(edd_mbr_sig_buf_entries) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).kbd_status) as *const _ + as usize + }, + 491usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(kbd_status) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).secure_boot) as *const _ + as usize + }, + 492usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(secure_boot) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad5) as *const _ as usize + }, + 493usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", 
+ stringify!(_pad5) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).sentinel) as *const _ as usize + }, + 495usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(sentinel) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad6) as *const _ as usize + }, + 496usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(_pad6) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).hdr) as *const _ as usize + }, + 497usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(hdr) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad7) as *const _ as usize + }, + 620usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(_pad7) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).edd_mbr_sig_buffer) + as *const _ as usize + }, + 656usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(edd_mbr_sig_buffer) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).e820_table) as *const _ + as usize + }, + 720usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(e820_table) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad8) as *const _ as usize + }, + 3280usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(_pad8) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::())).eddbuf) as *const _ as usize + }, + 3328usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + stringify!(eddbuf) + ) + ); + assert_eq!( + unsafe { + std::ptr::addr_of!((*(::std::ptr::null::()))._pad9) as *const _ as usize + }, + 3820usize, + concat!( + "Offset of field: ", + stringify!(boot_params), + "::", + 
stringify!(_pad9) + ) + ); +} + +impl Default for boot_params { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +pub const X86_HARDWARE_SUBARCH_X86_SUBARCH_PC: x86_hardware_subarch = 0; +pub const X86_HARDWARE_SUBARCH_X86_SUBARCH_LGUEST: x86_hardware_subarch = 1; +pub const X86_HARDWARE_SUBARCH_X86_SUBARCH_XEN: x86_hardware_subarch = 2; +pub const X86_HARDWARE_SUBARCH_X86_SUBARCH_INTEL_MID: x86_hardware_subarch = 3; +pub const X86_HARDWARE_SUBARCH_X86_SUBARCH_CE4100: x86_hardware_subarch = 4; +pub const X86_HARDWARE_SUBARCH_X86_NR_SUBARCHS: x86_hardware_subarch = 5; + +#[doc = " enum x86_hardware_subarch - x86 hardware subarchitecture"] +#[doc = ""] +#[doc = " The x86 hardware_subarch and hardware_subarch_data were added as of the x86"] +#[doc = " boot protocol 2.07 to help distinguish and support custom x86 boot"] +#[doc = " sequences. This enum represents accepted values for the x86"] +#[doc = " hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not"] +#[doc = " have or simply *cannot* make use of natural stubs like BIOS or EFI, the"] +#[doc = " hardware_subarch can be used on the Linux entry path to revector to a"] +#[doc = " subarchitecture stub when needed. This subarchitecture stub can be used to"] +#[doc = " set up Linux boot parameters or for special care to account for nonstandard"] +#[doc = " handling of page tables."] +#[doc = ""] +#[doc = " These enums should only ever be used by x86 code, and the code that uses"] +#[doc = " it should be well contained and compartmentalized."] +#[doc = ""] +#[doc = " KVM and Xen HVM do not have a subarch as these are expected to follow"] +#[doc = " standard x86 boot entries. If there is a genuine need for \"hypervisor\" type"] +#[doc = " that should be considered separately in the future. 
Future guest types"] +#[doc = " should seriously consider working with standard x86 boot stubs such as"] +#[doc = " the BIOS or EFI boot stubs."] +#[doc = ""] +#[doc = " WARNING: this enum is only used for legacy hacks, for platform features that"] +#[doc = "\t are not easily enumerated or discoverable. You should not ever use"] +#[doc = "\t this for new features."] +#[doc = ""] +#[doc = " @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard"] +#[doc = "\tPC mechanisms (PCI, ACPI) and doesn't need a special boot flow."] +#[doc = " @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated"] +#[doc = " @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,"] +#[doc = " \twhich start at asm startup_xen() entry point and later jump to the C"] +#[doc = " \txen_start_kernel() entry point. Both domU and dom0 type of guests are"] +#[doc = " \tcurrently supported through this PV boot path."] +#[doc = " @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform"] +#[doc = "\tsystems which do not have the PCI legacy interfaces."] +#[doc = " @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC"] +#[doc = " \tfor settop boxes and media devices, the use of a subarch for CE4100"] +#[doc = " \tis more of a hack..."] +pub type x86_hardware_subarch = ::std::os::raw::c_uint; diff --git a/src/dragonball/src/dbs_boot/src/x86_64/layout.rs b/src/dragonball/src/dbs_boot/src/x86_64/layout.rs new file mode 100644 index 000000000..4bd65dbe0 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/x86_64/layout.rs @@ -0,0 +1,100 @@ +// Copyright 2021-2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use lazy_static::lazy_static; + +/// Magic addresses externally used to lay out x86_64 VMs. + +/// Global Descriptor Table Offset +pub const BOOT_GDT_OFFSET: u64 = 0x500; +/// Interrupt Descriptor Table Offset +pub const BOOT_IDT_OFFSET: u64 = 0x520; + +/// Address of Global Descriptor Table (GDT) +pub const BOOT_GDT_ADDRESS: u64 = 0x500; +/// Number of initial GDT entries. +pub const BOOT_GDT_MAX: usize = 4; + +/// Address of Interrupt Descriptor Table (IDT) +pub const BOOT_IDT_ADDRESS: u64 = 0x520; + +/// The 'zero page', a.k.a linux kernel bootparams. +pub const ZERO_PAGE_START: u64 = 0x7000; + +/// Initial stack for the boot CPU. +pub const BOOT_STACK_POINTER: u64 = 0x8ff0; + +/// Address of page table level 4 page +pub const PML4_START: u64 = 0x9000; +/// Address of page table level 3 page +pub const PDPTE_START: u64 = 0xa000; +/// Address of page table level 2 page +pub const PDE_START: u64 = 0xb000; + +/// Kernel command line start address. +pub const CMDLINE_START: u64 = 0x20000; +/// Kernel command line start address maximum size. +pub const CMDLINE_MAX_SIZE: usize = 0x10000; + +/// Kernel dragonball boot parameters start address. +pub const DB_BOOT_PARAM_START: u64 = 0x30000; +/// Kernel dragonball boot parameters length maximum size. +pub const DB_BOOT_PARAM_MAX_SIZE: u32 = 0x10000; + +/// Start of the high memory. +pub const HIMEM_START: u64 = 0x0010_0000; //1 MB. + +// Typically, on x86 systems 16 IRQs are used (0-15). +/// First usable IRQ ID for virtio device interrupts on x86_64. +pub const IRQ_BASE: u32 = 5; +/// Last usable IRQ ID for virtio device interrupts on x86_64. +pub const IRQ_MAX: u32 = 15; + +/// Address for the TSS setup. +pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; + +/// Where BIOS/VGA magic would live on a real PC. +pub const EBDA_START: u64 = 0x9fc00; + +/// Start address of the lower MMIO window. +pub const MMIO_LOW_START: u64 = 3u64 << 30; +/// End address (inclusive) of the lower MMIO window. 
+pub const MMIO_LOW_END: u64 = (4u64 << 30) - 1; +/// Lower bound of guest memory. +pub const GUEST_MEM_START: u64 = 0u64; +/// Size of memory below MMIO hole. +pub const GUEST_MEM_LOW_SIZE: u64 = MMIO_LOW_START - GUEST_MEM_START; + +/// Max retry times for reading /proc/cpuinfo +const CPUINFO_READ_RETRY: u64 = 5; + +lazy_static! { + /// Maximum guest physical address supported. + pub static ref GUEST_PHYS_END: u64 = { + for _ in 0..CPUINFO_READ_RETRY { + if let Ok(buf) = std::fs::read("/proc/cpuinfo") { + let content = String::from_utf8_lossy(&buf); + for line in content.lines() { + if line.starts_with("address sizes : ") { + if let Some(end) = line.find(" bits physical") { + if let Ok(size) = line[16..end].parse::() { + if (36..=64).contains(&size) { + return (1u64 << size) - 1; + } + } + } + } + } + } + } + panic!("Exceed max retry times. Cannot get physical address size from /proc/cpuinfo"); + }; + + /// Upper bound of guest memory. + pub static ref GUEST_MEM_END: u64 = *GUEST_PHYS_END >> 1; +} diff --git a/src/dragonball/src/dbs_boot/src/x86_64/mod.rs b/src/dragonball/src/dbs_boot/src/x86_64/mod.rs new file mode 100644 index 000000000..50443ed14 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/x86_64/mod.rs @@ -0,0 +1,325 @@ +// Copyright 2021 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! VM boot related constants and utilities for `x86_64` architecture. + +use dbs_arch::gdt::gdt_entry; +use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryRegion}; + +use self::layout::{BOOT_GDT_ADDRESS, BOOT_GDT_MAX, BOOT_IDT_ADDRESS}; +use super::Result; + +/// Magic addresses externally used to lay out x86_64 VMs. 
+pub mod layout; + +/// Structure definitions for SMP machines following the Intel Multiprocessing Specification 1.1 and 1.4. +pub mod mpspec; + +/// MP Table configurations used for defining VM boot status. +pub mod mptable; + +/// Guest boot parameters used for config guest information. +pub mod bootparam; + +/// Default (smallest) memory page size for the supported architectures. +pub const PAGE_SIZE: usize = 4096; + +/// Boot parameters wrapper for ByteValue trait +// This is a workaround to the Rust enforcement specifying that any implementation of a foreign +// trait (in this case `ByteValued`) where: +// * the type that is implementing the trait is foreign or +// * all of the parameters being passed to the trait (if there are any) are also foreign +// is prohibited. +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +pub struct BootParamsWrapper(pub bootparam::boot_params); + +// It is safe to initialize BootParamsWrap which is a wrapper over `boot_params` (a series of ints). +unsafe impl ByteValued for BootParamsWrapper {} + +/// Errors thrown while configuring x86_64 system. +#[derive(Debug, Eq, PartialEq, thiserror::Error)] +pub enum Error { + /// Invalid e820 setup params. + #[error("invalid e820 setup parameters")] + E820Configuration, + + /// Error writing MP table to memory. + #[error("failed to write MP table to guest memory")] + MpTableSetup(#[source] mptable::Error), + + /// The zero page extends past the end of guest_mem. + #[error("the guest zero page extends past the end of guest memory")] + ZeroPagePastRamEnd, + + /// Error writing the zero page of guest memory. + #[error("failed to write to guest zero page")] + ZeroPageSetup, + + /// Failed to compute initrd address. + #[error("invalid guest memory address for Initrd")] + InitrdAddress, + + /// boot parameter setup fail. + #[error("write boot parameter fail")] + BootParamSetup, + + /// Empty AddressSpace from parameters. 
+ #[error("Empty AddressSpace from parameters")] + AddressSpace, + + /// Writing PDPTE to RAM failed. + #[error("Writing PDPTE to RAM failed.")] + WritePDPTEAddress, + + /// Writing PDE to RAM failed. + #[error("Writing PDE to RAM failed.")] + WritePDEAddress, + + #[error("Writing PML4 to RAM failed.")] + /// Writing PML4 to RAM failed. + WritePML4Address, +} + +/// Initialize the 1:1 identity mapping table for guest memory range [0..1G). +/// +/// Also, return the pml4 address for sregs setting and AP boot +pub fn setup_identity_mapping(mem: &M) -> Result { + // Puts PML4 right after zero page but aligned to 4k. + let boot_pml4_addr = GuestAddress(layout::PML4_START); + let boot_pdpte_addr = GuestAddress(layout::PDPTE_START); + let boot_pde_addr = GuestAddress(layout::PDE_START); + + // Entry covering VA [0..512GB) + mem.write_obj(boot_pdpte_addr.raw_value() | 0x03, boot_pml4_addr) + .map_err(|_| Error::WritePML4Address)?; + + // Entry covering VA [0..1GB) + mem.write_obj(boot_pde_addr.raw_value() | 0x03, boot_pdpte_addr) + .map_err(|_| Error::WritePDPTEAddress)?; + + // 512 2MB entries together covering VA [0..1GB). Note we are assuming + // CPU supports 2MB pages (/proc/cpuinfo has 'pse'). All modern CPUs do. + for i in 0..512 { + mem.write_obj((i << 21) + 0x83u64, boot_pde_addr.unchecked_add(i * 8)) + .map_err(|_| Error::WritePDEAddress)?; + } + + // return the pml4 address that could be used for AP boot up and later sreg setting process. + Ok(boot_pml4_addr) +} + +/// Get information to configure GDT/IDT. +pub fn get_descriptor_config_info() -> ([u64; BOOT_GDT_MAX], u64, u64) { + let gdt_table: [u64; BOOT_GDT_MAX] = [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ]; + + (gdt_table, BOOT_GDT_ADDRESS, BOOT_IDT_ADDRESS) +} + +/// Returns the memory address where the initrd could be loaded. 
+pub fn initrd_load_addr(guest_mem: &M, initrd_size: u64) -> Result { + let lowmem_size = guest_mem + .find_region(GuestAddress(0)) + .ok_or(Error::InitrdAddress) + .map(|r| r.len())?; + + // For safety to avoid overlap, reserve 32M for kernel and boot params in low end. + if lowmem_size < initrd_size + (32 << 20) { + return Err(Error::InitrdAddress); + } + + let align_to_pagesize = |address| address & !(PAGE_SIZE as u64 - 1); + Ok(align_to_pagesize(lowmem_size - initrd_size)) +} + +/// Returns the memory address where the kernel could be loaded. +pub fn get_kernel_start() -> u64 { + layout::HIMEM_START +} + +/// Add an e820 region to the e820 map. +/// Returns Ok(()) if successful, or an error if there is no space left in the map. +pub fn add_e820_entry( + params: &mut bootparam::boot_params, + addr: u64, + size: u64, + mem_type: u32, +) -> Result<()> { + if params.e820_entries >= params.e820_table.len() as u8 { + return Err(Error::E820Configuration); + } + + params.e820_table[params.e820_entries as usize].addr = addr; + params.e820_table[params.e820_entries as usize].size = size; + params.e820_table[params.e820_entries as usize].type_ = mem_type; + params.e820_entries += 1; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::bootparam::{boot_e820_entry, boot_params}; + use crate::layout::{PDE_START, PDPTE_START, PML4_START}; + use kvm_bindings::kvm_sregs; + use kvm_ioctls::Kvm; + use vm_memory::GuestMemoryMmap; + + const BOOT_GDT_OFFSET: u64 = 0x500; + const BOOT_IDT_OFFSET: u64 = 0x520; + + fn read_u64(gm: &GuestMemoryMmap, offset: u64) -> u64 { + let read_addr = GuestAddress(offset); + gm.read_obj(read_addr).unwrap() + } + + #[test] + fn test_get_descriptor_config_info() { + let (gdt_table, gdt_addr, idt_addr) = get_descriptor_config_info(); + + assert_eq!(gdt_table.len(), BOOT_GDT_MAX); + assert_eq!(gdt_addr, BOOT_GDT_ADDRESS); + assert_eq!(idt_addr, BOOT_IDT_ADDRESS); + } + + #[test] + fn test_setup_identity_mapping() { + let gm = 
GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + setup_identity_mapping(&gm).unwrap(); + assert_eq!(0xa003, read_u64(&gm, PML4_START)); + assert_eq!(0xb003, read_u64(&gm, PDPTE_START)); + for i in 0..512 { + assert_eq!((i << 21) + 0x83u64, read_u64(&gm, PDE_START + (i * 8))); + } + } + + #[test] + fn test_write_boot_param() { + const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55; + const KERNEL_HDR_MAGIC: u32 = 0x5372_6448; + const KERNEL_LOADER_OTHER: u8 = 0xff; + const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. + let mut params: BootParamsWrapper = BootParamsWrapper(bootparam::boot_params::default()); + + params.0.hdr.type_of_loader = KERNEL_LOADER_OTHER; + params.0.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC; + params.0.hdr.header = KERNEL_HDR_MAGIC; + params.0.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES; + + assert_eq!(params.0.hdr.type_of_loader, KERNEL_LOADER_OTHER); + assert_eq!( + unsafe { std::ptr::addr_of!(params.0.hdr.boot_flag).read_unaligned() }, + KERNEL_BOOT_FLAG_MAGIC + ); + assert_eq!( + unsafe { std::ptr::addr_of!(params.0.hdr.header).read_unaligned() }, + KERNEL_HDR_MAGIC + ); + assert_eq!( + unsafe { std::ptr::addr_of!(params.0.hdr.kernel_alignment).read_unaligned() }, + KERNEL_MIN_ALIGNMENT_BYTES + ); + } + + fn validate_page_tables( + gm: &GuestMemoryMmap, + sregs: &kvm_sregs, + existing_pgtable: Option, + ) { + assert_eq!(0xa003, read_u64(gm, PML4_START)); + assert_eq!(0xb003, read_u64(gm, PDPTE_START)); + for i in 0..512 { + assert_eq!((i << 21) + 0x83u64, read_u64(gm, PDE_START + (i * 8))); + } + + if let Some(pgtable_base) = existing_pgtable { + assert_eq!(pgtable_base.raw_value(), sregs.cr3); + } else { + assert_eq!(PML4_START, sregs.cr3); + } + assert!(sregs.cr4 & dbs_arch::regs::X86_CR4_PAE != 0); + assert!(sregs.cr0 & dbs_arch::regs::X86_CR0_PG != 0); + } + + fn create_guest_mem() -> GuestMemoryMmap { + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap() + } + + #[test] + fn 
test_setup_page_tables() { + let kvm = Kvm::new().unwrap(); + let vm = kvm.create_vm().unwrap(); + let vcpu = vm.create_vcpu(0).unwrap(); + let gm = create_guest_mem(); + let gdt_table: [u64; layout::BOOT_GDT_MAX] = [ + gdt_entry(0, 0, 0), // NULL + gdt_entry(0xa09b, 0, 0xfffff), // CODE + gdt_entry(0xc093, 0, 0xfffff), // DATA + gdt_entry(0x808b, 0, 0xfffff), // TSS + ]; + + let page_address = setup_identity_mapping(&gm).unwrap(); + dbs_arch::regs::setup_sregs( + &gm, + &vcpu, + page_address, + &gdt_table, + BOOT_GDT_OFFSET, + BOOT_IDT_OFFSET, + ) + .unwrap(); + let sregs: kvm_sregs = vcpu.get_sregs().unwrap(); + validate_page_tables(&gm, &sregs, Some(page_address)); + } + + #[test] + fn test_add_e820_entry() { + let e820_table = [(boot_e820_entry { + addr: 0x1, + size: 4, + type_: 1, + }); 128]; + + let expected_params = boot_params { + e820_table, + e820_entries: 1, + ..Default::default() + }; + + let mut params: boot_params = Default::default(); + add_e820_entry( + &mut params, + e820_table[0].addr, + e820_table[0].size, + e820_table[0].type_, + ) + .unwrap(); + assert_eq!( + format!("{:?}", params.e820_table[0]), + format!("{:?}", expected_params.e820_table[0]) + ); + assert_eq!(params.e820_entries, expected_params.e820_entries); + + // Exercise the scenario where the field storing the length of the e820 entry table is + // is bigger than the allocated memory. + params.e820_entries = params.e820_table.len() as u8 + 1; + assert!(add_e820_entry( + &mut params, + e820_table[0].addr, + e820_table[0].size, + e820_table[0].type_ + ) + .is_err()); + } +} diff --git a/src/dragonball/src/dbs_boot/src/x86_64/mpspec.rs b/src/dragonball/src/dbs_boot/src/x86_64/mpspec.rs new file mode 100644 index 000000000..98c9d91b7 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/x86_64/mpspec.rs @@ -0,0 +1,936 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. +// Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +// SPDX-License-Identifier: Apache-2.0 + +//! Structure definitions for SMP machines following the Intel Multiprocessing Specification 1.1 and 1.4. + +/* automatically generated by rust-bindgen */ + +#![allow(missing_docs)] +#![allow(non_camel_case_types)] +#![allow(non_upper_case_globals)] +#![allow(deref_nullptr)] + +pub const MPC_SIGNATURE: &[u8; 5usize] = b"PCMP\x00"; +pub const MP_PROCESSOR: ::std::os::raw::c_uint = 0; +pub const MP_BUS: ::std::os::raw::c_uint = 1; +pub const MP_IOAPIC: ::std::os::raw::c_uint = 2; +pub const MP_INTSRC: ::std::os::raw::c_uint = 3; +pub const MP_LINTSRC: ::std::os::raw::c_uint = 4; +pub const MP_TRANSLATION: ::std::os::raw::c_uint = 192; +pub const CPU_ENABLED: ::std::os::raw::c_uint = 1; +pub const CPU_BOOTPROCESSOR: ::std::os::raw::c_uint = 2; +pub const CPU_STEPPING_MASK: ::std::os::raw::c_uint = 15; +pub const CPU_MODEL_MASK: ::std::os::raw::c_uint = 240; +pub const CPU_FAMILY_MASK: ::std::os::raw::c_uint = 3840; +pub const BUSTYPE_EISA: &[u8; 5usize] = b"EISA\x00"; +pub const BUSTYPE_ISA: &[u8; 4usize] = b"ISA\x00"; +pub const BUSTYPE_INTERN: &[u8; 7usize] = b"INTERN\x00"; +pub const BUSTYPE_MCA: &[u8; 4usize] = b"MCA\x00"; +pub const BUSTYPE_VL: &[u8; 3usize] = b"VL\x00"; +pub const BUSTYPE_PCI: &[u8; 4usize] = b"PCI\x00"; +pub const BUSTYPE_PCMCIA: &[u8; 7usize] = b"PCMCIA\x00"; +pub const BUSTYPE_CBUS: &[u8; 5usize] = b"CBUS\x00"; +pub const BUSTYPE_CBUSII: &[u8; 7usize] = b"CBUSII\x00"; +pub const BUSTYPE_FUTURE: &[u8; 7usize] = b"FUTURE\x00"; +pub const BUSTYPE_MBI: &[u8; 4usize] = b"MBI\x00"; +pub const BUSTYPE_MBII: &[u8; 5usize] = b"MBII\x00"; +pub const BUSTYPE_MPI: &[u8; 4usize] = b"MPI\x00"; +pub const BUSTYPE_MPSA: &[u8; 5usize] = b"MPSA\x00"; +pub const BUSTYPE_NUBUS: &[u8; 6usize] = b"NUBUS\x00"; +pub const BUSTYPE_TC: &[u8; 3usize] = b"TC\x00"; +pub const BUSTYPE_VME: &[u8; 4usize] = b"VME\x00"; +pub 
const BUSTYPE_XPRESS: &[u8; 7usize] = b"XPRESS\x00"; +pub const MPC_APIC_USABLE: ::std::os::raw::c_uint = 1; +pub const MP_IRQDIR_DEFAULT: ::std::os::raw::c_uint = 0; +pub const MP_IRQDIR_HIGH: ::std::os::raw::c_uint = 1; +pub const MP_IRQDIR_LOW: ::std::os::raw::c_uint = 3; +pub const MP_APIC_ALL: ::std::os::raw::c_uint = 255; +pub const MPC_OEM_SIGNATURE: &[u8; 5usize] = b"_OEM\x00"; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpf_intel { + pub signature: [::std::os::raw::c_char; 4usize], + pub physptr: ::std::os::raw::c_uint, + pub length: ::std::os::raw::c_uchar, + pub specification: ::std::os::raw::c_uchar, + pub checksum: ::std::os::raw::c_uchar, + pub feature1: ::std::os::raw::c_uchar, + pub feature2: ::std::os::raw::c_uchar, + pub feature3: ::std::os::raw::c_uchar, + pub feature4: ::std::os::raw::c_uchar, + pub feature5: ::std::os::raw::c_uchar, +} + +#[test] +fn default_mpf_intel() { + let mpf_intel = mpf_intel::default(); + assert_eq!(mpf_intel.signature, [0i8, 0i8, 0i8, 0i8]); + assert_eq!(mpf_intel.physptr, 0u32); + assert_eq!(mpf_intel.length, 0u8); + assert_eq!(mpf_intel.specification, 0u8); + assert_eq!(mpf_intel.checksum, 0u8); + assert_eq!(mpf_intel.feature1, 0u8); + assert_eq!(mpf_intel.feature2, 0u8); + assert_eq!(mpf_intel.feature3, 0u8); + assert_eq!(mpf_intel.feature4, 0u8); + assert_eq!(mpf_intel.feature5, 0u8); +} + +#[test] +fn bindgen_test_layout_mpf_intel() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(mpf_intel)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(mpf_intel)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).signature as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(signature) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).physptr as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + 
stringify!(physptr) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).length as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(length) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).specification as *const _ as usize }, + 9usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(specification) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).checksum as *const _ as usize }, + 10usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(checksum) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).feature1 as *const _ as usize }, + 11usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(feature1) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).feature2 as *const _ as usize }, + 12usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(feature2) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).feature3 as *const _ as usize }, + 13usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(feature3) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).feature4 as *const _ as usize }, + 14usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(feature4) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).feature5 as *const _ as usize }, + 15usize, + concat!( + "Alignment of field: ", + stringify!(mpf_intel), + "::", + stringify!(feature5) + ) + ); +} + +impl Clone for mpf_intel { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_table { + pub signature: [::std::os::raw::c_char; 4usize], + pub length: ::std::os::raw::c_ushort, + pub spec: ::std::os::raw::c_char, + pub checksum: ::std::os::raw::c_char, + pub oem: [::std::os::raw::c_char; 8usize], + pub productid: [::std::os::raw::c_char; 
12usize], + pub oemptr: ::std::os::raw::c_uint, + pub oemsize: ::std::os::raw::c_ushort, + pub oemcount: ::std::os::raw::c_ushort, + pub lapic: ::std::os::raw::c_uint, + pub reserved: ::std::os::raw::c_uint, +} + +#[test] +fn default_mpc_table() { + let mpc_table = mpc_table::default(); + assert_eq!(mpc_table.signature, [0i8, 0i8, 0i8, 0i8]); + assert_eq!(mpc_table.length, 0u16); + assert_eq!(mpc_table.spec, 0i8); + assert_eq!(mpc_table.checksum, 0i8); + assert_eq!(mpc_table.oem, [0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8]); + assert_eq!( + mpc_table.productid, + [0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8] + ); + assert_eq!(mpc_table.oemptr, 0u32); + assert_eq!(mpc_table.oemsize, 0u16); + assert_eq!(mpc_table.oemcount, 0u16); + assert_eq!(mpc_table.lapic, 0u32); + assert_eq!(mpc_table.reserved, 0u32); +} +#[test] +fn bindgen_test_layout_mpc_table() { + assert_eq!( + ::std::mem::size_of::(), + 44usize, + concat!("Size of: ", stringify!(mpc_table)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(mpc_table)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).signature as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(signature) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).length as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(length) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).spec as *const _ as usize }, + 6usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(spec) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).checksum as *const _ as usize }, + 7usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(checksum) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).oem as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + 
stringify!(mpc_table), + "::", + stringify!(oem) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).productid as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(productid) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).oemptr as *const _ as usize }, + 28usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(oemptr) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).oemsize as *const _ as usize }, + 32usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(oemsize) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).oemcount as *const _ as usize }, + 34usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(oemcount) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).lapic as *const _ as usize }, + 36usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(lapic) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).reserved as *const _ as usize }, + 40usize, + concat!( + "Alignment of field: ", + stringify!(mpc_table), + "::", + stringify!(reserved) + ) + ); +} +impl Clone for mpc_table { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_cpu { + pub type_: ::std::os::raw::c_uchar, + pub apicid: ::std::os::raw::c_uchar, + pub apicver: ::std::os::raw::c_uchar, + pub cpuflag: ::std::os::raw::c_uchar, + pub cpufeature: ::std::os::raw::c_uint, + pub featureflag: ::std::os::raw::c_uint, + pub reserved: [::std::os::raw::c_uint; 2usize], +} +#[test] +fn default_mpc_cpu() { + let mpc_cpu = mpc_cpu::default(); + assert_eq!(mpc_cpu.type_, 0u8); + assert_eq!(mpc_cpu.apicid, 0u8); + assert_eq!(mpc_cpu.apicver, 0u8); + assert_eq!(mpc_cpu.cpuflag, 0u8); + assert_eq!(mpc_cpu.cpufeature, 0u32); + assert_eq!(mpc_cpu.featureflag, 0u32); + assert_eq!(mpc_cpu.reserved, [0u32, 0u32]); +} 
+#[test] +fn bindgen_test_layout_mpc_cpu() { + assert_eq!( + ::std::mem::size_of::(), + 20usize, + concat!("Size of: ", stringify!(mpc_cpu)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(mpc_cpu)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).type_ as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).apicid as *const _ as usize }, + 1usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(apicid) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).apicver as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(apicver) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).cpuflag as *const _ as usize }, + 3usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(cpuflag) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).cpufeature as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(cpufeature) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).featureflag as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(featureflag) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).reserved as *const _ as usize }, + 12usize, + concat!( + "Alignment of field: ", + stringify!(mpc_cpu), + "::", + stringify!(reserved) + ) + ); +} +impl Clone for mpc_cpu { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_bus { + pub type_: ::std::os::raw::c_uchar, + pub busid: ::std::os::raw::c_uchar, + pub bustype: [::std::os::raw::c_uchar; 6usize], +} +#[test] +fn default_mpc_bus() { + let mpc_bus = mpc_bus::default(); + assert_eq!(mpc_bus.type_, 0u8); + assert_eq!(mpc_bus.busid, 
0u8); + assert_eq!(mpc_bus.bustype, [0u8, 0u8, 0u8, 0u8, 0u8, 0u8]); +} +#[test] +fn bindgen_test_layout_mpc_bus() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(mpc_bus)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(mpc_bus)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).type_ as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_bus), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).busid as *const _ as usize }, + 1usize, + concat!( + "Alignment of field: ", + stringify!(mpc_bus), + "::", + stringify!(busid) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).bustype as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(mpc_bus), + "::", + stringify!(bustype) + ) + ); +} +impl Clone for mpc_bus { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_ioapic { + pub type_: ::std::os::raw::c_uchar, + pub apicid: ::std::os::raw::c_uchar, + pub apicver: ::std::os::raw::c_uchar, + pub flags: ::std::os::raw::c_uchar, + pub apicaddr: ::std::os::raw::c_uint, +} +#[test] +fn default_mpc_ioapic() { + let mpc_ioapic = mpc_ioapic::default(); + assert_eq!(mpc_ioapic.type_, 0u8); + assert_eq!(mpc_ioapic.apicid, 0u8); + assert_eq!(mpc_ioapic.apicver, 0u8); + assert_eq!(mpc_ioapic.flags, 0u8); + assert_eq!(mpc_ioapic.apicaddr, 0u32); +} +#[test] +fn bindgen_test_layout_mpc_ioapic() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(mpc_ioapic)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(mpc_ioapic)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).type_ as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_ioapic), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { 
&(*(std::ptr::null::())).apicid as *const _ as usize }, + 1usize, + concat!( + "Alignment of field: ", + stringify!(mpc_ioapic), + "::", + stringify!(apicid) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).apicver as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(mpc_ioapic), + "::", + stringify!(apicver) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).flags as *const _ as usize }, + 3usize, + concat!( + "Alignment of field: ", + stringify!(mpc_ioapic), + "::", + stringify!(flags) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).apicaddr as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpc_ioapic), + "::", + stringify!(apicaddr) + ) + ); +} +impl Clone for mpc_ioapic { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_intsrc { + pub type_: ::std::os::raw::c_uchar, + pub irqtype: ::std::os::raw::c_uchar, + pub irqflag: ::std::os::raw::c_ushort, + pub srcbus: ::std::os::raw::c_uchar, + pub srcbusirq: ::std::os::raw::c_uchar, + pub dstapic: ::std::os::raw::c_uchar, + pub dstirq: ::std::os::raw::c_uchar, +} +#[test] +fn default_mpc_intsrc() { + let mpc_intsrc = mpc_intsrc::default(); + assert_eq!(mpc_intsrc.type_, 0u8); + assert_eq!(mpc_intsrc.irqtype, 0u8); + assert_eq!(mpc_intsrc.irqflag, 0u16); + assert_eq!(mpc_intsrc.srcbus, 0u8); + assert_eq!(mpc_intsrc.srcbusirq, 0u8); + assert_eq!(mpc_intsrc.dstapic, 0u8); + assert_eq!(mpc_intsrc.dstirq, 0u8); +} +#[test] +fn bindgen_test_layout_mpc_intsrc() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(mpc_intsrc)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(mpc_intsrc)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).type_ as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe 
{ &(*(std::ptr::null::())).irqtype as *const _ as usize }, + 1usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(irqtype) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).irqflag as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(irqflag) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).srcbus as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(srcbus) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).srcbusirq as *const _ as usize }, + 5usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(srcbusirq) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).dstapic as *const _ as usize }, + 6usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(dstapic) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).dstirq as *const _ as usize }, + 7usize, + concat!( + "Alignment of field: ", + stringify!(mpc_intsrc), + "::", + stringify!(dstirq) + ) + ); +} +impl Clone for mpc_intsrc { + fn clone(&self) -> Self { + *self + } +} +pub const mp_irq_source_types_mp_INT: mp_irq_source_types = 0; +pub const mp_irq_source_types_mp_NMI: mp_irq_source_types = 1; +pub const mp_irq_source_types_mp_SMI: mp_irq_source_types = 2; +pub const mp_irq_source_types_mp_ExtINT: mp_irq_source_types = 3; +pub type mp_irq_source_types = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_lintsrc { + pub type_: ::std::os::raw::c_uchar, + pub irqtype: ::std::os::raw::c_uchar, + pub irqflag: ::std::os::raw::c_ushort, + pub srcbusid: ::std::os::raw::c_uchar, + pub srcbusirq: ::std::os::raw::c_uchar, + pub destapic: ::std::os::raw::c_uchar, + pub destapiclint: ::std::os::raw::c_uchar, +} +#[test] +fn default_mpc_lintsrc() { + let mpc_lintsrc = mpc_lintsrc::default(); + 
assert_eq!(mpc_lintsrc.type_, 0u8); + assert_eq!(mpc_lintsrc.irqtype, 0u8); + assert_eq!(mpc_lintsrc.irqflag, 0u16); + assert_eq!(mpc_lintsrc.srcbusid, 0u8); + assert_eq!(mpc_lintsrc.srcbusirq, 0u8); + assert_eq!(mpc_lintsrc.destapic, 0u8); + assert_eq!(mpc_lintsrc.destapiclint, 0u8); +} +#[test] +fn bindgen_test_layout_mpc_lintsrc() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(mpc_lintsrc)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(mpc_lintsrc)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).type_ as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).irqtype as *const _ as usize }, + 1usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(irqtype) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).irqflag as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(irqflag) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).srcbusid as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(srcbusid) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).srcbusirq as *const _ as usize }, + 5usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(srcbusirq) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).destapic as *const _ as usize }, + 6usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(destapic) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).destapiclint as *const _ as usize }, + 7usize, + concat!( + "Alignment of field: ", + stringify!(mpc_lintsrc), + "::", + stringify!(destapiclint) + ) + ); +} +impl Clone for mpc_lintsrc { + fn clone(&self) -> Self { + *self 
+ } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct mpc_oemtable { + pub signature: [::std::os::raw::c_char; 4usize], + pub length: ::std::os::raw::c_ushort, + pub rev: ::std::os::raw::c_char, + pub checksum: ::std::os::raw::c_char, + pub mpc: [::std::os::raw::c_char; 8usize], +} +#[test] +fn default_mpc_oemtable() { + let mpc_oemtable = mpc_oemtable::default(); + assert_eq!(mpc_oemtable.signature, [0i8, 0i8, 0i8, 0i8]); + assert_eq!(mpc_oemtable.length, 0u16); + assert_eq!(mpc_oemtable.rev, 0i8); + assert_eq!(mpc_oemtable.checksum, 0i8); + assert_eq!(mpc_oemtable.mpc, [0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8, 0i8]); +} +#[test] +fn bindgen_test_layout_mpc_oemtable() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(mpc_oemtable)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(mpc_oemtable)) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).signature as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(mpc_oemtable), + "::", + stringify!(signature) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).length as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(mpc_oemtable), + "::", + stringify!(length) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).rev as *const _ as usize }, + 6usize, + concat!( + "Alignment of field: ", + stringify!(mpc_oemtable), + "::", + stringify!(rev) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).checksum as *const _ as usize }, + 7usize, + concat!( + "Alignment of field: ", + stringify!(mpc_oemtable), + "::", + stringify!(checksum) + ) + ); + assert_eq!( + unsafe { &(*(std::ptr::null::())).mpc as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(mpc_oemtable), + "::", + stringify!(mpc) + ) + ); +} +impl Clone for mpc_oemtable { + fn clone(&self) -> Self { + *self + } +} + +pub const mp_bustype_MP_BUS_ISA: mp_bustype = 1; 
+pub const mp_bustype_MP_BUS_EISA: mp_bustype = 2; +pub const mp_bustype_MP_BUS_PCI: mp_bustype = 3; +pub type mp_bustype = ::std::os::raw::c_uint; diff --git a/src/dragonball/src/dbs_boot/src/x86_64/mptable.rs b/src/dragonball/src/dbs_boot/src/x86_64/mptable.rs new file mode 100644 index 000000000..008e972a5 --- /dev/null +++ b/src/dragonball/src/dbs_boot/src/x86_64/mptable.rs @@ -0,0 +1,523 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! MP Table configurations used for defining VM boot status. + +use libc::c_char; +use std::io; +use std::mem; +use std::result; +use std::slice; + +use super::mpspec; +use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemory}; + +// This is a workaround to the Rust enforcement specifying that any implementation of a foreign +// trait (in this case `ByteValued`) where: +// * the type that is implementing the trait is foreign or +// * all of the parameters being passed to the trait (if there are any) are also foreign +// is prohibited. 
+#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpcBusWrapper(mpspec::mpc_bus); +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpcCpuWrapper(mpspec::mpc_cpu); +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpcIntsrcWrapper(mpspec::mpc_intsrc); +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpcIoapicWrapper(mpspec::mpc_ioapic); +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpcTableWrapper(mpspec::mpc_table); +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpcLintsrcWrapper(mpspec::mpc_lintsrc); +#[repr(transparent)] +#[derive(Copy, Clone, Default)] +struct MpfIntelWrapper(mpspec::mpf_intel); + +// These `mpspec` wrapper types are only data, reading them from data is a safe initialization. +unsafe impl ByteValued for MpcBusWrapper {} +unsafe impl ByteValued for MpcCpuWrapper {} +unsafe impl ByteValued for MpcIntsrcWrapper {} +unsafe impl ByteValued for MpcIoapicWrapper {} +unsafe impl ByteValued for MpcTableWrapper {} +unsafe impl ByteValued for MpcLintsrcWrapper {} +unsafe impl ByteValued for MpfIntelWrapper {} + +// MPTABLE, describing VCPUS. +const MPTABLE_START: u64 = 0x9fc00; + +#[derive(Debug, Eq, PartialEq, thiserror::Error)] +/// MP Table related errors +pub enum Error { + /// There was too little guest memory to store the entire MP table. + #[error("too little guest memory to store the entire MP table")] + NotEnoughMemory, + + /// The MP table has too little address space to be stored. + #[error("the MP table has no enough space")] + AddressOverflow, + + /// Failure while zeroing out the memory for the MP table. 
+ #[error("failure while zeroing out the memory for the MP table")] + Clear, + + /// Number of CPUs exceeds the maximum supported CPUs + #[error("number of CPUs exceeds the maximum supported CPUs")] + TooManyCpus, + + /// Number of CPUs exceeds the maximum supported CPUs + #[error("number of boot CPUs exceeds the maximum number of CPUs")] + TooManyBootCpus, + + /// Failure to write the MP floating pointer. + #[error("failure to write the MP floating pointer")] + WriteMpfIntel, + + /// Failure to write MP CPU entry. + #[error("failure to write MP CPU entry")] + WriteMpcCpu, + + /// Failure to write MP ioapic entry. + #[error("failure to write MP ioapic entry")] + WriteMpcIoapic, + + /// Failure to write MP bus entry. + #[error("failure to write MP bus entry")] + WriteMpcBus, + + /// Failure to write MP interrupt source entry. + #[error("failure to write MP interrupt source entry")] + WriteMpcIntsrc, + + /// Failure to write MP local interrupt source entry. + #[error("failure to write MP local interrupt source entry")] + WriteMpcLintsrc, + + /// Failure to write MP OEM table entry. + #[error("failure to write MP OEM table entry")] + WriteMpcOemtable, + + /// Failure to write MP table header. + #[error("failure to write MP table header")] + WriteMpcTable, +} + +/// Generic type for MP Table Results. +pub type Result = result::Result; + +/// With APIC/xAPIC, there are only 255 APIC IDs available. And IOAPIC occupies +/// one APIC ID, so only 254 CPUs at maximum may be supported. Actually it's +/// a large number for Dragonball usecases. +pub const MAX_SUPPORTED_CPUS: u32 = 254; + +// Convenience macro for making arrays of diverse character types. +macro_rules! char_array { + ($t:ty; $( $c:expr ),*) => ( [ $( $c as $t ),* ] ) +} + +// Most of these variables are sourced from the Intel MP Spec 1.4. 
+const SMP_MAGIC_IDENT: [c_char; 4] = char_array!(c_char; '_', 'M', 'P', '_'); +const MPC_SIGNATURE: [c_char; 4] = char_array!(c_char; 'P', 'C', 'M', 'P'); +const MPC_SPEC: i8 = 4; +const MPC_OEM: [c_char; 8] = char_array!(c_char; 'A', 'L', 'I', 'C', 'L', 'O', 'U', 'D'); +const MPC_PRODUCT_ID: [c_char; 12] = + char_array!(c_char; 'D', 'R', 'A', 'G', 'O', 'N', 'B', 'A', 'L', 'L', '1', '0'); +const BUS_TYPE_ISA: [u8; 6] = char_array!(u8; 'I', 'S', 'A', ' ', ' ', ' '); +const IO_APIC_DEFAULT_PHYS_BASE: u32 = 0xfec0_0000; // source: linux/arch/x86/include/asm/apicdef.h +const APIC_DEFAULT_PHYS_BASE: u32 = 0xfee0_0000; // source: linux/arch/x86/include/asm/apicdef.h + +/// APIC version in mptable +pub const APIC_VERSION: u8 = 0x14; + +const CPU_STEPPING: u32 = 0x600; +const CPU_FEATURE_APIC: u32 = 0x200; +const CPU_FEATURE_FPU: u32 = 0x001; + +const BUS_ID_ISA: u8 = 0; + +fn compute_checksum(v: &T) -> u8 { + // Safe because we are only reading the bytes within the size of the `T` reference `v`. + let v_slice = unsafe { slice::from_raw_parts(v as *const T as *const u8, mem::size_of::()) }; + let mut checksum: u8 = 0; + for i in v_slice.iter() { + checksum = checksum.wrapping_add(*i); + } + checksum +} + +fn mpf_intel_compute_checksum(v: &mpspec::mpf_intel) -> u8 { + let checksum = compute_checksum(v).wrapping_sub(v.checksum); + (!checksum).wrapping_add(1) +} + +fn compute_mp_size(num_cpus: u8) -> usize { + mem::size_of::() + + mem::size_of::() + + mem::size_of::() * (num_cpus as usize) + + mem::size_of::() + + mem::size_of::() * 2 + + mem::size_of::() * 16 + + mem::size_of::() * 2 +} + +/// Performs setup of the MP table for the given `num_cpus` +pub fn setup_mptable(mem: &M, boot_cpus: u8, max_cpus: u8) -> Result<()> { + if boot_cpus > max_cpus { + return Err(Error::TooManyBootCpus); + } + if u32::from(max_cpus) > MAX_SUPPORTED_CPUS { + return Err(Error::TooManyCpus); + } + + // Used to keep track of the next base pointer into the MP table. 
+ let mut base_mp = GuestAddress(MPTABLE_START); + + let mp_size = compute_mp_size(max_cpus); + + let mut checksum: u8 = 0; + let ioapicid: u8 = max_cpus + 1; + + // The checked_add here ensures the all of the following base_mp.unchecked_add's will be without + // overflow. + if let Some(end_mp) = base_mp.checked_add((mp_size - 1) as u64) { + if !mem.address_in_range(end_mp) { + return Err(Error::NotEnoughMemory); + } + } else { + return Err(Error::AddressOverflow); + } + + mem.read_from(base_mp, &mut io::repeat(0), mp_size) + .map_err(|_| Error::Clear)?; + + { + let mut mpf_intel = MpfIntelWrapper(mpspec::mpf_intel::default()); + let size = mem::size_of::() as u64; + mpf_intel.0.signature = SMP_MAGIC_IDENT; + mpf_intel.0.length = 1; + mpf_intel.0.specification = 4; + mpf_intel.0.physptr = (base_mp.raw_value() + size) as u32; + mpf_intel.0.checksum = mpf_intel_compute_checksum(&mpf_intel.0); + mem.write_obj(mpf_intel, base_mp) + .map_err(|_| Error::WriteMpfIntel)?; + base_mp = base_mp.unchecked_add(size); + } + + // We set the location of the mpc_table here but we can't fill it out until we have the length + // of the entire table later. 
+ let table_base = base_mp; + base_mp = base_mp.unchecked_add(mem::size_of::() as u64); + + { + let size = mem::size_of::() as u64; + for cpu_id in 0..max_cpus { + let mut mpc_cpu = MpcCpuWrapper(mpspec::mpc_cpu::default()); + mpc_cpu.0.type_ = mpspec::MP_PROCESSOR as u8; + mpc_cpu.0.apicid = cpu_id; + mpc_cpu.0.apicver = APIC_VERSION; + if cpu_id < boot_cpus { + mpc_cpu.0.cpuflag |= mpspec::CPU_ENABLED as u8; + } + if cpu_id == 0 { + mpc_cpu.0.cpuflag |= mpspec::CPU_BOOTPROCESSOR as u8; + } + mpc_cpu.0.cpufeature = CPU_STEPPING; + mpc_cpu.0.featureflag = CPU_FEATURE_APIC | CPU_FEATURE_FPU; + mem.write_obj(mpc_cpu, base_mp) + .map_err(|_| Error::WriteMpcCpu)?; + base_mp = base_mp.unchecked_add(size); + checksum = checksum.wrapping_add(compute_checksum(&mpc_cpu.0)); + } + } + + { + let size = mem::size_of::() as u64; + let mut mpc_bus = MpcBusWrapper(mpspec::mpc_bus::default()); + mpc_bus.0.type_ = mpspec::MP_BUS as u8; + mpc_bus.0.busid = BUS_ID_ISA; + mpc_bus.0.bustype = BUS_TYPE_ISA; + mem.write_obj(mpc_bus, base_mp) + .map_err(|_| Error::WriteMpcBus)?; + base_mp = base_mp.unchecked_add(size); + checksum = checksum.wrapping_add(compute_checksum(&mpc_bus.0)); + } + + { + let size = mem::size_of::() as u64; + let mut mpc_ioapic = MpcIoapicWrapper(mpspec::mpc_ioapic::default()); + mpc_ioapic.0.type_ = mpspec::MP_IOAPIC as u8; + mpc_ioapic.0.apicid = ioapicid; + mpc_ioapic.0.apicver = APIC_VERSION; + mpc_ioapic.0.flags = mpspec::MPC_APIC_USABLE as u8; + mpc_ioapic.0.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + mem.write_obj(mpc_ioapic, base_mp) + .map_err(|_| Error::WriteMpcIoapic)?; + base_mp = base_mp.unchecked_add(size); + checksum = checksum.wrapping_add(compute_checksum(&mpc_ioapic.0)); + } + // Per kvm_setup_default_irq_routing() in kernel + for i in 0..16 { + let size = mem::size_of::() as u64; + let mut mpc_intsrc = MpcIntsrcWrapper(mpspec::mpc_intsrc::default()); + mpc_intsrc.0.type_ = mpspec::MP_INTSRC as u8; + mpc_intsrc.0.irqtype = 
mpspec::mp_irq_source_types_mp_INT as u8; + mpc_intsrc.0.irqflag = mpspec::MP_IRQDIR_DEFAULT as u16; + mpc_intsrc.0.srcbus = BUS_ID_ISA; + mpc_intsrc.0.srcbusirq = i; + mpc_intsrc.0.dstapic = ioapicid; + mpc_intsrc.0.dstirq = i; + mem.write_obj(mpc_intsrc, base_mp) + .map_err(|_| Error::WriteMpcIntsrc)?; + base_mp = base_mp.unchecked_add(size); + checksum = checksum.wrapping_add(compute_checksum(&mpc_intsrc.0)); + } + { + let size = mem::size_of::() as u64; + let mut mpc_lintsrc = MpcLintsrcWrapper(mpspec::mpc_lintsrc::default()); + mpc_lintsrc.0.type_ = mpspec::MP_LINTSRC as u8; + mpc_lintsrc.0.irqtype = mpspec::mp_irq_source_types_mp_ExtINT as u8; + mpc_lintsrc.0.irqflag = mpspec::MP_IRQDIR_DEFAULT as u16; + mpc_lintsrc.0.srcbusid = 0; + mpc_lintsrc.0.srcbusirq = 0; + mpc_lintsrc.0.destapic = 0; + mpc_lintsrc.0.destapiclint = 0; + mem.write_obj(mpc_lintsrc, base_mp) + .map_err(|_| Error::WriteMpcLintsrc)?; + base_mp = base_mp.unchecked_add(size); + checksum = checksum.wrapping_add(compute_checksum(&mpc_lintsrc.0)); + } + { + let size = mem::size_of::() as u64; + let mut mpc_lintsrc = MpcLintsrcWrapper(mpspec::mpc_lintsrc::default()); + mpc_lintsrc.0.type_ = mpspec::MP_LINTSRC as u8; + mpc_lintsrc.0.irqtype = mpspec::mp_irq_source_types_mp_NMI as u8; + mpc_lintsrc.0.irqflag = mpspec::MP_IRQDIR_DEFAULT as u16; + mpc_lintsrc.0.srcbusid = 0; + mpc_lintsrc.0.srcbusirq = 0; + mpc_lintsrc.0.destapic = 0xFF; /* to all local APICs */ + mpc_lintsrc.0.destapiclint = 1; + mem.write_obj(mpc_lintsrc, base_mp) + .map_err(|_| Error::WriteMpcLintsrc)?; + base_mp = base_mp.unchecked_add(size); + checksum = checksum.wrapping_add(compute_checksum(&mpc_lintsrc.0)); + } + + // At this point we know the size of the mp_table. 
+ let table_end = base_mp; + + let mpc_table_size = mem::size_of::() as u64; + base_mp = base_mp.unchecked_add(mpc_table_size); + let oem_count = 0; + let oem_size = 0; + let oem_ptr = base_mp; + + { + let mut mpc_table = MpcTableWrapper(mpspec::mpc_table::default()); + mpc_table.0.signature = MPC_SIGNATURE; + // it's safe to use unchecked_offset_from because + // table_end > table_base + mpc_table.0.length = table_end.unchecked_offset_from(table_base) as u16; + mpc_table.0.spec = MPC_SPEC; + mpc_table.0.oem = MPC_OEM; + mpc_table.0.oemcount = oem_count; + mpc_table.0.oemptr = oem_ptr.0 as u32; + mpc_table.0.oemsize = oem_size as u16; + mpc_table.0.productid = MPC_PRODUCT_ID; + mpc_table.0.lapic = APIC_DEFAULT_PHYS_BASE; + checksum = checksum.wrapping_add(compute_checksum(&mpc_table.0)); + mpc_table.0.checksum = (!checksum).wrapping_add(1) as i8; + mem.write_obj(mpc_table, table_base) + .map_err(|_| Error::WriteMpcTable)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use vm_memory::{Bytes, GuestMemoryMmap}; + + fn table_entry_size(type_: u8) -> usize { + match u32::from(type_) { + mpspec::MP_PROCESSOR => mem::size_of::(), + mpspec::MP_BUS => mem::size_of::(), + mpspec::MP_IOAPIC => mem::size_of::(), + mpspec::MP_INTSRC => mem::size_of::(), + mpspec::MP_LINTSRC => mem::size_of::(), + _ => panic!("unrecognized mpc table entry type: {}", type_), + } + } + + #[test] + fn bounds_check() { + let num_cpus = 4; + let mem = GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(num_cpus), + )]) + .unwrap(); + + setup_mptable(&mem, num_cpus, num_cpus).unwrap(); + } + + #[test] + fn bounds_check_fails() { + let num_cpus = 4; + let mem = GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(num_cpus) - 1, + )]) + .unwrap(); + + assert!(setup_mptable(&mem, num_cpus, num_cpus).is_err()); + } + + #[test] + fn mpf_intel_checksum() { + let num_cpus = 1; + let mem = 
GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(num_cpus), + )]) + .unwrap(); + + setup_mptable(&mem, num_cpus, num_cpus).unwrap(); + + let mpf_intel: MpfIntelWrapper = mem.read_obj(GuestAddress(MPTABLE_START)).unwrap(); + + assert_eq!( + mpf_intel_compute_checksum(&mpf_intel.0), + mpf_intel.0.checksum + ); + } + + #[test] + fn mpc_table_checksum() { + let num_cpus = 4; + let mem = GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(num_cpus), + )]) + .unwrap(); + + setup_mptable(&mem, num_cpus, num_cpus).unwrap(); + + let mpf_intel: MpfIntelWrapper = mem.read_obj(GuestAddress(MPTABLE_START)).unwrap(); + let mpc_offset = GuestAddress(u64::from(mpf_intel.0.physptr)); + let mpc_table: MpcTableWrapper = mem.read_obj(mpc_offset).unwrap(); + + struct Sum(u8); + impl io::Write for Sum { + fn write(&mut self, buf: &[u8]) -> io::Result { + for v in buf.iter() { + self.0 = self.0.wrapping_add(*v); + } + Ok(buf.len()) + } + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } + } + + let mut sum = Sum(0); + mem.write_to(mpc_offset, &mut sum, mpc_table.0.length as usize) + .unwrap(); + assert_eq!(sum.0, 0); + } + + #[test] + fn max_cpu_entry_count() { + let mem = GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(MAX_SUPPORTED_CPUS as u8), + )]) + .unwrap(); + + for i in 0..MAX_SUPPORTED_CPUS as u8 { + setup_mptable(&mem, i, i).unwrap(); + + let mpf_intel: MpfIntelWrapper = mem.read_obj(GuestAddress(MPTABLE_START)).unwrap(); + let mpc_offset = GuestAddress(u64::from(mpf_intel.0.physptr)); + let mpc_table: MpcTableWrapper = mem.read_obj(mpc_offset).unwrap(); + let mpc_end = mpc_offset + .checked_add(u64::from(mpc_table.0.length)) + .unwrap(); + + let mut entry_offset = mpc_offset + .checked_add(mem::size_of::() as u64) + .unwrap(); + let mut max_cpu_count = 0; + while entry_offset < mpc_end { + let entry_type: u8 = mem.read_obj(entry_offset).unwrap(); + entry_offset 
= entry_offset + .checked_add(table_entry_size(entry_type) as u64) + .unwrap(); + assert!(entry_offset <= mpc_end); + if u32::from(entry_type) == mpspec::MP_PROCESSOR { + max_cpu_count += 1; + } + } + assert_eq!(max_cpu_count, i); + } + } + + #[test] + fn boot_cpu_entry_count() { + let mem = GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(MAX_SUPPORTED_CPUS as u8), + )]) + .unwrap(); + + for i in 0..MAX_SUPPORTED_CPUS as u8 { + setup_mptable(&mem, i, MAX_SUPPORTED_CPUS as u8).unwrap(); + + let mpf_intel: MpfIntelWrapper = mem.read_obj(GuestAddress(MPTABLE_START)).unwrap(); + let mpc_offset = GuestAddress(u64::from(mpf_intel.0.physptr)); + let mpc_table: MpcTableWrapper = mem.read_obj(mpc_offset).unwrap(); + let mpc_end = mpc_offset + .checked_add(u64::from(mpc_table.0.length)) + .unwrap(); + + let mut entry_offset = mpc_offset + .checked_add(mem::size_of::() as u64) + .unwrap(); + let mut boot_cpu_count = 0; + for _ in 0..MAX_SUPPORTED_CPUS { + let mpc_cpu: MpcCpuWrapper = mem.read_obj(entry_offset).unwrap(); + if mpc_cpu.0.cpuflag & mpspec::CPU_ENABLED as u8 != 0 { + boot_cpu_count += 1; + } + entry_offset = entry_offset + .checked_add(table_entry_size(mpc_cpu.0.type_) as u64) + .unwrap(); + assert!(entry_offset <= mpc_end); + } + assert_eq!(boot_cpu_count, i); + } + } + + #[test] + fn cpu_entry_count_max() { + let cpus = MAX_SUPPORTED_CPUS + 1; + let mem = GuestMemoryMmap::<()>::from_ranges(&[( + GuestAddress(MPTABLE_START), + compute_mp_size(cpus as u8), + )]) + .unwrap(); + + let result = setup_mptable(&mem, cpus as u8, cpus as u8).unwrap_err(); + assert_eq!(result, Error::TooManyCpus); + } +} diff --git a/src/dragonball/src/dbs_device/Cargo.toml b/src/dragonball/src/dbs_device/Cargo.toml new file mode 100644 index 000000000..594698ce2 --- /dev/null +++ b/src/dragonball/src/dbs_device/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "dbs-device" +version = "0.2.0" +authors = ["Alibaba Dragonball Team"] +description = 
"Device model for Dragonball Sandbox" +license = "Apache-2.0" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-device" +keywords = ["dragonball", "secure-sandbox", "device", "resource"] +readme = "README.md" + +[dependencies] +thiserror = "1" diff --git a/src/dragonball/src/dbs_device/LICENSE b/src/dragonball/src/dbs_device/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_device/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_device/README.md b/src/dragonball/src/dbs_device/README.md new file mode 100644 index 000000000..5f13c00cd --- /dev/null +++ b/src/dragonball/src/dbs_device/README.md @@ -0,0 +1,141 @@ +# dbs-device + +The `dbs-device` crate, as a counterpart of [`vm-device`], defines device model for the Dragonball Secure Sandbox. +The `dbs-device` crate shares some common concepts and data structures with [`vm-device`], but it also diverges from +[`vm-device`] due to different VMM designs. + +The dbs-device crate provides: + +- [`DeviceIo`] and [`DeviceIoMut`]: trait for device to handle trapped MMIO/PIO access requests. +- [`IoManager`]: IO manager to handle trapped MMIO/PIO access requests. +- [`IoManagerContext`]: trait for IO manager context object to support device hotplug at runtime. +- [`ResourceConstraint`], [Resource] and [`DeviceResources`]: resource allocation requirements and constraints. + +## Design + +The dbs-device crate is designed to support the virtual machine's device model. + +The core concepts of device model are [Port I/O](https://wiki.osdev.org/I/O_Ports) and +[Memory-mapped I/O](https://en.wikipedia.org/wiki/Memory-mapped_I/O), +which are two main methods of performing I/O between CPU and devices. 
+ +The device model provided by the dbs-device crate works as below: +- The VMM creates a global resource manager, device manager and IO manager. +- The device manager creates virtual devices configured by the VMM + - create device object + - query device allocation requirements and constraints, the device returns an array of [`ResourceConstraint`]. + - allocate resources for device from the resource manager, resource manager returns a [`DeviceResources`] object. + - assign the allocated resources to the device. + - The device manager register devices to the IO manager. + - query trapped address ranges by calling [`DeviceIo::get_trapped_io_resources()`] + - register the device to the IO manager with trapped address range + - The guest access those trapped MMIO/PIO address ranges, and triggers VM IO Exit events to trap into the VMM. + - The VMM parses the VM exit events and dispatch those events to the IO manager. + - The IO manager looks up device by searching trapped address ranges, and call the device's [`DeviceIO`] + handler to process those trapped MMIO/PIO access requests. + +## Usage + +First, a VM needs to create an [`IoManager`] to help it dispatch I/O events to devices. +And an [`IoManager`] has two types of bus, the PIO bus and the MMIO bus, to handle different types of IO. 
+ +Then, when creating a device, it needs to implement the [`DeviceIo`] or [`DeviceIoMut`] trait to receive read or write +events send by driver in guest OS: +- `read()` and `write()` methods is used to deal with MMIO events +- `pio_read()` and `pio_write()` methods is used to deal with PIO events +- `get_assigned_resources()` method is used to get all resources assigned to the device +- `get_trapped_io_resources()` method is used to get only MMIO/PIO resources assigned to the device + +The difference of [`DeviceIo`] and [`DeviceIoMut`] is the reference type of `self` passed to method: +- [`DeviceIo`] trait would pass a immutable reference `&self` to method, so the implementation of device would provide + interior mutability and thread-safe protection itself +- [`DeviceIoMut`] trait would pass a mutable reference `&mut self` to method, and it can give mutability to device + which is wrapped by `Mutex` directly to simplify the difficulty of achieving interior mutability. + +Additionally, the [`DeviceIo`] trait has an auto implement for `Mutex` + +Last, the device needs to be added to [`IoManager`] by using `register_device_io()`, and the function would add device +to PIO bus and/or MMIO bus by the resources it have. If a device has not only MMIO resource but PIO resource, +it would be added to both pio bus and mmio bus. So the device would wrapped by `Arc`. + +From now on, the [`IoManager`] will dispatch I/O requests for the registered address ranges to the device. 
+ +## Examples + + +```rust +use std::sync::Arc; + +use dbs_device::device_manager::IoManager; +use dbs_device::resources::{DeviceResources, Resource}; +use dbs_device::{DeviceIo, IoAddress, PioAddress}; + +struct DummyDevice {} + +impl DeviceIo for DummyDevice { + fn read(&self, base: IoAddress, offset: IoAddress, data: &mut [u8]) { + println!( + "mmio read, base: 0x{:x}, offset: 0x{:x}", + base.raw_value(), + offset.raw_value() + ); + } + + fn write(&self, base: IoAddress, offset: IoAddress, data: &[u8]) { + println!( + "mmio write, base: 0x{:x}, offset: 0x{:x}", + base.raw_value(), + offset.raw_value() + ); + } + + fn pio_read(&self, base: PioAddress, offset: PioAddress, data: &mut [u8]) { + println!( + "pio read, base: 0x{:x}, offset: 0x{:x}", + base.raw_value(), + offset.raw_value() + ); + } + + fn pio_write(&self, base: PioAddress, offset: PioAddress, data: &[u8]) { + println!( + "pio write, base: 0x{:x}, offset: 0x{:x}", + base.raw_value(), + offset.raw_value() + ); + } +} + +// Allocate resources for device +let mut resources = DeviceResources::new(); +resources.append(Resource::MmioAddressRange { + base: 0, + size: 4096, +}); +resources.append(Resource::PioAddressRange { base: 0, size: 32 }); + +// Register device to `IoManager` with resources +let device = Arc::new(DummyDevice {}); +let mut manager = IoManager::new(); +manager.register_device_io(device, &resources).unwrap(); + +// Dispatch I/O event from `IoManager` to device +manager.mmio_write(0, &vec![0, 1]).unwrap(); + +let mut buffer = vec![0; 4]; +manager.pio_read(0, &mut buffer); +``` + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 
+ +[DeviceIo::get_trapped_io_resources()]: https://docs.rs/dbs-device/0.1.0/dbs_device/trait.DeviceIo.html#method.get_trapped_io_resources +[DeviceIo]: src/lib.rs +[DeviceIoMut]: src/lib.rs +[IoManager]: src/device_manager.rs +[IoManagerContext]: src/device_manager.rs +[ResourceConstraint]: src/resources.rs +[Resource]: src/resources.rs +[DeviceResources]: src/resources.rs +[vm-device]: https://github.com/rust-vmm/vm-device diff --git a/src/dragonball/src/dbs_device/src/device_manager.rs b/src/dragonball/src/dbs_device/src/device_manager.rs new file mode 100644 index 000000000..63aef81e9 --- /dev/null +++ b/src/dragonball/src/dbs_device/src/device_manager.rs @@ -0,0 +1,695 @@ +// Copyright 2020-2022 Alibaba Cloud. All Rights Reserved. +// Copyright © 2019 Intel Corporation. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! IO Device Manager to handle trapped MMIO/PIO access requests. +//! +//! The [IoManager](self::IoManager) is responsible for managing all trapped MMIO/PIO accesses for +//! virtual devices. It cooperates with the Secure Sandbox/VMM and device drivers to handle trapped +//! accesses. The flow is as below: +//! - device drivers allocate resources from the VMM/resource manager, including trapped MMIO/PIO +//! address ranges. +//! - the device manager registers devices to the [IoManager](self::IoManager) with trapped MMIO/PIO +//! address ranges. +//! - VM IO Exit events get triggered when the guest accesses those trapped address ranges. +//! - the vmm handle those VM IO Exit events, and dispatch them to the [IoManager]. +//! - the [IoManager] invokes registered callbacks/device drivers to handle those accesses, if there +//! is a device registered for the address. +//! +//! # Examples +//! +//! Creating a dummy deivce which implement DeviceIo trait, and register it to [IoManager] with +//! trapped MMIO/PIO address ranges: +//! +//! ``` +//! use std::sync::Arc; +//! use std::any::Any; +//! +//! 
use dbs_device::device_manager::IoManager; +//! use dbs_device::resources::{DeviceResources, Resource}; +//! use dbs_device::{DeviceIo, IoAddress, PioAddress}; +//! +//! struct DummyDevice {} +//! +//! impl DeviceIo for DummyDevice { +//! fn read(&self, base: IoAddress, offset: IoAddress, data: &mut [u8]) { +//! println!( +//! "mmio read, base: 0x{:x}, offset: 0x{:x}", +//! base.raw_value(), +//! offset.raw_value() +//! ); +//! } +//! +//! fn write(&self, base: IoAddress, offset: IoAddress, data: &[u8]) { +//! println!( +//! "mmio write, base: 0x{:x}, offset: 0x{:x}", +//! base.raw_value(), +//! offset.raw_value() +//! ); +//! } +//! +//! fn pio_read(&self, base: PioAddress, offset: PioAddress, data: &mut [u8]) { +//! println!( +//! "pio read, base: 0x{:x}, offset: 0x{:x}", +//! base.raw_value(), +//! offset.raw_value() +//! ); +//! } +//! +//! fn pio_write(&self, base: PioAddress, offset: PioAddress, data: &[u8]) { +//! println!( +//! "pio write, base: 0x{:x}, offset: 0x{:x}", +//! base.raw_value(), +//! offset.raw_value() +//! ); +//! } +//! +//! fn as_any(&self) -> &dyn Any { +//! self +//! } +//! } +//! +//! // Allocate resources for device +//! let mut resources = DeviceResources::new(); +//! resources.append(Resource::MmioAddressRange { +//! base: 0, +//! size: 4096, +//! }); +//! resources.append(Resource::PioAddressRange { base: 0, size: 32 }); +//! +//! // Register device to `IoManager` with resources +//! let device = Arc::new(DummyDevice {}); +//! let mut manager = IoManager::new(); +//! manager.register_device_io(device, &resources).unwrap(); +//! +//! // Dispatch I/O event from `IoManager` to device +//! manager.mmio_write(0, &vec![0, 1]).unwrap(); +//! { +//! let mut buffer = vec![0; 4]; +//! manager.pio_read(0, &mut buffer); +//! } +//! 
``` + +use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; +use std::collections::btree_map::BTreeMap; +use std::ops::Deref; +use std::result; +use std::sync::Arc; + +use thiserror::Error; + +use crate::resources::Resource; +use crate::{DeviceIo, IoAddress, IoSize, PioAddress}; + +/// Error types for `IoManager` related operations. +#[derive(Error, Debug)] +pub enum Error { + /// The inserting device overlaps with a current device. + #[error("device address conflicts with existing devices")] + DeviceOverlap, + /// The device doesn't exist. + #[error("no such device")] + NoDevice, +} + +/// A specialized version of [std::result::Result] for [IoManager] realted operations. +pub type Result = result::Result; + +/// Structure representing an IO address range. +#[derive(Debug, Copy, Clone, Eq)] +pub struct IoRange { + base: IoAddress, + size: IoSize, +} + +impl IoRange { + fn new_pio_range(base: u16, size: u16) -> Self { + IoRange { + base: IoAddress(base as u64), + size: IoSize(size as u64), + } + } + + fn new_mmio_range(base: u64, size: u64) -> Self { + IoRange { + base: IoAddress(base), + size: IoSize(size), + } + } +} + +impl PartialEq for IoRange { + fn eq(&self, other: &IoRange) -> bool { + self.base == other.base + } +} + +impl Ord for IoRange { + fn cmp(&self, other: &IoRange) -> Ordering { + self.base.cmp(&other.base) + } +} + +impl PartialOrd for IoRange { + fn partial_cmp(&self, other: &IoRange) -> Option { + self.base.partial_cmp(&other.base) + } +} + +/// IO manager to handle all trapped MMIO/PIO access requests. +/// +/// All devices handling trapped MMIO/PIO accesses should register themself to the IO manager +/// with trapped address ranges. When guest vm accesses those trapped MMIO/PIO address ranges, +/// VM IO Exit events will be triggered and the VMM dispatches those events to IO manager. +/// And then the registered callbacks will invoked by IO manager. 
+#[derive(Clone, Default)] +pub struct IoManager { + /// Range mapping for VM exit pio operations. + pio_bus: BTreeMap>, + /// Range mapping for VM exit mmio operations. + mmio_bus: BTreeMap>, +} + +impl IoManager { + /// Create a new instance of [IoManager]. + pub fn new() -> Self { + IoManager::default() + } + + /// Register a new device to the [IoManager], with trapped MMIO/PIO address ranges. + /// + /// # Arguments + /// + /// * `device`: device object to handle trapped IO access requests + /// * `resources`: resources representing trapped MMIO/PIO address ranges. Only MMIO/PIO address + /// ranges will be handled, and other types of resource will be ignored. So the caller does + /// not need to filter out non-MMIO/PIO resources. + pub fn register_device_io( + &mut self, + device: Arc, + resources: &[Resource], + ) -> Result<()> { + for (idx, res) in resources.iter().enumerate() { + match *res { + Resource::PioAddressRange { base, size } => { + if self + .pio_bus + .insert(IoRange::new_pio_range(base, size), device.clone()) + .is_some() + { + // Rollback registered resources. + self.unregister_device_io(&resources[0..idx]) + .expect("failed to unregister devices"); + + return Err(Error::DeviceOverlap); + } + } + Resource::MmioAddressRange { base, size } => { + if self + .mmio_bus + .insert(IoRange::new_mmio_range(base, size), device.clone()) + .is_some() + { + // Rollback registered resources. + self.unregister_device_io(&resources[0..idx]) + .expect("failed to unregister devices"); + + return Err(Error::DeviceOverlap); + } + } + _ => continue, + } + } + Ok(()) + } + + /// Unregister a device from `IoManager`. + /// + /// # Arguments + /// + /// * `resources`: resource list containing all trapped address ranges for the device. 
+ pub fn unregister_device_io(&mut self, resources: &[Resource]) -> Result<()> { + for res in resources.iter() { + match *res { + Resource::PioAddressRange { base, size } => { + self.pio_bus.remove(&IoRange::new_pio_range(base, size)); + } + Resource::MmioAddressRange { base, size } => { + self.mmio_bus.remove(&IoRange::new_mmio_range(base, size)); + } + _ => continue, + } + } + Ok(()) + } + + /// Handle VM IO Exit events triggered by trapped MMIO read accesses. + /// + /// Return error if failed to get the device. + pub fn mmio_read(&self, addr: u64, data: &mut [u8]) -> Result<()> { + self.get_mmio_device(IoAddress(addr)) + .map(|(device, base)| device.read(base, IoAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + /// Handle VM IO Exit events triggered by trapped MMIO write accesses. + /// + /// Return error if failed to get the device. + pub fn mmio_write(&self, addr: u64, data: &[u8]) -> Result<()> { + self.get_mmio_device(IoAddress(addr)) + .map(|(device, base)| device.write(base, IoAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + /// Get the registered device handling the trapped MMIO address `addr`. + fn get_mmio_device(&self, addr: IoAddress) -> Option<(&Arc, IoAddress)> { + let range = IoRange::new_mmio_range(addr.raw_value(), 0); + if let Some((range, dev)) = self.mmio_bus.range(..=&range).nth_back(0) { + if (addr.raw_value() - range.base.raw_value()) < range.size.raw_value() { + return Some((dev, range.base)); + } + } + None + } +} + +impl IoManager { + /// Handle VM IO Exit events triggered by trapped PIO read accesses. + /// + /// Return error if failed to get the device. + pub fn pio_read(&self, addr: u16, data: &mut [u8]) -> Result<()> { + self.get_pio_device(PioAddress(addr)) + .map(|(device, base)| device.pio_read(base, PioAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + /// Handle VM IO Exit events triggered by trapped PIO write accesses. 
+ /// + /// Return error if failed to get the device. + pub fn pio_write(&self, addr: u16, data: &[u8]) -> Result<()> { + self.get_pio_device(PioAddress(addr)) + .map(|(device, base)| device.pio_write(base, PioAddress(addr - base.raw_value()), data)) + .ok_or(Error::NoDevice) + } + + /// Get the registered device handling the trapped PIO address `addr`. + fn get_pio_device(&self, addr: PioAddress) -> Option<(&Arc, PioAddress)> { + let range = IoRange::new_pio_range(addr.raw_value(), 0); + if let Some((range, dev)) = self.pio_bus.range(..=&range).nth_back(0) { + if (addr.raw_value() as u64 - range.base.raw_value()) < range.size.raw_value() { + return Some((dev, PioAddress(range.base.0 as u16))); + } + } + None + } +} + +impl PartialEq for IoManager { + fn eq(&self, other: &IoManager) -> bool { + if self.pio_bus.len() != other.pio_bus.len() { + return false; + } + if self.mmio_bus.len() != other.mmio_bus.len() { + return false; + } + + for (io_range, device_io) in self.pio_bus.iter() { + if !other.pio_bus.contains_key(io_range) { + return false; + } + let other_device_io = &other.pio_bus[io_range]; + if device_io.get_trapped_io_resources() != other_device_io.get_trapped_io_resources() { + return false; + } + } + + for (io_range, device_io) in self.mmio_bus.iter() { + if !other.mmio_bus.contains_key(io_range) { + return false; + } + let other_device_io = &other.mmio_bus[io_range]; + if device_io.get_trapped_io_resources() != other_device_io.get_trapped_io_resources() { + return false; + } + } + + true + } +} + +/// Trait for IO manager context object to support device hotplug at runtime. +/// +/// The `IoManagerContext` objects are passed to devices by the IO manager, so the devices could +/// use it to hot-add/hot-remove other devices at runtime. It provides a transaction mechanism +/// to hot-add/hot-remove devices. +pub trait IoManagerContext { + /// Type of context object passed to the callbacks. 
+ type Context; + + /// Begin a transaction and return a context object. + /// + /// The returned context object must be passed to commit_tx() or cancel_tx() later. + fn begin_tx(&self) -> Self::Context; + + /// Commit the transaction. + fn commit_tx(&self, ctx: Self::Context); + + /// Cancel the transaction. + fn cancel_tx(&self, ctx: Self::Context); + + /// Register a new device with its associated resources to the IO manager. + /// + /// # Arguments + /// + /// * `ctx`: context object returned by begin_tx(). + /// * `device`: device instance object to be registered + /// * `resources`: resources representing trapped MMIO/PIO address ranges. Only MMIO/PIO address + /// ranges will be handled, and other types of resource will be ignored. So the caller does + /// not need to filter out non-MMIO/PIO resources. + fn register_device_io( + &self, + ctx: &mut Self::Context, + device: Arc, + resources: &[Resource], + ) -> Result<()>; + + /// Unregister a device from the IO manager. + /// + /// # Arguments + /// + /// * `ctx`: context object returned by begin_tx(). + /// * `resources`: resource list containing all trapped address ranges for the device. 
+ fn unregister_device_io(&self, ctx: &mut Self::Context, resources: &[Resource]) -> Result<()>; +} + +impl IoManagerContext for Arc { + type Context = T::Context; + + fn begin_tx(&self) -> Self::Context { + self.deref().begin_tx() + } + + fn commit_tx(&self, ctx: Self::Context) { + self.deref().commit_tx(ctx) + } + + fn cancel_tx(&self, ctx: Self::Context) { + self.deref().cancel_tx(ctx) + } + + fn register_device_io( + &self, + ctx: &mut Self::Context, + device: Arc, + resources: &[Resource], + ) -> std::result::Result<(), Error> { + self.deref().register_device_io(ctx, device, resources) + } + + fn unregister_device_io( + &self, + ctx: &mut Self::Context, + resources: &[Resource], + ) -> std::result::Result<(), Error> { + self.deref().unregister_device_io(ctx, resources) + } +} + +#[cfg(test)] +mod tests { + use std::error::Error; + use std::sync::Mutex; + + use super::*; + use crate::resources::DeviceResources; + + const PIO_ADDRESS_SIZE: u16 = 4; + const PIO_ADDRESS_BASE: u16 = 0x40; + const MMIO_ADDRESS_SIZE: u64 = 0x8765_4321; + const MMIO_ADDRESS_BASE: u64 = 0x1234_5678; + const LEGACY_IRQ: u32 = 4; + const CONFIG_DATA: u32 = 0x1234; + + struct DummyDevice { + config: Mutex, + } + + impl DummyDevice { + fn new(config: u32) -> Self { + DummyDevice { + config: Mutex::new(config), + } + } + } + + impl DeviceIo for DummyDevice { + fn read(&self, _base: IoAddress, _offset: IoAddress, data: &mut [u8]) { + if data.len() > 4 { + return; + } + for (idx, iter) in data.iter_mut().enumerate() { + let config = self.config.lock().expect("failed to acquire lock"); + *iter = (*config >> (idx * 8) & 0xff) as u8; + } + } + + fn write(&self, _base: IoAddress, _offset: IoAddress, data: &[u8]) { + let mut config = self.config.lock().expect("failed to acquire lock"); + *config = u32::from(data[0]) & 0xff; + } + + fn pio_read(&self, _base: PioAddress, _offset: PioAddress, data: &mut [u8]) { + if data.len() > 4 { + return; + } + for (idx, iter) in data.iter_mut().enumerate() { + 
let config = self.config.lock().expect("failed to acquire lock"); + *iter = (*config >> (idx * 8) & 0xff) as u8; + } + } + + fn pio_write(&self, _base: PioAddress, _offset: PioAddress, data: &[u8]) { + let mut config = self.config.lock().expect("failed to acquire lock"); + *config = u32::from(data[0]) & 0xff; + } + fn as_any(&self) -> &dyn std::any::Any { + self + } + } + + #[test] + fn test_clone_io_manager() { + let mut io_mgr = IoManager::new(); + let dummy = DummyDevice::new(0); + let dum = Arc::new(dummy); + + let mut resource: Vec = Vec::new(); + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + let irq = Resource::LegacyIrq(LEGACY_IRQ); + + resource.push(mmio); + resource.push(irq); + + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + resource.push(pio); + + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + + let io_mgr2 = io_mgr.clone(); + assert_eq!(io_mgr2.mmio_bus.len(), 1); + + assert_eq!(io_mgr2.pio_bus.len(), 1); + + let (dev, addr) = io_mgr2 + .get_mmio_device(IoAddress(MMIO_ADDRESS_BASE + 1)) + .unwrap(); + assert_eq!(Arc::strong_count(dev), 5); + + assert_eq!(addr, IoAddress(MMIO_ADDRESS_BASE)); + + drop(io_mgr); + assert_eq!(Arc::strong_count(dev), 3); + + drop(io_mgr2); + assert_eq!(Arc::strong_count(&dum), 1); + } + + #[test] + fn test_register_unregister_device_io() { + let mut io_mgr = IoManager::new(); + let dummy = DummyDevice::new(0); + let dum = Arc::new(dummy); + + let mut resources = DeviceResources::new(); + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + let irq = Resource::LegacyIrq(LEGACY_IRQ); + + resources.append(mmio); + resources.append(pio); + resources.append(irq); + + assert!(io_mgr.register_device_io(dum.clone(), &resources).is_ok()); + 
assert!(io_mgr.register_device_io(dum, &resources).is_err()); + assert!(io_mgr.unregister_device_io(&resources).is_ok()) + } + + #[test] + fn test_mmio_read_write() { + let mut io_mgr: IoManager = Default::default(); + let dum = Arc::new(DummyDevice::new(CONFIG_DATA)); + let mut resource: Vec = Vec::new(); + + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + resource.push(mmio); + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + + let mut data = [0; 4]; + assert!(io_mgr.mmio_read(MMIO_ADDRESS_BASE, &mut data).is_ok()); + assert_eq!(data, [0x34, 0x12, 0, 0]); + + assert!(io_mgr + .mmio_read(MMIO_ADDRESS_BASE + MMIO_ADDRESS_SIZE, &mut data) + .is_err()); + + data = [0; 4]; + assert!(io_mgr.mmio_write(MMIO_ADDRESS_BASE, &data).is_ok()); + assert_eq!(*dum.config.lock().unwrap(), 0); + + assert!(io_mgr + .mmio_write(MMIO_ADDRESS_BASE + MMIO_ADDRESS_SIZE, &data) + .is_err()); + } + + #[test] + fn test_pio_read_write() { + let mut io_mgr: IoManager = Default::default(); + let dum = Arc::new(DummyDevice::new(CONFIG_DATA)); + let mut resource: Vec = Vec::new(); + + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + resource.push(pio); + assert!(io_mgr.register_device_io(dum.clone(), &resource).is_ok()); + + let mut data = [0; 4]; + assert!(io_mgr.pio_read(PIO_ADDRESS_BASE, &mut data).is_ok()); + assert_eq!(data, [0x34, 0x12, 0, 0]); + + assert!(io_mgr + .pio_read(PIO_ADDRESS_BASE + PIO_ADDRESS_SIZE, &mut data) + .is_err()); + + data = [0; 4]; + assert!(io_mgr.pio_write(PIO_ADDRESS_BASE, &data).is_ok()); + assert_eq!(*dum.config.lock().unwrap(), 0); + + assert!(io_mgr + .pio_write(PIO_ADDRESS_BASE + PIO_ADDRESS_SIZE, &data) + .is_err()); + } + + #[test] + fn test_device_manager_data_structs() { + let range1 = IoRange::new_mmio_range(0x1000, 0x1000); + let range2 = IoRange::new_mmio_range(0x1000, 0x2000); + let range3 = IoRange::new_mmio_range(0x2000, 
0x1000); + + assert_eq!(range1, range1.clone()); + assert_eq!(range1, range2); + assert!(range1 < range3); + } + + #[test] + fn test_error_code() { + let err = super::Error::DeviceOverlap; + + assert!(err.source().is_none()); + assert_eq!( + format!("{err}"), + "device address conflicts with existing devices" + ); + + let err = super::Error::NoDevice; + assert!(err.source().is_none()); + assert_eq!(format!("{err:#?}"), "NoDevice"); + } + + #[test] + fn test_io_manager_partial_eq() { + let mut io_mgr1 = IoManager::new(); + let mut io_mgr2 = IoManager::new(); + let dummy1 = Arc::new(DummyDevice::new(0)); + let dummy2 = Arc::new(DummyDevice::new(0)); + + let mut resources1 = DeviceResources::new(); + let mut resources2 = DeviceResources::new(); + + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + + resources1.append(mmio.clone()); + resources1.append(pio.clone()); + + resources2.append(mmio); + resources2.append(pio); + + io_mgr1.register_device_io(dummy1, &resources1).unwrap(); + io_mgr2.register_device_io(dummy2, &resources2).unwrap(); + + assert!(io_mgr1 == io_mgr2); + } + + #[test] + fn test_io_manager_partial_neq() { + let mut io_mgr1 = IoManager::new(); + let mut io_mgr2 = IoManager::new(); + let dummy1 = Arc::new(DummyDevice::new(0)); + let dummy2 = Arc::new(DummyDevice::new(0)); + + let mut resources1 = DeviceResources::new(); + let mut resources2 = DeviceResources::new(); + + let mmio = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + let pio = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + + resources1.append(mmio.clone()); + resources1.append(pio); + + resources2.append(mmio); + + io_mgr1.register_device_io(dummy1, &resources1).unwrap(); + io_mgr2.register_device_io(dummy2, &resources2).unwrap(); + + assert!(io_mgr1 != io_mgr2); + 
} +} diff --git a/src/dragonball/src/dbs_device/src/lib.rs b/src/dragonball/src/dbs_device/src/lib.rs new file mode 100644 index 000000000..a48229962 --- /dev/null +++ b/src/dragonball/src/dbs_device/src/lib.rs @@ -0,0 +1,420 @@ +// Copyright 2020 Alibaba Cloud. All Rights Reserved. +// Copyright © 2019 Intel Corporation. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(missing_docs)] + +//! Device model for Dragonball Secure Sandbox. +//! +//! The `dbs-device` crate, as a counterpart of [vm-device], defines device model for the +//! Dragonball Secure Sandbox. The `dbs-device` crate shares some common concepts and data structures +//! with [vm-device], but it also diverges from [vm-device] due to different VMM designs. +//! +//! [vm-device]: https://github.com/rust-vmm/vm-device + +use std::any::Any; +use std::cmp::{Ord, PartialOrd}; +use std::convert::TryFrom; +use std::sync::Mutex; + +use self::resources::DeviceResources; + +pub mod device_manager; +pub mod resources; + +/// Size of MMIO range/access request. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct IoSize(pub u64); + +impl IoSize { + /// Get the raw value as u64 to make operation simple. + #[inline] + pub fn raw_value(self) -> u64 { + self.0 + } +} + +impl From for IoSize { + #[inline] + fn from(size: u64) -> Self { + IoSize(size) + } +} + +impl From for u64 { + #[inline] + fn from(size: IoSize) -> Self { + size.0 + } +} + +/// Memory Mapped IO (MMIO) address. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct IoAddress(pub u64); + +impl IoAddress { + /// Get the raw value of IO Address to make operation simple. + #[inline] + pub fn raw_value(self) -> u64 { + self.0 + } +} + +impl From for IoAddress { + #[inline] + fn from(addr: u64) -> Self { + IoAddress(addr) + } +} + +impl From for u64 { + #[inline] + fn from(addr: IoAddress) -> Self { + addr.0 + } +} + +type PioAddressType = u16; + +/// Size of Port I/O range/request. 
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct PioSize(pub PioAddressType); + +impl PioSize { + /// Get the raw value as u64 to make operation simple. + #[inline] + pub fn raw_value(self) -> PioAddressType { + self.0 + } +} + +impl From for PioSize { + #[inline] + fn from(size: PioAddressType) -> Self { + PioSize(size) + } +} + +impl From for PioAddressType { + #[inline] + fn from(size: PioSize) -> Self { + size.0 + } +} + +impl TryFrom for PioSize { + type Error = IoSize; + + #[inline] + fn try_from(size: IoSize) -> Result { + if size.raw_value() <= std::u16::MAX as u64 { + Ok(PioSize(size.raw_value() as PioAddressType)) + } else { + Err(size) + } + } +} + +impl From for IoSize { + #[inline] + fn from(size: PioSize) -> Self { + IoSize(size.raw_value() as u64) + } +} + +/// Port IO (PIO) address. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +pub struct PioAddress(pub PioAddressType); + +impl PioAddress { + /// Get the raw value of IO Address to make operation simple. + #[inline] + pub fn raw_value(self) -> PioAddressType { + self.0 + } +} + +impl From for PioAddress { + #[inline] + fn from(addr: PioAddressType) -> Self { + PioAddress(addr) + } +} + +impl From for PioAddressType { + #[inline] + fn from(addr: PioAddress) -> Self { + addr.0 + } +} + +impl TryFrom for PioAddress { + type Error = IoAddress; + + #[inline] + fn try_from(addr: IoAddress) -> Result { + if addr.0 <= std::u16::MAX as u64 { + Ok(PioAddress(addr.raw_value() as PioAddressType)) + } else { + Err(addr) + } + } +} + +impl From for IoAddress { + #[inline] + fn from(addr: PioAddress) -> Self { + IoAddress(addr.raw_value() as u64) + } +} + +/// Trait for device to handle trapped MMIO/PIO access requests with interior mutability +/// for high performance. 
+/// +/// Any device which needs to trap MMIO/PIO access requests should implement the [DeviceIo] or +/// [DeviceIoMut] trait and register itself to the [IoManager](crate::device_manager::IoManager) +/// with those trapped IO address ranges. When the guest access those trapped address ranges, +/// the access request will be routed to the registered callbacks. +/// +/// The [DeviceIo] trait adopts the interior mutability pattern so we can get a real concurrent +/// multiple threads handling. For device backend drivers not focusing on high performance, +/// the Mutex adapter may be used to simplify the implementation. +#[allow(unused_variables)] +pub trait DeviceIo: Send + Sync { + /// Read from the MMIO address `base + offset` into `data`. + fn read(&self, base: IoAddress, offset: IoAddress, data: &mut [u8]) {} + + /// Write from `data` to the MMIO address `base + offset`. + fn write(&self, base: IoAddress, offset: IoAddress, data: &[u8]) {} + + /// Read from port `base + offset` into `data`. + fn pio_read(&self, base: PioAddress, offset: PioAddress, data: &mut [u8]) {} + + /// Write from `data` to the port `base + offset`. + fn pio_write(&self, base: PioAddress, offset: PioAddress, data: &[u8]) {} + + /// Get resources assigned to the device. + fn get_assigned_resources(&self) -> DeviceResources { + DeviceResources::new() + } + + /// Get the trapped IO address ranges for the device. + /// + /// Only MMIO/PIO address ranges in the resource list will be handled, other resources will be + /// ignored. So the device does not need to filter out non-MMIO/PIO resources. + fn get_trapped_io_resources(&self) -> DeviceResources { + self.get_assigned_resources() + } + + /// Used to downcast to the specific type. + fn as_any(&self) -> &dyn Any; +} + +/// Trait for device to handle trapped MMIO/PIO access requests. +/// +/// Many device backend drivers will mutate itself when handling IO requests. 
The [DeviceIo] trait +/// assumes interior mutability, but it's a little complex to support interior mutability. +/// So the Mutex adapter may be used to ease device backend driver implementations. +/// +/// The Mutex adapter is an zero overhead abstraction without performance penalty. +#[allow(unused_variables)] +pub trait DeviceIoMut { + /// Read from the MMIO address `base + offset` into `data`. + fn read(&mut self, base: IoAddress, offset: IoAddress, data: &mut [u8]) {} + + /// Write from `data` to the MMIO address `base + offset`. + fn write(&mut self, base: IoAddress, offset: IoAddress, data: &[u8]) {} + + /// Read from port `base + offset` into `data`. + fn pio_read(&mut self, base: PioAddress, offset: PioAddress, data: &mut [u8]) {} + + /// Write from `data` to the port `base + offset`. + fn pio_write(&mut self, base: PioAddress, offset: PioAddress, data: &[u8]) {} + + /// Get resources assigned to the device. + fn get_assigned_resources(&self) -> DeviceResources { + DeviceResources::new() + } + + /// Get the trapped IO address ranges for the device. + /// + /// Only MMIO/PIO address ranges in the resource list will be handled, other resources will be + /// ignored. So the device does not need to filter out non-MMIO/PIO resources. + fn get_trapped_io_resources(&self) -> DeviceResources { + self.get_assigned_resources() + } +} + +impl DeviceIo for Mutex { + fn read(&self, base: IoAddress, offset: IoAddress, data: &mut [u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().read(base, offset, data) + } + + fn write(&self, base: IoAddress, offset: IoAddress, data: &[u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().write(base, offset, data) + } + + fn pio_read(&self, base: PioAddress, offset: PioAddress, data: &mut [u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. 
+ self.lock().unwrap().pio_read(base, offset, data) + } + + fn pio_write(&self, base: PioAddress, offset: PioAddress, data: &[u8]) { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().pio_write(base, offset, data) + } + + fn get_assigned_resources(&self) -> DeviceResources { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().get_assigned_resources() + } + + fn get_trapped_io_resources(&self) -> DeviceResources { + // Safe to unwrap() because we don't expect poisoned lock here. + self.lock().unwrap().get_trapped_io_resources() + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use std::convert::TryFrom; + use std::sync::Arc; + + use super::*; + + #[derive(Default)] + struct MockDevice { + data: Mutex, + } + + impl DeviceIo for MockDevice { + fn read(&self, _base: IoAddress, _offset: IoAddress, data: &mut [u8]) { + data[0] = *self.data.lock().unwrap(); + } + + fn write(&self, _base: IoAddress, _offset: IoAddress, data: &[u8]) { + *self.data.lock().unwrap() = data[0]; + } + + fn pio_read(&self, _base: PioAddress, _offset: PioAddress, data: &mut [u8]) { + data[0] = *self.data.lock().unwrap(); + } + + fn pio_write(&self, _base: PioAddress, _offset: PioAddress, data: &[u8]) { + *self.data.lock().unwrap() = data[0]; + } + fn as_any(&self) -> &dyn Any { + self + } + } + + #[derive(Default)] + struct MockDeviceMut { + data: u8, + } + + impl DeviceIoMut for MockDeviceMut { + fn read(&mut self, _base: IoAddress, _offset: IoAddress, data: &mut [u8]) { + data[0] = self.data; + } + + fn write(&mut self, _base: IoAddress, _offset: IoAddress, data: &[u8]) { + self.data = data[0]; + } + + fn pio_read(&mut self, _base: PioAddress, _offset: PioAddress, data: &mut [u8]) { + data[0] = self.data; + } + + fn pio_write(&mut self, _base: PioAddress, _offset: PioAddress, data: &[u8]) { + self.data = data[0]; + } + } + + fn register_device(device: Arc) { + device.write(IoAddress(0), 
IoAddress(0), &[0x10u8]); + let mut buf = [0x0u8]; + device.read(IoAddress(0), IoAddress(0), &mut buf); + assert_eq!(buf[0], 0x10); + + { + device.pio_write(PioAddress(0), PioAddress(0), &[0x10u8]); + let mut buf = [0x0u8]; + device.pio_read(PioAddress(0), PioAddress(0), &mut buf); + assert_eq!(buf[0], 0x10); + } + + // test trait's default implementation + let resource = DeviceResources::new(); + assert_eq!(resource, device.get_assigned_resources()); + assert_eq!(resource, device.get_trapped_io_resources()); + } + + #[test] + fn test_device_io_adapter() { + let device = Arc::new(MockDevice::default()); + + register_device(device.clone()); + assert_eq!(*device.data.lock().unwrap(), 0x010); + } + + #[test] + fn test_device_io_mut_adapter() { + let device_mut = Arc::new(Mutex::new(MockDeviceMut::default())); + + register_device(device_mut.clone()); + assert_eq!(device_mut.lock().unwrap().data, 0x010); + } + + #[test] + fn test_io_data_struct() { + let io_size = IoSize::from(0x1111u64); + assert_eq!(io_size.raw_value(), 0x1111u64); + assert_eq!(u64::from(io_size), 0x1111u64); + assert_eq!(io_size, io_size.clone()); + let io_size1 = IoSize::from(0x1112u64); + assert!(io_size < io_size1); + + let io_addr = IoAddress::from(0x1234u64); + assert_eq!(io_addr.raw_value(), 0x1234u64); + assert_eq!(u64::from(io_addr), 0x1234u64); + assert_eq!(io_addr, io_addr.clone()); + let io_addr1 = IoAddress::from(0x1235u64); + assert!(io_addr < io_addr1); + } + + #[test] + fn test_pio_data_struct() { + let pio_size = PioSize::from(0x1111u16); + assert_eq!(pio_size.raw_value(), 0x1111u16); + assert_eq!(u16::from(pio_size), 0x1111u16); + assert_eq!(pio_size, pio_size.clone()); + let pio_size1 = PioSize::from(0x1112u16); + assert!(pio_size < pio_size1); + + let pio_size = PioSize::try_from(IoSize(0x1111u64)).unwrap(); + assert_eq!(pio_size.raw_value(), 0x1111u16); + + assert!(PioSize::try_from(IoSize(std::u16::MAX as u64 + 1)).is_err()); + + let io_size = 
IoSize::from(PioSize::from(0x1111u16)); + assert_eq!(io_size.raw_value(), 0x1111u64); + + let pio_addr = PioAddress::from(0x1234u16); + assert_eq!(pio_addr.raw_value(), 0x1234u16); + assert_eq!(u16::from(pio_addr), 0x1234u16); + assert_eq!(pio_addr, pio_addr.clone()); + let pio_addr1 = PioAddress::from(0x1235u16); + assert!(pio_addr < pio_addr1); + + assert!(PioAddress::try_from(IoAddress::from(0x12_3456u64)).is_err()); + assert!(PioAddress::try_from(IoAddress::from(0x1234u64)).is_ok()); + assert_eq!(IoAddress::from(pio_addr).raw_value(), 0x1234u64); + } +} diff --git a/src/dragonball/src/dbs_device/src/resources.rs b/src/dragonball/src/dbs_device/src/resources.rs new file mode 100644 index 000000000..e87b0fe87 --- /dev/null +++ b/src/dragonball/src/dbs_device/src/resources.rs @@ -0,0 +1,649 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Descriptors representing device resource allocation requirements and assigned resources. +//! +//! There are several components related to resource management: +//! - the Dragonball Secure Sandbox (VMM), which is responsible for creating and registering devices +//! to the device manager. +//! - the device manager, which manages all devices of a Dragonball Secure Sandbox instance. +//! - the devices, which implement virtual device backends for the guest. +//! +//! They cooperate with each to provide resources required by each device. The high level flow of +//! resource management is as below: +//! 1) the VMM creates a new device object. +//! 2) the device returns an array of [ResourceConstraint](self::ResourceConstraint), +//! describing the required resources and resource allocation constraints. +//! 3) the VMM allocates required resources from a resource manager, +//! 4) the VMM passes the allocated resources [DeviceResources](self::DeviceResources), +//! which is an array of [Resource](self::Resource), to the device object. +//! 
5) the VMM registers the new device onto corresponding device managers according the allocated +//! resources. + +use std::ops::Deref; + +/// Enumeration describing a device's resource allocation requirements and constraints. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ResourceConstraint { + /// Constraint for an IO Port address range. + PioAddress { + /// Allocating resource within the range [`min`, `max`] if specified. + range: Option<(u16, u16)>, + /// Alignment for the allocated address. + align: u16, + /// Size for the allocated address range. + size: u16, + }, + /// Constraint for a Memory Mapped IO address range. + MmioAddress { + /// Allocating resource within the range [`min`, `max`] if specified. + range: Option<(u64, u64)>, + /// Alignment for the allocated address. + align: u64, + /// Size for the allocated address range. + size: u64, + }, + /// Constraint for a Guest Mem address range. + MemAddress { + /// Allocating resource within the range [`min`, `max`] if specified. + range: Option<(u64, u64)>, + /// Alignment for the allocated address. + align: u64, + /// Size for the allocated address range. + size: u64, + }, + /// Constraint for a legacy IRQ. + LegacyIrq { + /// Reserving the pre-allocated IRQ if it's specified. + irq: Option, + }, + /// Constraint for PCI MSI IRQs. + PciMsiIrq { + /// Number of Irqs to allocate. + size: u32, + }, + /// Constraint for PCI MSIx IRQs. + PciMsixIrq { + /// Number of Irqs to allocate. + size: u32, + }, + /// Constraint for generic IRQs. + GenericIrq { + /// Number of Irqs to allocate. + size: u32, + }, + /// Constraint for KVM mem_slot indexes to map memory into the guest. + KvmMemSlot { + /// Allocating kvm memory slots starting from the index `slot` if + /// specified. + slot: Option, + /// Number of slots to allocate. + size: u32, + }, +} + +impl ResourceConstraint { + /// Create a new PIO address constraint object with default configuration. 
+ pub fn new_pio(size: u16) -> Self { + ResourceConstraint::PioAddress { + range: None, + align: 0x1, + size, + } + } + + /// Create a new PIO address constraint object. + pub fn pio_with_constraints(size: u16, range: Option<(u16, u16)>, align: u16) -> Self { + ResourceConstraint::PioAddress { range, align, size } + } + + /// Create a new MMIO address constraint object with default configuration. + pub fn new_mmio(size: u64) -> Self { + ResourceConstraint::MmioAddress { + range: None, + align: 0x1000, + size, + } + } + + /// Create a new MMIO address constraint object. + pub fn mmio_with_constraints(size: u64, range: Option<(u64, u64)>, align: u64) -> Self { + ResourceConstraint::MmioAddress { range, align, size } + } + + /// Create a new Mem address constraint object with default configuration. + pub fn new_mem(size: u64) -> Self { + ResourceConstraint::MemAddress { + range: None, + align: 0x1000, + size, + } + } + + /// Create a new Mem address constraint object. + pub fn mem_with_constraints(size: u64, range: Option<(u64, u64)>, align: u64) -> Self { + ResourceConstraint::MemAddress { range, align, size } + } + + /// Create a new legacy IRQ constraint object. + /// + /// Allocating the pre-allocated legacy Irq `irq` if specified. + pub fn new_legacy_irq(irq: Option) -> Self { + ResourceConstraint::LegacyIrq { irq } + } + + /// Create a new PCI MSI IRQ constraint object. + pub fn new_pci_msi_irq(size: u32) -> Self { + ResourceConstraint::PciMsiIrq { size } + } + + /// Create a new PCI MSIX IRQ constraint object. + pub fn new_pci_msix_irq(size: u32) -> Self { + ResourceConstraint::PciMsixIrq { size } + } + + /// Create a new Generic IRQ constraint object. + pub fn new_generic_irq(size: u32) -> Self { + ResourceConstraint::GenericIrq { size } + } + + /// Create a new KVM memory slot constraint object. + /// + /// Allocating kvm memory slots starting from the index `slot` if specified. 
+ pub fn new_kvm_mem_slot(size: u32, slot: Option) -> Self { + ResourceConstraint::KvmMemSlot { slot, size } + } +} + +/// Type of Message Singaled Interrupt +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum MsiIrqType { + /// PCI MSI IRQ numbers. + PciMsi, + /// PCI MSIx IRQ numbers. + PciMsix, + /// Generic MSI IRQ numbers. + GenericMsi, +} + +/// Enumeration for device resources. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Resource { + /// IO Port resource range. + PioAddressRange { + /// Pio resource base + base: u16, + /// Pio resource size + size: u16, + }, + /// Memory Mapped IO resource range. + MmioAddressRange { + /// Mmio resource base + base: u64, + /// Mmio resource size + size: u64, + }, + /// Guest Mem resource range. + MemAddressRange { + /// Mem resource base + base: u64, + /// Mem resource size + size: u64, + }, + /// Legacy IRQ number. + LegacyIrq(u32), + /// Message Signaled Interrupt + MsiIrq { + /// Msi irq type + ty: MsiIrqType, + /// Msi irq base + base: u32, + /// Msi irq size + size: u32, + }, + /// Network Interface Card MAC address. + MacAddresss(String), + /// KVM memslot index. + KvmMemSlot(u32), +} + +/// Newtype to store a set of device resources. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct DeviceResources(Vec); + +impl DeviceResources { + /// Create a container object to store device resources. + pub fn new() -> Self { + DeviceResources(Vec::new()) + } + + /// Append a device resource to the container object. + pub fn append(&mut self, entry: Resource) { + self.0.push(entry); + } + + /// Get the IO port address resources. + pub fn get_pio_address_ranges(&self) -> Vec<(u16, u16)> { + let mut vec = Vec::new(); + for entry in self.0.iter().as_ref() { + if let Resource::PioAddressRange { base, size } = entry { + vec.push((*base, *size)); + } + } + vec + } + + /// Get the Memory Mapped IO address resources. 
+ pub fn get_mmio_address_ranges(&self) -> Vec<(u64, u64)> { + let mut vec = Vec::new(); + for entry in self.0.iter().as_ref() { + if let Resource::MmioAddressRange { base, size } = entry { + vec.push((*base, *size)); + } + } + vec + } + + /// Get the Guest Memory address resources. + pub fn get_mem_address_ranges(&self) -> Vec<(u64, u64)> { + let mut vec = Vec::new(); + for entry in self.0.iter().as_ref() { + if let Resource::MemAddressRange { base, size } = entry { + vec.push((*base, *size)); + } + } + vec + } + + /// Get the first legacy interrupt number(IRQ). + pub fn get_legacy_irq(&self) -> Option { + for entry in self.0.iter().as_ref() { + if let Resource::LegacyIrq(base) = entry { + return Some(*base); + } + } + None + } + + /// Get information about the first PCI MSI interrupt resource. + pub fn get_pci_msi_irqs(&self) -> Option<(u32, u32)> { + self.get_msi_irqs(MsiIrqType::PciMsi) + } + + /// Get information about the first PCI MSIx interrupt resource. + pub fn get_pci_msix_irqs(&self) -> Option<(u32, u32)> { + self.get_msi_irqs(MsiIrqType::PciMsix) + } + + /// Get information about the first Generic MSI interrupt resource. + pub fn get_generic_msi_irqs(&self) -> Option<(u32, u32)> { + self.get_msi_irqs(MsiIrqType::GenericMsi) + } + + fn get_msi_irqs(&self, ty: MsiIrqType) -> Option<(u32, u32)> { + for entry in self.0.iter().as_ref() { + if let Resource::MsiIrq { + ty: msi_type, + base, + size, + } = entry + { + if ty == *msi_type { + return Some((*base, *size)); + } + } + } + None + } + + /// Get the KVM memory slots to map memory into the guest. + pub fn get_kvm_mem_slots(&self) -> Vec { + let mut vec = Vec::new(); + for entry in self.0.iter().as_ref() { + if let Resource::KvmMemSlot(index) = entry { + vec.push(*index); + } + } + vec + } + + /// Get the first resource information for NIC MAC address. 
+ pub fn get_mac_address(&self) -> Option { + for entry in self.0.iter().as_ref() { + if let Resource::MacAddresss(addr) = entry { + return Some(addr.clone()); + } + } + None + } + + /// Get immutable reference to all the resources. + pub fn get_all_resources(&self) -> &[Resource] { + &self.0 + } +} + +impl Deref for DeviceResources { + type Target = [Resource]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + + const PIO_ADDRESS_SIZE: u16 = 5; + const PIO_ADDRESS_BASE: u16 = 0; + const MMIO_ADDRESS_SIZE: u64 = 0x8765_4321; + const MMIO_ADDRESS_BASE: u64 = 0x1234_5678; + const MEM_ADDRESS_SIZE: u64 = 0x8765_4321; + const MEM_ADDRESS_BASE: u64 = 0x1234_5678; + const LEGACY_IRQ: u32 = 0x168; + const PCI_MSI_IRQ_SIZE: u32 = 0x8888; + const PCI_MSI_IRQ_BASE: u32 = 0x6666; + const PCI_MSIX_IRQ_SIZE: u32 = 0x16666; + const PCI_MSIX_IRQ_BASE: u32 = 0x8888; + const GENERIC_MSI_IRQS_SIZE: u32 = 0x16888; + const GENERIC_MSI_IRQS_BASE: u32 = 0x16688; + const MAC_ADDRESS: &str = "00:08:63:66:86:88"; + const KVM_SLOT_ID: u32 = 0x0100; + + pub fn get_device_resource() -> DeviceResources { + let mut resource = DeviceResources::new(); + + let entry = Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + }; + resource.append(entry.clone()); + assert_eq!(entry, resource[0]); + + let entry = Resource::MmioAddressRange { + base: MMIO_ADDRESS_BASE, + size: MMIO_ADDRESS_SIZE, + }; + resource.append(entry.clone()); + assert_eq!(entry, resource[1]); + + let entry = Resource::MemAddressRange { + base: MEM_ADDRESS_BASE, + size: MEM_ADDRESS_SIZE, + }; + resource.append(entry.clone()); + assert_eq!(entry, resource[2]); + + let entry = Resource::LegacyIrq(LEGACY_IRQ); + resource.append(entry.clone()); + assert_eq!(entry, resource[3]); + + let entry = Resource::MsiIrq { + ty: MsiIrqType::PciMsi, + base: PCI_MSI_IRQ_BASE, + size: PCI_MSI_IRQ_SIZE, + }; + resource.append(entry.clone()); + 
assert_eq!(entry, resource[4]); + + let entry = Resource::MsiIrq { + ty: MsiIrqType::PciMsix, + base: PCI_MSIX_IRQ_BASE, + size: PCI_MSIX_IRQ_SIZE, + }; + resource.append(entry.clone()); + assert_eq!(entry, resource[5]); + + let entry = Resource::MsiIrq { + ty: MsiIrqType::GenericMsi, + base: GENERIC_MSI_IRQS_BASE, + size: GENERIC_MSI_IRQS_SIZE, + }; + resource.append(entry.clone()); + assert_eq!(entry, resource[6]); + + let entry = Resource::MacAddresss(MAC_ADDRESS.to_string()); + resource.append(entry.clone()); + assert_eq!(entry, resource[7]); + + let entry = Resource::KvmMemSlot(KVM_SLOT_ID); + resource.append(entry.clone()); + assert_eq!(entry, resource[8]); + + resource + } + + #[test] + fn get_pio_address_ranges() { + let resources = get_device_resource(); + assert!( + resources.get_pio_address_ranges()[0].0 == PIO_ADDRESS_BASE + && resources.get_pio_address_ranges()[0].1 == PIO_ADDRESS_SIZE + ); + assert_eq!( + resources[0], + Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + } + ); + assert_ne!(resources[0], resources[1]); + + let resources2 = resources.clone(); + assert_eq!(resources.len(), resources2.len()); + drop(resources); + assert_eq!( + resources2[0], + Resource::PioAddressRange { + base: PIO_ADDRESS_BASE, + size: PIO_ADDRESS_SIZE, + } + ); + } + + #[test] + fn test_get_mmio_address_ranges() { + let resources = get_device_resource(); + assert!( + resources.get_mmio_address_ranges()[0].0 == MMIO_ADDRESS_BASE + && resources.get_mmio_address_ranges()[0].1 == MMIO_ADDRESS_SIZE + ); + } + + #[test] + fn test_get_mem_address_ranges() { + let resources = get_device_resource(); + assert!( + resources.get_mem_address_ranges()[0].0 == MEM_ADDRESS_BASE + && resources.get_mem_address_ranges()[0].1 == MEM_ADDRESS_SIZE + ); + } + + #[test] + fn test_get_legacy_irq() { + let resources = get_device_resource(); + assert!(resources.get_legacy_irq().unwrap() == LEGACY_IRQ); + + // None case. 
+ let resources = DeviceResources::new(); + assert!(resources.get_legacy_irq().is_none()); + } + + #[test] + fn test_get_pci_msi_irqs() { + let resources = get_device_resource(); + assert!( + resources.get_pci_msi_irqs().unwrap().0 == PCI_MSI_IRQ_BASE + && resources.get_pci_msi_irqs().unwrap().1 == PCI_MSI_IRQ_SIZE + ); + + // None case. + let resources = DeviceResources::new(); + assert!(resources.get_generic_msi_irqs().is_none()); + } + + #[test] + fn test_get_pci_msix_irqs() { + let resources = get_device_resource(); + assert!( + resources.get_pci_msix_irqs().unwrap().0 == PCI_MSIX_IRQ_BASE + && resources.get_pci_msix_irqs().unwrap().1 == PCI_MSIX_IRQ_SIZE + ); + + // None case. + let resources = DeviceResources::new(); + assert!(resources.get_generic_msi_irqs().is_none()); + } + + #[test] + fn test_get_generic_msi_irqs() { + let resources = get_device_resource(); + assert!( + resources.get_generic_msi_irqs().unwrap().0 == GENERIC_MSI_IRQS_BASE + && resources.get_generic_msi_irqs().unwrap().1 == GENERIC_MSI_IRQS_SIZE + ); + + // None case. + let resources = DeviceResources::new(); + assert!(resources.get_generic_msi_irqs().is_none()); + } + + #[test] + fn test_get_mac_address() { + let resources = get_device_resource(); + assert_eq!(resources.get_mac_address().unwrap(), MAC_ADDRESS); + + // None case. 
+ let resources = DeviceResources::new(); + assert!(resources.get_mac_address().is_none()); + } + + #[test] + fn test_get_kvm_slot() { + let resources = get_device_resource(); + assert_eq!(resources.get_kvm_mem_slots(), vec![KVM_SLOT_ID]); + } + + #[test] + fn test_get_all_resources() { + let resources = get_device_resource(); + assert_eq!(resources.get_all_resources().len(), 9); + } + + #[test] + fn test_resource_constraint() { + let pio = ResourceConstraint::new_pio(2); + let pio2 = pio; + let mmio = ResourceConstraint::new_mmio(0x1000); + assert_eq!(pio, pio2); + assert_ne!(pio, mmio); + + if let ResourceConstraint::PioAddress { range, align, size } = + ResourceConstraint::new_pio(2) + { + assert_eq!(range, None); + assert_eq!(align, 1); + assert_eq!(size, 2); + } else { + panic!("Pio resource constraint is invalid."); + } + + if let ResourceConstraint::PioAddress { range, align, size } = + ResourceConstraint::pio_with_constraints(2, Some((15, 16)), 2) + { + assert_eq!(range, Some((15, 16))); + assert_eq!(align, 2); + assert_eq!(size, 2); + } else { + panic!("Pio resource constraint is invalid."); + } + + if let ResourceConstraint::MmioAddress { range, align, size } = + ResourceConstraint::new_mmio(0x2000) + { + assert_eq!(range, None); + assert_eq!(align, 0x1000); + assert_eq!(size, 0x2000); + } else { + panic!("Mmio resource constraint is invalid."); + } + + if let ResourceConstraint::MmioAddress { range, align, size } = + ResourceConstraint::mmio_with_constraints(0x2000, Some((0x0, 0x2000)), 0x2000) + { + assert_eq!(range, Some((0x0, 0x2000))); + assert_eq!(align, 0x2000); + assert_eq!(size, 0x2000); + } else { + panic!("Mmio resource constraint is invalid."); + } + + if let ResourceConstraint::MemAddress { range, align, size } = + ResourceConstraint::new_mem(0x2000) + { + assert_eq!(range, None); + assert_eq!(align, 0x1000); + assert_eq!(size, 0x2000); + } else { + panic!("Mem resource constraint is invalid."); + } + + if let ResourceConstraint::MemAddress { 
range, align, size } = + ResourceConstraint::mem_with_constraints(0x2000, Some((0x0, 0x2000)), 0x2000) + { + assert_eq!(range, Some((0x0, 0x2000))); + assert_eq!(align, 0x2000); + assert_eq!(size, 0x2000); + } else { + panic!("Mem resource constraint is invalid."); + } + + if let ResourceConstraint::LegacyIrq { irq } = + ResourceConstraint::new_legacy_irq(Some(0x123)) + { + assert_eq!(irq, Some(0x123)); + } else { + panic!("IRQ resource constraint is invalid."); + } + + if let ResourceConstraint::PciMsiIrq { size } = ResourceConstraint::new_pci_msi_irq(0x123) { + assert_eq!(size, 0x123); + } else { + panic!("Pci MSI irq resource constraint is invalid."); + } + + if let ResourceConstraint::PciMsixIrq { size } = ResourceConstraint::new_pci_msix_irq(0x123) + { + assert_eq!(size, 0x123); + } else { + panic!("Pci MSIx irq resource constraint is invalid."); + } + + if let ResourceConstraint::GenericIrq { size } = ResourceConstraint::new_generic_irq(0x123) + { + assert_eq!(size, 0x123); + } else { + panic!("generic irq resource constraint is invalid."); + } + + if let ResourceConstraint::KvmMemSlot { slot, size } = + ResourceConstraint::new_kvm_mem_slot(0x1000, Some(0x2000)) + { + assert_eq!(slot, Some(0x2000)); + assert_eq!(size, 0x1000); + } else { + panic!("KVM slot resource constraint is invalid."); + } + } + + #[test] + fn test_resources_deref() { + let resources = get_device_resource(); + let mut count = 0; + for _res in resources.iter() { + count += 1; + } + assert_eq!(count, resources.0.len()); + } +} diff --git a/src/dragonball/src/dbs_interrupt/Cargo.toml b/src/dragonball/src/dbs_interrupt/Cargo.toml new file mode 100644 index 000000000..20d5d46e4 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "dbs-interrupt" +version = "0.2.2" +authors = ["Alibaba Dragonball Team"] +description = "Traits and structs to manage interrupts for virtual devices" +license = "Apache-2.0" +edition = "2018" +homepage = 
"https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-interrupt" +keywords = ["dragonball", "secure-sandbox", "device", "interrupt"] +readme = "README.md" + +[dependencies] +dbs-device = { path = "../dbs_device" } +dbs-arch = { path = "../dbs_arch" } +kvm-bindings = { version = "0.6.0", optional = true } +kvm-ioctls = { version = "0.12.0", optional = true } +libc = "0.2" +vmm-sys-util = "0.11.0" + +[features] +default = ["legacy-irq", "msi-irq"] + +legacy-irq = [] +msi-irq = [] + +kvm-irq = ["kvm-ioctls", "kvm-bindings"] +kvm-legacy-irq = ["legacy-irq", "kvm-irq"] +kvm-msi-generic = ["msi-irq", "kvm-irq"] +kvm-msi-irq = ["kvm-msi-generic"] diff --git a/src/dragonball/src/dbs_interrupt/LICENSE b/src/dragonball/src/dbs_interrupt/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_interrupt/README.md b/src/dragonball/src/dbs_interrupt/README.md new file mode 100644 index 000000000..3ddd354cd --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/README.md @@ -0,0 +1,73 @@ +# dbs-interrupt + +Interrupts are used by hardware devices to indicate asynchronous events to the processor. +The `dbs-interrupt` crate provides traits and data structures for the `Dragonball Sandbox` to manage +interrupts for virtual and physical devices. + +An interrupt alerts the processor to a high-priority condition requiring the interruption of +the current code the processor is executing. The processor responds by suspending its current activities, +saving its state, and executing a function called an interrupt handler (or an interrupt service routine, ISR) +to deal with the event. This interruption is temporary, and, after the interrupt handler finishes, +unless handling the interrupt has emitted a fatal error, the processor resumes normal activities. 
+ +Hardware interrupts are used by devices to communicate that they require attention from the +operating system, or a bare-metal program running on the CPU if there are no OSes. The act of +initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices are +usually associated with different interrupts using a unique value associated with each interrupt. +This makes it possible to know which hardware device caused which interrupts. These interrupt values +are often called IRQ lines, or just interrupt lines. + +Nowadays, IRQ lines is not the only mechanism to deliver device interrupts to processors. MSI +(Message Signaled Interrupt) is another commonly used alternative in-band method of signaling an +interrupt, using special in-band messages to replace traditional out-of-band assertion of dedicated +interrupt lines. While more complex to implement in a device, message signaled interrupts have some +significant advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts +are supported in PCI bus since its version 2.2, and in later available PCI Express bus. Some non-PCI +architectures also use message signaled interrupts. + +While IRQ is a term commonly used by Operating Systems when dealing with hardware interrupts, the +IRQ numbers managed by OSes are independent of the ones managed by VMM. For simplicity sake, the +term Interrupt Source is used instead of IRQ to represent both pin-based interrupts and MSI +interrupts. + +A device may support multiple types of interrupts, and each type of interrupt may support one or +multiple interrupt sources. For example, a PCI device may support: + +- Legacy Irq: exactly one interrupt source. +- PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +- PCI MSIx Irq: 2^n(n=0-11) interrupt sources. + +A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source. An ID +allocator will be used to allocate and free Interrupt Source Identifiers for devices. 
To decouple +this crate from the ID allocator, here we doesn't take the responsibility to allocate/free Interrupt +Source IDs but only makes use of assigned IDs. + +The overall flow to deal with interrupts is: + +- the VMM creates an interrupt manager +- the VMM creates a device manager, passing on an reference to the interrupt manager +- the device manager passes on an reference to the interrupt manager to all registered devices +- guest kernel loads drivers for virtual devices +- guest device driver determines the type and number of interrupts needed, and update the device + configuration +- the virtual device backend requests the interrupt manager to create an interrupt group according to guest configuration information + +The dbs-device crate provides: + +- [trait `InterruptManager`]: manage interrupt sources for virtual device backend +- [struct `DeviceInterruptManager`]: an implementation of [`InterruptManager`], manage interrupts and interrupt modes for a device +- [trait `InterruptSourceGroup`]: manage a group of interrupt sources for a device, provide methods to control the interrupts +- [enum `InterruptSourceType`]: type of interrupt source +- [enum `InterruptSourceConfig`], [struct `LegacyIrqSourceConfig`] and [struct `MsiIrqSourceConfig`]: configuration data for interrupt sources + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 
+ +[trait InterruptManager]: src/lib.rs +[struct DeviceInterruptManager]: src/manager.rs +[trait InterruptSourceGroup]: src/lib.rs +[enum InterruptSourceType]: src/lib.rs +[enum InterruptSourceConfig]: src/lib.rs +[struct LegacyIrqSourceConfig]: src/lib.rs +[struct MsiIrqSourceConfig]: src/lib.rs diff --git a/src/dragonball/src/dbs_interrupt/src/kvm/legacy_irq.rs b/src/dragonball/src/dbs_interrupt/src/kvm/legacy_irq.rs new file mode 100644 index 000000000..3fb6b0247 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/kvm/legacy_irq.rs @@ -0,0 +1,351 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Manage virtual device's legacy interrupts based on Linux KVM framework. +//! +//! On x86 platforms, legacy interrupts are those managed by the Master PIC, the slave PIC and +//! IOAPICs. + +use kvm_bindings::KVM_IRQ_ROUTING_IRQCHIP; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::{KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE}; +use vmm_sys_util::eventfd::EFD_NONBLOCK; + +use super::*; + +#[cfg(target_arch = "x86_64")] +/// Maximum number of legacy interrupts supported. +pub const MAX_LEGACY_IRQS: u32 = 24; + +#[cfg(target_arch = "aarch64")] +/// Maximum number of legacy interrupts supported. 
+pub const MAX_LEGACY_IRQS: u32 = 128; + +pub(super) struct LegacyIrq { + base: u32, + vmfd: Arc, + irqfd: EventFd, +} + +impl LegacyIrq { + pub(super) fn new( + base: InterruptIndex, + count: InterruptIndex, + vmfd: Arc, + _routes: Arc, + ) -> Result { + if count != 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + if base >= MAX_LEGACY_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + Ok(LegacyIrq { + base, + vmfd, + irqfd: EventFd::new(EFD_NONBLOCK)?, + }) + } + + #[cfg(target_arch = "x86_64")] + fn add_legacy_entry( + gsi: u32, + chip: u32, + pin: u32, + routes: &mut HashMap, + ) -> Result<()> { + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + // Safe because we are initializing all fields of the `irqchip` struct. + entry.u.irqchip.irqchip = chip; + entry.u.irqchip.pin = pin; + routes.insert(hash_key(&entry), entry); + + Ok(()) + } + + /// Build routings for IRQs connected to the master PIC, the slave PIC or the first IOAPIC. 
+ #[cfg(target_arch = "x86_64")] + pub(super) fn initialize_legacy( + routes: &mut HashMap, + ) -> Result<()> { + // Build routings for the master PIC + for i in 0..8 { + if i != 2 { + Self::add_legacy_entry(i, KVM_IRQCHIP_PIC_MASTER, i, routes)?; + } + } + + // Build routings for the slave PIC + for i in 8..16 { + Self::add_legacy_entry(i, KVM_IRQCHIP_PIC_SLAVE, i - 8, routes)?; + } + + // Build routings for the first IOAPIC + for i in 0..MAX_LEGACY_IRQS { + if i == 0 { + Self::add_legacy_entry(i, KVM_IRQCHIP_IOAPIC, 2, routes)?; + } else if i != 2 { + Self::add_legacy_entry(i, KVM_IRQCHIP_IOAPIC, i, routes)?; + }; + } + + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + pub(super) fn initialize_legacy( + routes: &mut HashMap, + ) -> Result<()> { + for i in 0..MAX_LEGACY_IRQS { + let mut entry = kvm_irq_routing_entry { + gsi: i, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + entry.u.irqchip.irqchip = 0; + entry.u.irqchip.pin = i; + routes.insert(hash_key(&entry), entry); + } + Ok(()) + } +} + +impl InterruptSourceGroup for LegacyIrq { + fn interrupt_type(&self) -> InterruptSourceType { + InterruptSourceType::LegacyIrq + } + + fn len(&self) -> u32 { + 1 + } + + fn base(&self) -> u32 { + self.base + } + + fn enable(&self, configs: &[InterruptSourceConfig]) -> Result<()> { + if configs.len() != 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + // The IRQ routings for legacy IRQs have been configured during KvmIrqManager::initialize(), + // so only need to register irqfd to the KVM driver. 
+ self.vmfd + .register_irqfd(&self.irqfd, self.base) + .map_err(from_sys_util_errno) + } + + fn disable(&self) -> Result<()> { + self.vmfd + .unregister_irqfd(&self.irqfd, self.base) + .map_err(from_sys_util_errno) + } + + fn update(&self, index: InterruptIndex, _config: &InterruptSourceConfig) -> Result<()> { + // For legacy interrupts, the routing configuration is managed by the PIC/IOAPIC interrupt + // controller drivers, so nothing to do here. + if index != 0 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + Ok(()) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + if index != 0 { + None + } else { + Some(&self.irqfd) + } + } + + fn trigger(&self, index: InterruptIndex) -> Result<()> { + if index != 0 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + self.irqfd.write(1) + } + + fn mask(&self, index: InterruptIndex) -> Result<()> { + if index > 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + self.vmfd + .unregister_irqfd(&self.irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn unmask(&self, index: InterruptIndex) -> Result<()> { + if index > 1 { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + self.vmfd + .register_irqfd(&self.irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn get_pending_state(&self, index: InterruptIndex) -> bool { + if index > 1 { + return false; + } + + // Peak the EventFd.count by reading and writing back. + // The irqfd must be in NON-BLOCKING mode. + match self.irqfd.read() { + Err(_) => false, + Ok(count) => { + if count != 0 && self.irqfd.write(count).is_err() { + // Hope the caller will handle the pending state corrrectly, + // then no interrupt will be lost. 
+ } + count != 0 + } + } + } +} + +#[cfg(test)] +#[cfg(target_arch = "x86_64")] +mod test { + use super::*; + use crate::manager::tests::create_vm_fd; + + const MASTER_PIC: usize = 7; + const SLAVE_PIC: usize = 8; + const IOAPIC: usize = 23; + + #[test] + #[allow(unreachable_patterns)] + fn test_legacy_interrupt_group() { + let vmfd = Arc::new(create_vm_fd()); + let rounting = Arc::new(KvmIrqRouting::new(vmfd.clone())); + let base = 0; + let count = 1; + let group = LegacyIrq::new(base, count, vmfd.clone(), rounting.clone()).unwrap(); + + let legacy_fds = vec![InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})]; + + match group.interrupt_type() { + InterruptSourceType::LegacyIrq => {} + _ => { + panic!(); + } + } + vmfd.create_irq_chip().unwrap(); + assert_eq!(group.len(), 1); + assert_eq!(group.base(), base); + group.enable(&legacy_fds).unwrap(); + group.notifier(0).unwrap().write(1).unwrap(); + group.trigger(0).unwrap(); + assert!(group.trigger(1).is_err()); + group + .update( + 0, + &InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {}), + ) + .unwrap(); + group.disable().unwrap(); + + assert!(LegacyIrq::new(base, 2, vmfd.clone(), rounting.clone()).is_err()); + assert!(LegacyIrq::new(110, 1, vmfd, rounting).is_err()); + } + + #[test] + fn test_irq_routing_initialize_legacy() { + let vmfd = Arc::new(create_vm_fd()); + let routing = KvmIrqRouting::new(vmfd.clone()); + + // this would ok on 4.9 kernel + assert!(routing.initialize().is_err()); + + vmfd.create_irq_chip().unwrap(); + routing.initialize().unwrap(); + + let routes = &routing.routes.lock().unwrap(); + assert_eq!(routes.len(), MASTER_PIC + SLAVE_PIC + IOAPIC); + } + + #[test] + fn test_routing_opt() { + let vmfd = Arc::new(create_vm_fd()); + let routing = KvmIrqRouting::new(vmfd.clone()); + + // this would ok on 4.9 kernel + assert!(routing.initialize().is_err()); + + vmfd.create_irq_chip().unwrap(); + routing.initialize().unwrap(); + + let mut entry = kvm_irq_routing_entry { + gsi: 8, + 
type_: kvm_bindings::KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + + // Safe because we are initializing all fields of the `irqchip` struct. + entry.u.irqchip.irqchip = 0; + entry.u.irqchip.pin = 3; + + let entrys = vec![entry]; + + assert!(routing.modify(&entry).is_err()); + routing.add(&entrys).unwrap(); + entry.u.irqchip.pin = 4; + routing.modify(&entry).unwrap(); + routing.remove(&entrys).unwrap(); + assert!(routing.modify(&entry).is_err()); + } + + #[test] + fn test_routing_set_routing() { + let vmfd = Arc::new(create_vm_fd()); + let routing = KvmIrqRouting::new(vmfd.clone()); + + // this would ok on 4.9 kernel + assert!(routing.initialize().is_err()); + + vmfd.create_irq_chip().unwrap(); + routing.initialize().unwrap(); + + let mut entry = kvm_irq_routing_entry { + gsi: 8, + type_: kvm_bindings::KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + entry.u.irqchip.irqchip = 0; + entry.u.irqchip.pin = 3; + + routing + .routes + .lock() + .unwrap() + .insert(hash_key(&entry), entry); + let routes = routing.routes.lock().unwrap(); + routing.set_routing(&routes).unwrap(); + } + + #[test] + fn test_has_key() { + let gsi = 4; + let mut entry = kvm_irq_routing_entry { + gsi, + type_: kvm_bindings::KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + // Safe because we are initializing all fields of the `irqchip` struct. + entry.u.irqchip.irqchip = kvm_bindings::KVM_IRQCHIP_PIC_MASTER; + entry.u.irqchip.pin = gsi; + assert_eq!(hash_key(&entry), 0x0001_0000_0004); + } +} diff --git a/src/dragonball/src/dbs_interrupt/src/kvm/mod.rs b/src/dragonball/src/dbs_interrupt/src/kvm/mod.rs new file mode 100644 index 000000000..435bb20a7 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/kvm/mod.rs @@ -0,0 +1,340 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Manage virtual device's interrupts based on the Linux KVM framework. +//! +//! 
When updaing KVM IRQ routing by ioctl(KVM_SET_GSI_ROUTING), all interrupts of the virtual +//! machine must be updated all together. The [KvmIrqRouting](struct.KvmIrqRouting.html) structure +//! is to maintain the global interrupt routing table. +//! +//! It deserves a good documentation about the way that KVM based vmms manages interrupts. From the +//! KVM hypervisor side, it provides three mechanism to support injecting interrupts into guests: +//! 1) Irqfd. When data is written to an irqfd, it triggers KVM to inject an interrupt into guest. +//! 2) Irq routing. Irq routing determines the way to inject an irq into guest. +//! 3) Signal MSI. Vmm can inject an MSI interrupt into guest by issuing KVM_SIGNAL_MSI ioctl. +//! +//! Most VMMs use irqfd + irq routing to support interrupt injecting, so we will focus on this mode. +//! The flow to enable interrupt injecting is: +//! 1) VMM creates an irqfd +//! 2) VMM invokes KVM_IRQFD to bind the irqfd to an interrupt source +//! 3) VMM invokes KVM_SET_GSI_ROUTING to configure the way to inject the interrupt into guest +//! 4) device backend driver writes to the irqfd +//! 5) an interurpt is injected into the guest + +use std::collections::HashMap; +use std::io::{Error, ErrorKind}; +use std::sync::{Arc, Mutex}; + +use kvm_bindings::{kvm_irq_routing, kvm_irq_routing_entry}; +use kvm_ioctls::VmFd; + +use super::*; + +#[cfg(feature = "kvm-legacy-irq")] +use legacy_irq::LegacyIrq; +#[cfg(feature = "kvm-msi-irq")] +use msi_irq::MsiIrq; + +#[cfg(feature = "kvm-legacy-irq")] +mod legacy_irq; +#[cfg(feature = "kvm-msi-generic")] +mod msi_generic; +#[cfg(feature = "kvm-msi-irq")] +mod msi_irq; + +/// Maximum number of global interrupt sources. +pub const MAX_IRQS: InterruptIndex = 1024; + +/// Default maximum number of Message Signaled Interrupts per device. +pub const DEFAULT_MAX_MSI_IRQS_PER_DEVICE: InterruptIndex = 256; + +/// Structure to manage interrupt sources for a virtual machine based on the Linux KVM framework. 
+/// +/// The KVM framework provides methods to inject interrupts into the target virtual machines, which +/// uses irqfd to notity the KVM kernel module for injecting interrupts. When the interrupt source, +/// usually a virtual device backend in userspace, writes to the irqfd file descriptor, the KVM +/// kernel module will inject a corresponding interrupt into the target VM according to the IRQ +/// routing configuration. +pub struct KvmIrqManager { + mgr: Mutex, +} + +impl KvmIrqManager { + /// Create a new interrupt manager based on the Linux KVM framework. + /// + /// # Arguments + /// * `vmfd`: The KVM VM file descriptor, which will be used to access the KVM subsystem. + pub fn new(vmfd: Arc) -> Self { + KvmIrqManager { + mgr: Mutex::new(KvmIrqManagerObj { + vmfd: vmfd.clone(), + groups: HashMap::new(), + routes: Arc::new(KvmIrqRouting::new(vmfd)), + max_msi_irqs: DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + }), + } + } + + /// Prepare the interrupt manager for generating interrupts into the target VM. + pub fn initialize(&self) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + let mgr = self.mgr.lock().unwrap(); + mgr.initialize() + } + + /// Set maximum supported MSI interrupts per device. + pub fn set_max_msi_irqs(&self, max_msi_irqs: InterruptIndex) { + let mut mgr = self.mgr.lock().unwrap(); + mgr.max_msi_irqs = max_msi_irqs; + } +} + +impl InterruptManager for KvmIrqManager { + fn create_group( + &self, + ty: InterruptSourceType, + base: InterruptIndex, + count: u32, + ) -> Result>> { + // Safe to unwrap because there's no legal way to break the mutex. + let mut mgr = self.mgr.lock().unwrap(); + mgr.create_group(ty, base, count) + } + + fn destroy_group(&self, group: Arc>) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let mut mgr = self.mgr.lock().unwrap(); + mgr.destroy_group(group) + } +} + +struct KvmIrqManagerObj { + vmfd: Arc, + routes: Arc, + groups: HashMap>>, + max_msi_irqs: InterruptIndex, +} + +impl KvmIrqManagerObj { + fn initialize(&self) -> Result<()> { + self.routes.initialize()?; + Ok(()) + } + + fn create_group( + &mut self, + ty: InterruptSourceType, + base: InterruptIndex, + count: u32, + ) -> Result>> { + #[allow(unreachable_patterns)] + let group: Arc> = match ty { + #[cfg(feature = "kvm-legacy-irq")] + InterruptSourceType::LegacyIrq => Arc::new(Box::new(LegacyIrq::new( + base, + count, + self.vmfd.clone(), + self.routes.clone(), + )?)), + #[cfg(feature = "kvm-msi-irq")] + InterruptSourceType::MsiIrq => Arc::new(Box::new(MsiIrq::new( + base, + count, + self.max_msi_irqs, + self.vmfd.clone(), + self.routes.clone(), + )?)), + _ => return Err(Error::from(ErrorKind::InvalidInput)), + }; + + self.groups.insert(base, group.clone()); + + Ok(group) + } + + fn destroy_group(&mut self, group: Arc>) -> Result<()> { + self.groups.remove(&group.base()); + Ok(()) + } +} + +// Use (entry.type, entry.gsi) as the hash key because entry.gsi can't uniquely identify an +// interrupt source on x86 platforms. The PIC and IOAPIC may share the same GSI on x86 platforms. +fn hash_key(entry: &kvm_irq_routing_entry) -> u64 { + let type1 = match entry.type_ { + #[cfg(feature = "kvm-legacy-irq")] + kvm_bindings::KVM_IRQ_ROUTING_IRQCHIP => unsafe { entry.u.irqchip.irqchip }, + _ => 0u32, + }; + (u64::from(type1) << 48 | u64::from(entry.type_) << 32) | u64::from(entry.gsi) +} + +pub(super) struct KvmIrqRouting { + vm_fd: Arc, + routes: Mutex>, +} + +impl KvmIrqRouting { + pub(super) fn new(vm_fd: Arc) -> Self { + KvmIrqRouting { + vm_fd, + routes: Mutex::new(HashMap::new()), + } + } + + pub(super) fn initialize(&self) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. 
+ #[allow(unused_mut)] + let mut routes = self.routes.lock().unwrap(); + + #[cfg(feature = "kvm-legacy-irq")] + LegacyIrq::initialize_legacy(&mut routes)?; + + self.set_routing(&routes)?; + + Ok(()) + } + + fn set_routing(&self, routes: &HashMap) -> Result<()> { + // Allocate enough buffer memory. + let elem_sz = std::mem::size_of::(); + let total_sz = std::mem::size_of::() * routes.len() + elem_sz; + let elem_cnt = (total_sz + elem_sz - 1) / elem_sz; + let mut irq_routings = Vec::::with_capacity(elem_cnt); + irq_routings.resize_with(elem_cnt, Default::default); + + // Prepare the irq_routing header. + let irq_routing = &mut irq_routings[0]; + irq_routing.nr = routes.len() as u32; + irq_routing.flags = 0; + + // Safe because we have just allocated enough memory above. + let irq_routing_entries = unsafe { irq_routing.entries.as_mut_slice(routes.len()) }; + for (idx, entry) in routes.values().enumerate() { + irq_routing_entries[idx] = *entry; + } + + self.vm_fd + .set_gsi_routing(irq_routing) + .map_err(from_sys_util_errno)?; + + Ok(()) + } +} + +#[cfg(feature = "kvm-msi-generic")] +impl KvmIrqRouting { + pub(super) fn add(&self, entries: &[kvm_irq_routing_entry]) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + let mut routes = self.routes.lock().unwrap(); + for entry in entries { + if entry.gsi >= MAX_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } else if routes.contains_key(&hash_key(entry)) { + return Err(std::io::Error::from_raw_os_error(libc::EEXIST)); + } + } + + for entry in entries { + let _ = routes.insert(hash_key(entry), *entry); + } + self.set_routing(&routes) + } + + pub(super) fn remove(&self, entries: &[kvm_irq_routing_entry]) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let mut routes = self.routes.lock().unwrap(); + for entry in entries { + let _ = routes.remove(&hash_key(entry)); + } + self.set_routing(&routes) + } + + pub(super) fn modify(&self, entry: &kvm_irq_routing_entry) -> Result<()> { + // Safe to unwrap because there's no legal way to break the mutex. + let mut routes = self.routes.lock().unwrap(); + if !routes.contains_key(&hash_key(entry)) { + return Err(std::io::Error::from_raw_os_error(libc::ENOENT)); + } + + let _ = routes.insert(hash_key(entry), *entry); + self.set_routing(&routes) + } +} + +/// Helper function convert from vmm_sys_util::errno::Error to std::io::Error. +pub fn from_sys_util_errno(e: vmm_sys_util::errno::Error) -> std::io::Error { + std::io::Error::from_raw_os_error(e.errno()) +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use crate::manager::tests::create_vm_fd; + + fn create_irq_group( + manager: Arc, + _vmfd: Arc, + ) -> Arc> { + let base = 0; + let count = 1; + + manager + .create_group(InterruptSourceType::LegacyIrq, base, count) + .unwrap() + } + + fn create_msi_group( + manager: Arc, + _vmfd: Arc, + ) -> Arc> { + let base = 168; + let count = 32; + + manager + .create_group(InterruptSourceType::MsiIrq, base, count) + .unwrap() + } + + pub fn create_kvm_irq_manager() -> (Arc, KvmIrqManager) { + let vmfd = Arc::new(create_vm_fd()); + let manager = KvmIrqManager::new(vmfd.clone()); + vmfd.create_irq_chip().unwrap(); + manager.initialize().unwrap(); + (vmfd, manager) + } + + #[test] + fn test_create_kvm_irq_manager() { + let _ = create_kvm_irq_manager(); + } + + #[test] + fn test_kvm_irq_manager_opt() { + let vmfd = Arc::new(create_vm_fd()); + vmfd.create_irq_chip().unwrap(); + let manager = Arc::new(KvmIrqManager::new(vmfd.clone())); + manager.initialize().unwrap(); + + // set max irqs + manager.set_max_msi_irqs(0x128); + assert_eq!(manager.mgr.lock().unwrap().max_msi_irqs, 0x128); + + // irq + let group = create_irq_group(manager.clone(), vmfd.clone()); + let _ = group.clone(); 
+ manager.destroy_group(group).unwrap(); + + // msi + let group = create_msi_group(manager.clone(), vmfd); + let _ = group.clone(); + manager.destroy_group(group).unwrap(); + } + + #[test] + fn test_from_sys_util_errno() { + let error = vmm_sys_util::errno::Error::new(1); + let io_error = from_sys_util_errno(error); + assert_eq!(io_error.kind(), std::io::ErrorKind::PermissionDenied); + } +} diff --git a/src/dragonball/src/dbs_interrupt/src/kvm/msi_generic.rs b/src/dragonball/src/dbs_interrupt/src/kvm/msi_generic.rs new file mode 100644 index 000000000..eedef67d0 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/kvm/msi_generic.rs @@ -0,0 +1,132 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Helper utilities for handling MSI interrupts. + +use kvm_bindings::{kvm_irq_routing_entry, KVM_IRQ_ROUTING_MSI}; +use vmm_sys_util::eventfd::EFD_NONBLOCK; + +use super::*; + +pub(crate) struct MsiConfig { + pub(super) irqfd: EventFd, + pub(crate) config: Mutex, +} + +impl MsiConfig { + pub(crate) fn new() -> Self { + MsiConfig { + irqfd: EventFd::new(EFD_NONBLOCK).unwrap(), + config: Mutex::new(Default::default()), + } + } +} + +pub(super) fn new_msi_routing_entry( + gsi: InterruptIndex, + msicfg: &MsiIrqSourceConfig, +) -> kvm_irq_routing_entry { + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + entry.u.msi.address_hi = msicfg.high_addr; + entry.u.msi.address_lo = msicfg.low_addr; + entry.u.msi.data = msicfg.data; + if let Some(dev_id) = msicfg.device_id { + entry.u.msi.__bindgen_anon_1.devid = dev_id; + entry.flags = kvm_bindings::KVM_MSI_VALID_DEVID; + } + + entry +} + +#[allow(irrefutable_let_patterns)] +pub(super) fn create_msi_routing_entries( + base: InterruptIndex, + configs: &[InterruptSourceConfig], +) -> Result> { + let _ = base + .checked_add(configs.len() as u32) + .ok_or_else(|| std::io::Error::from_raw_os_error(libc::EINVAL))?; + let mut 
entries = Vec::with_capacity(configs.len()); + for (i, ref val) in configs.iter().enumerate() { + if let InterruptSourceConfig::MsiIrq(msicfg) = val { + let entry = new_msi_routing_entry(base + i as u32, msicfg); + entries.push(entry); + } else { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + } + + Ok(entries) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_create_msiconfig() { + let config = MsiConfig::new(); + config.irqfd.write(1).unwrap(); + } + + #[test] + fn test_new_msi_routing_single() { + let test_gsi = 4; + let msi_source_config = MsiIrqSourceConfig { + high_addr: 0x1234, + low_addr: 0x5678, + data: 0x9876, + msg_ctl: 0, + device_id: None, + }; + let entry = new_msi_routing_entry(test_gsi, &msi_source_config); + assert_eq!(entry.gsi, test_gsi); + assert_eq!(entry.type_, KVM_IRQ_ROUTING_MSI); + unsafe { + assert_eq!(entry.u.msi.address_hi, msi_source_config.high_addr); + assert_eq!(entry.u.msi.address_lo, msi_source_config.low_addr); + assert_eq!(entry.u.msi.data, msi_source_config.data); + } + } + + #[cfg(all(feature = "legacy_irq", target_arch = "x86_64"))] + #[test] + fn test_new_msi_routing_multi() { + let mut msi_fds = Vec::with_capacity(16); + for _ in 0..16 { + msi_fds.push(InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x1234, + low_addr: 0x5678, + data: 0x9876, + msg_ctl: 0, + device_id: None, + })); + } + let mut legacy_fds = Vec::with_capacity(16); + for _ in 0..16 { + legacy_fds.push(InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})); + } + + let base = 0; + let entrys = create_msi_routing_entries(0, &msi_fds).unwrap(); + + for (i, entry) in entrys.iter().enumerate() { + assert_eq!(entry.gsi, (base + i) as u32); + assert_eq!(entry.type_, KVM_IRQ_ROUTING_MSI); + if let InterruptSourceConfig::MsiIrq(config) = &msi_fds[i] { + unsafe { + assert_eq!(entry.u.msi.address_hi, config.high_addr); + assert_eq!(entry.u.msi.address_lo, config.low_addr); + assert_eq!(entry.u.msi.data, 
config.data); + } + } + } + + assert!(create_msi_routing_entries(0, &legacy_fds).is_err()); + assert!(create_msi_routing_entries(!0, &msi_fds).is_err()); + } +} diff --git a/src/dragonball/src/dbs_interrupt/src/kvm/msi_irq.rs b/src/dragonball/src/dbs_interrupt/src/kvm/msi_irq.rs new file mode 100644 index 000000000..50e1cdb33 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/kvm/msi_irq.rs @@ -0,0 +1,276 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Manage virtual device's PCI MSI/PCI MSIx interrupts based on Linux KVM framework. +//! +//! To optimize for performance by avoiding unnecessary locking and state checking, we assume that +//! the caller will take the responsibility to maintain the interrupt states and only issue valid +//! requests to this driver. If the caller doesn't obey the contract, only the current virtual +//! machine will be affected, it shouldn't break the host or other virtual machines. + +use super::msi_generic::{create_msi_routing_entries, new_msi_routing_entry, MsiConfig}; +use super::*; + +pub(super) struct MsiIrq { + base: InterruptIndex, + count: InterruptIndex, + vmfd: Arc, + irq_routing: Arc, + msi_configs: Vec, +} + +impl MsiIrq { + pub(super) fn new( + base: InterruptIndex, + count: InterruptIndex, + max_msi_irqs: InterruptIndex, + vmfd: Arc, + irq_routing: Arc, + ) -> Result { + if count > max_msi_irqs || base >= MAX_IRQS || base + count > MAX_IRQS { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let mut msi_configs = Vec::with_capacity(count as usize); + for _ in 0..count { + msi_configs.push(MsiConfig::new()); + } + + Ok(MsiIrq { + base, + count, + vmfd, + irq_routing, + msi_configs, + }) + } +} + +impl InterruptSourceGroup for MsiIrq { + fn interrupt_type(&self) -> InterruptSourceType { + InterruptSourceType::MsiIrq + } + + fn len(&self) -> u32 { + self.count + } + + fn base(&self) -> u32 { + self.base + } + + fn enable(&self, configs: 
&[InterruptSourceConfig]) -> Result<()> { + if configs.len() != self.count as usize { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + // First add IRQ routings for all the MSI interrupts. + let entries = create_msi_routing_entries(self.base, configs)?; + + self.irq_routing + .add(&entries) + .or_else(|err| match err.kind() { + // The irq_routing was already restored when the snapshot was restored, so the AlreadyExists error is ignored here. + std::io::ErrorKind::AlreadyExists => Ok(()), + _ => Err(err), + })?; + + // Then register irqfds to the KVM module. + for i in 0..self.count { + let irqfd = &self.msi_configs[i as usize].irqfd; + self.vmfd + .register_irqfd(irqfd, self.base + i) + .map_err(from_sys_util_errno)?; + } + + Ok(()) + } + + fn disable(&self) -> Result<()> { + // First unregister all irqfds, so it won't trigger anymore. + for i in 0..self.count { + let irqfd = &self.msi_configs[i as usize].irqfd; + self.vmfd + .unregister_irqfd(irqfd, self.base + i) + .map_err(from_sys_util_errno)?; + } + + // Then tear down the IRQ routings for all the MSI interrupts. + let mut entries = Vec::with_capacity(self.count as usize); + for i in 0..self.count { + // Safe to unwrap because there's no legal way to break the mutex. + let msicfg = self.msi_configs[i as usize].config.lock().unwrap(); + let entry = new_msi_routing_entry(self.base + i, &msicfg); + entries.push(entry); + } + self.irq_routing.remove(&entries)?; + + Ok(()) + } + + #[allow(irrefutable_let_patterns)] + fn update(&self, index: InterruptIndex, config: &InterruptSourceConfig) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + if let InterruptSourceConfig::MsiIrq(ref cfg) = config { + // Safe to unwrap because there's no legal way to break the mutex. 
+ let entry = { + let mut msicfg = self.msi_configs[index as usize].config.lock().unwrap(); + msicfg.high_addr = cfg.high_addr; + msicfg.low_addr = cfg.low_addr; + msicfg.data = cfg.data; + msicfg.device_id = cfg.device_id; + new_msi_routing_entry(self.base + index, &msicfg) + }; + self.irq_routing.modify(&entry) + } else { + Err(std::io::Error::from_raw_os_error(libc::EINVAL)) + } + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + if index >= self.count { + None + } else { + let msi_config = &self.msi_configs[index as usize]; + Some(&msi_config.irqfd) + } + } + + fn trigger(&self, index: InterruptIndex) -> Result<()> { + // Assume that the caller will maintain the interrupt states and only call this function + // when suitable. + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + let msi_config = &self.msi_configs[index as usize]; + msi_config.irqfd.write(1) + } + + fn mask(&self, index: InterruptIndex) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let irqfd = &self.msi_configs[index as usize].irqfd; + self.vmfd + .unregister_irqfd(irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn unmask(&self, index: InterruptIndex) -> Result<()> { + if index >= self.count { + return Err(std::io::Error::from_raw_os_error(libc::EINVAL)); + } + + let irqfd = &self.msi_configs[index as usize].irqfd; + self.vmfd + .register_irqfd(irqfd, self.base + index) + .map_err(from_sys_util_errno)?; + + Ok(()) + } + + fn get_pending_state(&self, index: InterruptIndex) -> bool { + if index >= self.count { + return false; + } + + // Peak the EventFd.count by reading and writing back. + // The irqfd must be in NON-BLOCKING mode. 
+ let irqfd = &self.msi_configs[index as usize].irqfd; + match irqfd.read() { + Err(_) => false, + Ok(count) => { + if count != 0 && irqfd.write(count).is_err() { + // Hope the caller will handle the pending state corrrectly, + // then no interrupt will be lost. + // Really no way to recover here! + } + count != 0 + } + } + } +} + +#[cfg(target_arch = "x86_64")] +#[cfg(test)] +mod test { + use super::*; + use crate::manager::tests::create_vm_fd; + + #[test] + #[allow(unreachable_patterns)] + fn test_msi_interrupt_group() { + let vmfd = Arc::new(create_vm_fd()); + vmfd.create_irq_chip().unwrap(); + + let rounting = Arc::new(KvmIrqRouting::new(vmfd.clone())); + rounting.initialize().unwrap(); + + let base = 168; + let count = 32; + let group = MsiIrq::new( + base, + count, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + vmfd.clone(), + rounting.clone(), + ) + .unwrap(); + let mut msi_fds = Vec::with_capacity(count as usize); + + match group.interrupt_type() { + InterruptSourceType::MsiIrq => {} + _ => { + panic!(); + } + } + + for _ in 0..count { + let msi_source_config = MsiIrqSourceConfig { + high_addr: 0x1234, + low_addr: 0x5678, + data: 0x9876, + msg_ctl: 0x6789, + device_id: None, + }; + msi_fds.push(InterruptSourceConfig::MsiIrq(msi_source_config)); + } + + group.enable(&msi_fds).unwrap(); + assert_eq!(group.len(), count); + assert_eq!(group.base(), base); + + for i in 0..count { + let msi_source_config = MsiIrqSourceConfig { + high_addr: i + 0x1234, + low_addr: i + 0x5678, + data: i + 0x9876, + msg_ctl: i + 0x6789, + device_id: None, + }; + group.notifier(i).unwrap().write(1).unwrap(); + group.trigger(i).unwrap(); + group + .update(0, &InterruptSourceConfig::MsiIrq(msi_source_config)) + .unwrap(); + } + assert!(group.trigger(33).is_err()); + group.disable().unwrap(); + + assert!(MsiIrq::new( + base, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE + 1, + DEFAULT_MAX_MSI_IRQS_PER_DEVICE, + vmfd.clone(), + rounting.clone() + ) + .is_err()); + assert!(MsiIrq::new(1100, 1, 
DEFAULT_MAX_MSI_IRQS_PER_DEVICE, vmfd, rounting).is_err()); + } +} diff --git a/src/dragonball/src/dbs_interrupt/src/lib.rs b/src/dragonball/src/dbs_interrupt/src/lib.rs new file mode 100644 index 000000000..fab0123b2 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/lib.rs @@ -0,0 +1,244 @@ +// Copyright (C) 2019-2020 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! Software indicating an event that needs immediate attention. An interrupt alerts the processor +//! to a high-priority condition requiring the interruption of the current code the processor is +//! executing. The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! with the event. This interruption is temporary, and, after the interrupt handler finishes, +//! unless handling the interrupt has emitted a fatal error, the processor resumes normal +//! activities. +//! +//! Hardware interrupts are used by devices to communicate that they require attention from the +//! operating system, or a bare-metal program running on the CPU if there are no OSes. The act of +//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices +//! are usually associated with different interrupts using a unique value associated with each +//! interrupt. This makes it possible to know which hardware device caused which interrupts. These +//! interrupt values are often called IRQ lines, or just interrupt lines. +//! +//! Nowadays, IRQ lines is not the only mechanism to deliver device interrupts to processors. MSI +//! [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) is +//! another commonly used alternative in-band method of signaling an interrupt, using special +//! 
in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines. +//! While more complex to implement in a device, message signaled interrupts have some significant +//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts are +//! supported in PCI bus since its version 2.2, and in later available PCI Express bus. Some non-PCI +//! architectures also use message signaled interrupts. +//! +//! While IRQ is a term commonly used by Operating Systems when dealing with hardware interrupts, +//! the IRQ numbers managed by OSes are independent of the ones managed by VMM. For simplicity sake, +//! the term `Interrupt Source` is used instead of IRQ to represent both pin-based interrupts and +//! MSI interrupts. +//! +//! A device may support multiple types of interrupts, and each type of interrupt may support one or +//! multiple interrupt sources. For example, a PCI device may support: +//! * Legacy Irq: exactly one interrupt source. +//! * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +//! * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +//! +//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source. An ID +//! allocator will be used to allocate and free Interrupt Source Identifiers for devices. To +//! decouple this crate from the ID allocator, here we doesn't take the responsibility to +//! allocate/free Interrupt Source IDs but only makes use of assigned IDs. +//! +//! The overall flow to deal with interrupts is: +//! * the VMM creates an interrupt manager +//! * the VMM creates a device manager, passing on an reference to the interrupt manager +//! * the device manager passes on an reference to the interrupt manager to all registered devices +//! * guest kernel loads drivers for virtual devices +//! * guest device driver determines the type and number of interrupts needed, and update the device +//! configuration +//! 
* the virtual device backend requests the interrupt manager to create an interrupt group +//! according to guest configuration information + +use std::io::Error; +use std::ops::Deref; +use std::sync::Arc; + +use vmm_sys_util::eventfd::EventFd; + +mod manager; +pub use manager::MSI_DEVICE_ID_SHIFT; +pub use manager::{DeviceInterruptManager, DeviceInterruptMode, InterruptStatusRegister32}; + +mod notifier; +pub use self::notifier::*; + +#[cfg(feature = "kvm-irq")] +pub mod kvm; +#[cfg(feature = "kvm-irq")] +pub use self::kvm::KvmIrqManager; + +/// Reuse std::io::Result to simplify interoperability among crates. +pub type Result = std::io::Result; + +/// Data type to store an interrupt source identifier. +pub type InterruptIndex = u32; + +/// Type of interrupt source. +#[derive(Clone, Eq, PartialEq, Debug)] +pub enum InterruptSourceType { + #[cfg(feature = "legacy-irq")] + /// Legacy Pin-based Interrupt. + /// On x86 platforms, legacy interrupts are routed through 8259 PICs and/or IOAPICs. + LegacyIrq, + #[cfg(feature = "msi-irq")] + /// Message Signaled Interrupt (PCI MSI/PCI MSIx etc). + /// Some non-PCI devices (like HPET on x86) make use of generic MSI in platform specific ways. + MsiIrq, +} + +/// Configuration data for an interrupt source. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum InterruptSourceConfig { + #[cfg(feature = "legacy-irq")] + /// Configuration data for Legacy interrupts. + LegacyIrq(LegacyIrqSourceConfig), + #[cfg(feature = "msi-irq")] + /// Configuration data for PciMsi, PciMsix and generic MSI interrupts. + MsiIrq(MsiIrqSourceConfig), +} + +/// Configuration data for legacy interrupts. +/// +/// On x86 platforms, legacy interrupts means those interrupts routed through PICs or IOAPICs. +#[cfg(feature = "legacy-irq")] +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct LegacyIrqSourceConfig {} + +/// Configuration data for GenericMsi, PciMsi, PciMsix interrupts. 
+#[cfg(feature = "msi-irq")] +#[derive(Default, Clone, Debug, Eq, PartialEq)] +pub struct MsiIrqSourceConfig { + /// High address to deliver message signaled interrupt. + pub high_addr: u32, + /// Low address to deliver message signaled interrupt. + pub low_addr: u32, + /// Data to write to deliver message signaled interrupt. + pub data: u32, + /// Interrupt control state. + pub msg_ctl: u32, + /// Device id indicate the device who triggers this msi irq. + pub device_id: Option, +} + +/// Trait to manage interrupt sources for virtual device backends. +/// +/// The InterruptManager implementations should protect itself from concurrent accesses internally, +/// so it could be invoked from multi-threaded context. +pub trait InterruptManager { + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage interrupt + /// sources for a virtual device. + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// # Arguments + /// * type_: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group( + &self, + type_: InterruptSourceType, + base: InterruptIndex, + count: InterruptIndex, + ) -> Result>>; + + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by + /// [create_group()](trait.InterruptManager.html#tymethod.create_group). + /// + /// Assume the caller takes the responsibility to disable all interrupt sources of the group + /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup + /// implementations. 
+ fn destroy_group(&self, group: Arc>) -> Result<()>; +} + +impl InterruptManager for Arc { + fn create_group( + &self, + type_: InterruptSourceType, + base: u32, + count: u32, + ) -> std::result::Result>, Error> { + self.deref().create_group(type_, base, count) + } + + fn destroy_group( + &self, + group: Arc>, + ) -> std::result::Result<(), Error> { + self.deref().destroy_group(group) + } +} + +/// Trait to manage a group of interrupt sources for a device. +/// +/// A device may support several types of interrupts, and each type of interrupt may contain one or +/// multiple continuous interrupt sources. For example, a PCI device may concurrently support: +/// * Legacy Irq: exactly one interrupt source. +/// * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +/// * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +/// +/// PCI MSI interrupts of a device may not be configured individually, and must configured as a +/// whole block. So all interrupts of the same type of a device are abstracted as an +/// [InterruptSourceGroup](trait.InterruptSourceGroup.html) object, instead of abstracting each +/// interrupt source as a distinct InterruptSource. +#[allow(clippy::len_without_is_empty)] +pub trait InterruptSourceGroup: Send + Sync { + /// Get type of interrupt sources managed by the group. + fn interrupt_type(&self) -> InterruptSourceType; + + /// Get number of interrupt sources managed by the group. + fn len(&self) -> InterruptIndex; + + /// Get base of the assigned Interrupt Source Identifiers. + fn base(&self) -> InterruptIndex; + + /// Enable the interrupt sources in the group to generate interrupts. + fn enable(&self, configs: &[InterruptSourceConfig]) -> Result<()>; + + /// Disable the interrupt sources in the group to generate interrupts. + fn disable(&self) -> Result<()>; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. 
+ fn update(&self, index: InterruptIndex, config: &InterruptSourceConfig) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes to inject interrupts into + /// guest, by writing to the file returned by this method. + fn notifier(&self, _index: InterruptIndex) -> Option<&EventFd> { + None + } + + /// Inject an interrupt from this interrupt source into the guest. + /// + /// If the interrupt has an associated `interrupt_status` register, all bits set in `flag` will + /// be atomically ORed into the `interrupt_status` register. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Mask an interrupt from this interrupt source. + fn mask(&self, _index: InterruptIndex) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Unmask an interrupt from this interrupt source. + fn unmask(&self, _index: InterruptIndex) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Check whether there's pending interrupt. + fn get_pending_state(&self, _index: InterruptIndex) -> bool { + false + } +} diff --git a/src/dragonball/src/dbs_interrupt/src/manager.rs b/src/dragonball/src/dbs_interrupt/src/manager.rs new file mode 100644 index 000000000..bec623dc8 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/manager.rs @@ -0,0 +1,794 @@ +// Copyright (C) 2019-2020 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Interrupt manager to manage and switch device interrupt modes. +//! +//! A device may support multiple interrupt modes. For example, a PCI device may support legacy, PCI +//! MSI and PCI MSIx interrupts. This interrupt manager helps a device backend driver to manage its +//! interrupts and provides interfaces to switch interrupt working modes. 
+use std::io::{Error, Result}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; + +use dbs_device::resources::DeviceResources; + +#[cfg(feature = "legacy-irq")] +use super::LegacyIrqSourceConfig; +#[cfg(feature = "msi-irq")] +use super::MsiIrqSourceConfig; +use super::{InterruptManager, InterruptSourceConfig, InterruptSourceGroup, InterruptSourceType}; + +/// Defines the offset when device_id is recorded to msi. +/// +/// For the origin of this value, please refer to the comment of set_msi_device_id function. +pub const MSI_DEVICE_ID_SHIFT: u8 = 3; + +#[cfg(feature = "legacy-irq")] +const LEGACY_CONFIGS: [InterruptSourceConfig; 1] = + [InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {})]; + +#[cfg(feature = "msi-irq")] +const MSI_INT_MASK_BIT: u8 = 0; +#[cfg(feature = "msi-irq")] +const MSI_INT_MASK: u32 = (1 << MSI_INT_MASK_BIT) as u32; + +/// Device interrupt working modes. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum DeviceInterruptMode { + /// The device interrupt manager has been disabled. + Disabled = 0, + /// The device interrupt manager works in legacy irq mode. + LegacyIrq = 1, + /// The device interrupt manager works in generic MSI mode. + GenericMsiIrq = 2, + /// The device interrupt manager works in PCI MSI mode. + PciMsiIrq = 3, + /// The device interrupt manager works in PCI MSI-x mode. + PciMsixIrq = 4, +} + +/// A struct to manage interrupts and interrupt modes for a device. +/// +/// The interrupt manager may support multiple working mode. For example, an interrupt manager for a +/// PCI device may work in legacy mode, PCI MSI mode or PCI MSIx mode. Under certain conditions, the +/// interrupt manager may switch between interrupt working modes. To simplify implementation, +/// switching working mode is only supported at configuration stage and will be disabled at runtime +/// stage. The DeviceInterruptManager::enable() switches the interrupt manager from configuration +/// stage into runtime stage. 
And DeviceInterruptManager::reset() switches from runtime stage back +/// to initial configuration stage. +pub struct DeviceInterruptManager { + mode: DeviceInterruptMode, + activated: bool, + current_idx: usize, + mode2idx: [usize; 5], + intr_mgr: T, + intr_groups: Vec>>, + #[cfg(feature = "msi-irq")] + msi_config: Vec, + /// Device id indicate the device who triggers msi irq. + device_id: Option, +} + +impl DeviceInterruptManager { + /// Create an interrupt manager for a device. + /// + /// # Arguments + /// * `intr_mgr`: underline interrupt manager to allocate/free interrupt groups. + /// * `resources`: resources assigned to the device, including assigned interrupt resources. + pub fn new(intr_mgr: T, resources: &DeviceResources) -> Result { + let mut mgr = DeviceInterruptManager { + mode: DeviceInterruptMode::Disabled, + activated: false, + current_idx: usize::MAX, + mode2idx: [usize::MAX; 5], + intr_mgr, + intr_groups: Vec::new(), + #[cfg(feature = "msi-irq")] + msi_config: Vec::new(), + device_id: None, + }; + + #[cfg(feature = "legacy-irq")] + { + if let Some(irq) = resources.get_legacy_irq() { + let group = mgr + .intr_mgr + .create_group(InterruptSourceType::LegacyIrq, irq, 1)?; + mgr.mode2idx[DeviceInterruptMode::LegacyIrq as usize] = mgr.intr_groups.len(); + mgr.intr_groups.push(group); + } + } + + #[cfg(feature = "msi-irq")] + { + if let Some(msi) = resources.get_generic_msi_irqs() { + let group = mgr + .intr_mgr + .create_group(InterruptSourceType::MsiIrq, msi.0, msi.1)?; + mgr.resize_msi_config_space(group.len()); + mgr.mode2idx[DeviceInterruptMode::GenericMsiIrq as usize] = mgr.intr_groups.len(); + mgr.intr_groups.push(group); + } + + if let Some(msi) = resources.get_pci_msi_irqs() { + let group = mgr + .intr_mgr + .create_group(InterruptSourceType::MsiIrq, msi.0, msi.1)?; + mgr.resize_msi_config_space(group.len()); + mgr.mode2idx[DeviceInterruptMode::PciMsiIrq as usize] = mgr.intr_groups.len(); + mgr.intr_groups.push(group); + } + + if let Some(msi) 
= resources.get_pci_msix_irqs() { + let group = mgr + .intr_mgr + .create_group(InterruptSourceType::MsiIrq, msi.0, msi.1)?; + mgr.resize_msi_config_space(group.len()); + mgr.mode2idx[DeviceInterruptMode::PciMsixIrq as usize] = mgr.intr_groups.len(); + mgr.intr_groups.push(group); + } + } + + Ok(mgr) + } + + /// Set device_id for MSI routing + pub fn set_device_id(&mut self, device_id: Option) { + self.device_id = device_id; + } + + /// Check whether the interrupt manager has been activated. + pub fn is_enabled(&self) -> bool { + self.activated + } + + /// Switch the interrupt manager from configuration stage into runtime stage. + /// + /// The working mode could only be changed at configuration stage, and all requests to change + /// working mode at runtime stage will be rejected. + /// + /// If the interrupt manager is still in DISABLED mode when DeviceInterruptManager::enable() is + /// called, it will be put into LEGACY mode if LEGACY mode is supported. + pub fn enable(&mut self) -> Result<()> { + if self.activated { + return Ok(()); + } + + // Enter Legacy mode by default if Legacy mode is supported. + if self.mode == DeviceInterruptMode::Disabled + && self.mode2idx[DeviceInterruptMode::LegacyIrq as usize] != usize::MAX + { + self.set_working_mode(DeviceInterruptMode::LegacyIrq)?; + } + if self.mode == DeviceInterruptMode::Disabled { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + + self.intr_groups[self.current_idx].enable(self.get_configs(self.mode))?; + self.activated = true; + + Ok(()) + } + + /// Switch the interrupt manager from runtime stage back into initial configuration stage. + /// + /// Currently we doesn't track the usage of interrupt group object given out by `get_group()`, + /// so the the caller needs to take the responsibility to release all interrupt group object + /// reference before calling DeviceInterruptManager::reset(). 
+ pub fn reset(&mut self) -> Result<()> { + if self.activated { + self.activated = false; + self.intr_groups[self.current_idx].disable()?; + } + self.set_working_mode(DeviceInterruptMode::Disabled)?; + + Ok(()) + } + + /// Get the current interrupt working mode. + pub fn get_working_mode(&mut self) -> DeviceInterruptMode { + self.mode + } + + /// Switch interrupt working mode. + /// + /// Currently switching working mode is only supported during device configuration stage and + /// will always return failure if called during device runtime stage. The device switches from + /// configuration stage to runtime stage by invoking `DeviceInterruptManager::enable()`. With + /// this constraint, the device drivers may call `DeviceInterruptManager::get_group()` to get + /// the underline active interrupt group object, and directly calls the interrupt group object's + /// methods to trigger/acknowledge interrupts. + /// + /// This is a key design decision for optimizing performance. Though the DeviceInterruptManager + /// object itself is not multi-thread safe and must be protected from concurrent access by the + /// caller, the interrupt source group object is multi-thread safe and could be called + /// concurrently to trigger/acknowledge interrupts. This design may help to improve performance + /// for MSI interrupts. + /// + /// # Arguments + /// * `mode`: target working mode. + pub fn set_working_mode(&mut self, mode: DeviceInterruptMode) -> Result<()> { + // Can't switch mode agian once enabled. 
+ if self.activated { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + + if mode != self.mode { + // Supported state transitions: + // - other state -> DISABLED + // - DISABLED -> other + // - non-legacy -> legacy + // - legacy -> non-legacy + if self.mode != DeviceInterruptMode::Disabled + && self.mode != DeviceInterruptMode::LegacyIrq + && mode != DeviceInterruptMode::LegacyIrq + && mode != DeviceInterruptMode::Disabled + { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + + // Then enter new state + if mode != DeviceInterruptMode::Disabled { + self.current_idx = self.mode2idx[mode as usize]; + } else { + // We should reset irq configs when disable interrupt + self.reset_configs(mode); + } + self.mode = mode; + } + + Ok(()) + } + + /// Get the underline interrupt source group object, so the device driver could concurrently + /// trigger/acknowledge interrupts by using the returned group object. + pub fn get_group(&self) -> Option>> { + if !self.activated || self.mode == DeviceInterruptMode::Disabled { + None + } else { + Some(self.intr_groups[self.current_idx].clone()) + } + } + + /// Get the underline interrupt source group object, ignore the mode + pub fn get_group_unchecked(&self) -> Arc> { + self.intr_groups[self.current_idx].clone() + } + + /// Reconfigure a specific interrupt in current working mode at configuration or runtime stage. + /// + /// It's mainly used to reconfigure Generic MSI/PCI MSI/PCI MSIx interrupts. Actually legacy + /// interrupts don't support reconfiguration yet. 
+ #[allow(unused_variables)] + pub fn update(&mut self, index: u32) -> Result<()> { + if !self.activated { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + + match self.mode { + #[cfg(feature = "msi-irq")] + DeviceInterruptMode::GenericMsiIrq + | DeviceInterruptMode::PciMsiIrq + | DeviceInterruptMode::PciMsixIrq => { + let group = &self.intr_groups[self.current_idx]; + if index >= group.len() || index >= self.msi_config.len() as u32 { + return Err(Error::from_raw_os_error(libc::EINVAL)); + } + group.update(index, &self.msi_config[index as usize])?; + Ok(()) + } + _ => Err(Error::from_raw_os_error(libc::EINVAL)), + } + } + + fn get_configs(&self, mode: DeviceInterruptMode) -> &[InterruptSourceConfig] { + match mode { + #[cfg(feature = "legacy-irq")] + DeviceInterruptMode::LegacyIrq => &LEGACY_CONFIGS[..], + #[cfg(feature = "msi-irq")] + DeviceInterruptMode::GenericMsiIrq + | DeviceInterruptMode::PciMsiIrq + | DeviceInterruptMode::PciMsixIrq => { + let idx = self.mode2idx[mode as usize]; + let group_len = self.intr_groups[idx].len() as usize; + &self.msi_config[0..group_len] + } + _ => panic!("unhandled interrupt type in get_configs()"), + } + } + + fn reset_configs(&mut self, mode: DeviceInterruptMode) { + match mode { + #[cfg(feature = "msi-irq")] + DeviceInterruptMode::GenericMsiIrq + | DeviceInterruptMode::PciMsiIrq + | DeviceInterruptMode::PciMsixIrq => { + self.msi_config = vec![ + InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig::default()); + self.msi_config.len() + ]; + } + _ => {} + } + } +} + +#[cfg(feature = "msi-irq")] +impl DeviceInterruptManager { + /// Set the high address for a MSI message. 
+ #[allow(irrefutable_let_patterns)] + pub fn set_msi_high_address(&mut self, index: u32, data: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.high_addr = data; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + /// Set the low address for a MSI message. + #[allow(irrefutable_let_patterns)] + pub fn set_msi_low_address(&mut self, index: u32, data: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.low_addr = data; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + /// Set the data for a MSI message. + #[allow(irrefutable_let_patterns)] + pub fn set_msi_data(&mut self, index: u32, data: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.data = data; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + /// Set msi irq MASK bit + #[allow(irrefutable_let_patterns)] + pub fn set_msi_mask(&mut self, index: u32, mask: bool) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + let mut msg_ctl = msi.msg_ctl; + msg_ctl &= !MSI_INT_MASK; + if mask { + msg_ctl |= MSI_INT_MASK; + } + msi.msg_ctl = msg_ctl; + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + /// Get msi irq MASK state + #[allow(irrefutable_let_patterns)] + pub fn get_msi_mask(&self, index: u32) -> Result { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref msi) = self.msi_config[index as usize] { + return Ok((msi.msg_ctl & MSI_INT_MASK) == MSI_INT_MASK); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + #[cfg(target_arch 
= "aarch64")] + /// Set the device id for a MSI irq + pub fn set_msi_device_id(&mut self, index: u32) -> Result<()> { + if (index as usize) < self.msi_config.len() { + if let InterruptSourceConfig::MsiIrq(ref mut msi) = self.msi_config[index as usize] { + msi.device_id = self.device_id.map(|dev_id| { + // An pci device attach to ITS will have a new device id which is use for msi + // irq routing. It is calculated according to kernel function PCI_DEVID(), + // new_dev_id = (bus << 8) | devfn. In addition, devfn = device_id << 3, + // according to pci-host-ecam-generic's spec, and we implement bus = 0. + dev_id << MSI_DEVICE_ID_SHIFT + }); + return Ok(()); + } + } + Err(Error::from_raw_os_error(libc::EINVAL)) + } + + fn resize_msi_config_space(&mut self, size: u32) { + if self.msi_config.len() < size as usize { + self.msi_config = + vec![InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig::default()); size as usize]; + } + } +} + +/// Struct to implement a 32-bit interrupt status register. +#[derive(Default, Debug)] +pub struct InterruptStatusRegister32 { + status: AtomicU32, +} + +impl InterruptStatusRegister32 { + /// Create a status register instance. + pub fn new() -> Self { + InterruptStatusRegister32 { + status: AtomicU32::new(0), + } + } + + /// Read current value of the status register. + pub fn read(&self) -> u32 { + self.status.load(Ordering::SeqCst) + } + + /// Write value to the status register. + pub fn write(&self, value: u32) { + self.status.store(value, Ordering::SeqCst); + } + + /// Read current value and reset the status register to 0. + pub fn read_and_clear(&self) -> u32 { + self.status.swap(0, Ordering::SeqCst) + } + + /// Set bits into `value`. + pub fn set_bits(&self, value: u32) { + self.status.fetch_or(value, Ordering::SeqCst); + } + + /// Clear bits present in `value`. 
+ pub fn clear_bits(&self, value: u32) { + self.status.fetch_and(!value, Ordering::SeqCst); + } +} + +#[cfg(all(test, feature = "kvm-legacy-irq", feature = "kvm-msi-irq"))] +pub(crate) mod tests { + use std::sync::Arc; + + use dbs_device::resources::{DeviceResources, MsiIrqType, Resource}; + use kvm_ioctls::{Kvm, VmFd}; + + use super::*; + use crate::KvmIrqManager; + + pub(crate) fn create_vm_fd() -> VmFd { + let kvm = Kvm::new().unwrap(); + kvm.create_vm().unwrap() + } + + fn create_init_resources() -> DeviceResources { + let mut resources = DeviceResources::new(); + + resources.append(Resource::MmioAddressRange { + base: 0xd000_0000, + size: 0x10_0000, + }); + resources.append(Resource::LegacyIrq(0)); + resources.append(Resource::MsiIrq { + ty: MsiIrqType::GenericMsi, + base: 0x200, + size: 0x10, + }); + resources.append(Resource::MsiIrq { + ty: MsiIrqType::PciMsi, + base: 0x100, + size: 0x20, + }); + resources.append(Resource::MsiIrq { + ty: MsiIrqType::PciMsix, + base: 0x300, + size: 0x30, + }); + + resources + } + + fn create_interrupt_manager() -> DeviceInterruptManager> { + let vmfd = Arc::new(create_vm_fd()); + #[cfg(target_arch = "x86_64")] + vmfd.create_irq_chip().unwrap(); + #[cfg(target_arch = "aarch64")] + let _ = dbs_arch::gic::create_gic(&vmfd, 1); + let intr_mgr = Arc::new(KvmIrqManager::new(vmfd)); + + let resource = create_init_resources(); + intr_mgr.initialize().unwrap(); + DeviceInterruptManager::new(intr_mgr, &resource).unwrap() + } + + #[test] + fn test_create_device_interrupt_manager() { + let mut mgr = create_interrupt_manager(); + + assert_eq!(mgr.mode, DeviceInterruptMode::Disabled); + assert!(!mgr.activated); + assert_eq!(mgr.current_idx, usize::MAX); + assert_eq!(mgr.intr_groups.len(), 4); + assert!(!mgr.is_enabled()); + assert!(mgr.get_group().is_none()); + + // Enter legacy mode by default + mgr.enable().unwrap(); + assert!(mgr.is_enabled()); + assert_eq!( + mgr.mode2idx[DeviceInterruptMode::LegacyIrq as usize], + mgr.current_idx + ); 
+ assert!(mgr.get_group().is_some()); + assert_eq!( + mgr.get_group_unchecked().interrupt_type(), + InterruptSourceType::LegacyIrq + ); + + // Disable interrupt manager + mgr.reset().unwrap(); + assert!(!mgr.is_enabled()); + assert_eq!( + mgr.mode2idx[DeviceInterruptMode::LegacyIrq as usize], + mgr.current_idx + ); + assert_eq!(mgr.get_working_mode(), DeviceInterruptMode::Disabled); + assert!(mgr.get_group().is_none()); + } + + #[test] + fn test_device_interrupt_manager_switch_mode() { + let mut mgr = create_interrupt_manager(); + + // Can't switch working mode in enabled state. + mgr.enable().unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + mgr.reset().unwrap(); + + // Switch from LEGACY to PciMsi mode + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from LEGACY to PciMsix mode + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from LEGACY to GenericMsi mode + mgr.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + 
mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + + // Switch from DISABLED to PciMsi mode + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from DISABLED to PciMsix mode + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap_err(); + + // Switch from DISABLED to GenericMsi mode + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + mgr.set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap_err(); + mgr.set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap_err(); + + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + mgr.set_working_mode(DeviceInterruptMode::Disabled).unwrap(); + } + + #[test] + fn test_msi_config() { + let mut interrupt_manager = create_interrupt_manager(); + + assert!(interrupt_manager.set_msi_data(512, 0).is_err()); + interrupt_manager.set_msi_data(0, 0).unwrap(); + assert!(interrupt_manager.set_msi_high_address(512, 0).is_err()); + interrupt_manager.set_msi_high_address(0, 0).unwrap(); + assert!(interrupt_manager.set_msi_low_address(512, 0).is_err()); + interrupt_manager.set_msi_low_address(0, 0).unwrap(); + assert!(interrupt_manager.get_msi_mask(512).is_err()); + assert!(!interrupt_manager.get_msi_mask(0).unwrap()); + assert!(interrupt_manager.set_msi_mask(512, true).is_err()); + interrupt_manager.set_msi_mask(0, true).unwrap(); 
+ assert!(interrupt_manager.get_msi_mask(0).unwrap()); + } + + #[test] + fn test_set_working_mode_after_activated() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = true; + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::Disabled) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::PciMsiIrq) + .is_err()); + assert!(interrupt_manager + .set_working_mode(DeviceInterruptMode::PciMsixIrq) + .is_err()); + } + + #[test] + fn test_disable2legacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + } + + #[test] + fn test_disable2nonlegacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + } + + #[test] + fn test_legacy2nonlegacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + } + + #[test] + fn test_nonlegacy2legacy() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager.activated = false; + interrupt_manager.mode = DeviceInterruptMode::Disabled; + interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + interrupt_manager + 
.set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + } + + #[test] + fn test_update() { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + interrupt_manager.enable().unwrap(); + assert!(interrupt_manager.update(0x10).is_err()); + interrupt_manager.update(0x01).unwrap(); + interrupt_manager.reset().unwrap(); + interrupt_manager + .set_working_mode(DeviceInterruptMode::LegacyIrq) + .unwrap(); + assert!(interrupt_manager.update(0x10).is_err()); + } + + #[test] + fn test_get_configs() { + // legacy irq config + { + let interrupt_manager = create_interrupt_manager(); + + let legacy_config = interrupt_manager.get_configs(DeviceInterruptMode::LegacyIrq); + assert_eq!(legacy_config, LEGACY_CONFIGS); + } + + // generic irq config + { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + .unwrap(); + let msi_config = interrupt_manager.get_configs(DeviceInterruptMode::GenericMsiIrq); + assert_eq!(msi_config.len(), 0x10); + } + + // msi irq config + { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager + .set_working_mode(DeviceInterruptMode::PciMsiIrq) + .unwrap(); + let msi_config = interrupt_manager.get_configs(DeviceInterruptMode::PciMsiIrq); + assert_eq!(msi_config.len(), 0x20); + } + + // msix irq config + { + let mut interrupt_manager = create_interrupt_manager(); + interrupt_manager + .set_working_mode(DeviceInterruptMode::PciMsixIrq) + .unwrap(); + let msi_config = interrupt_manager.get_configs(DeviceInterruptMode::PciMsixIrq); + assert_eq!(msi_config.len(), 0x30); + } + } + + #[test] + fn test_reset_configs() { + let mut interrupt_manager = create_interrupt_manager(); + + interrupt_manager.reset_configs(DeviceInterruptMode::LegacyIrq); + interrupt_manager.reset_configs(DeviceInterruptMode::LegacyIrq); + + interrupt_manager.set_msi_data(0, 100).unwrap(); 
+ interrupt_manager.set_msi_high_address(0, 200).unwrap(); + interrupt_manager.set_msi_low_address(0, 300).unwrap(); + + interrupt_manager.reset_configs(DeviceInterruptMode::GenericMsiIrq); + assert_eq!( + interrupt_manager.msi_config[0], + InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig::default()) + ); + } + + #[test] + fn test_interrupt_status_register() { + let status = InterruptStatusRegister32::new(); + + assert_eq!(status.read(), 0); + status.write(0x13); + assert_eq!(status.read(), 0x13); + status.clear_bits(0x11); + assert_eq!(status.read(), 0x2); + status.set_bits(0x100); + assert_eq!(status.read_and_clear(), 0x102); + assert_eq!(status.read(), 0); + } +} diff --git a/src/dragonball/src/dbs_interrupt/src/notifier.rs b/src/dragonball/src/dbs_interrupt/src/notifier.rs new file mode 100644 index 000000000..0589f9e29 --- /dev/null +++ b/src/dragonball/src/dbs_interrupt/src/notifier.rs @@ -0,0 +1,230 @@ +// Copyright 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Event notifier to inject device interrupts to virtual machines. + +use std::any::Any; +use std::io::Error; +use std::sync::Arc; + +use vmm_sys_util::eventfd::EventFd; + +use crate::{InterruptIndex, InterruptSourceGroup, InterruptStatusRegister32}; + +#[cfg(feature = "legacy-irq")] +pub use self::legacy::*; +#[cfg(feature = "msi-irq")] +pub use self::msi::*; + +/// Trait to inject device interrupts to virtual machines. +pub trait InterruptNotifier: Send + Sync { + /// Inject a device interrupt to the virtual machine. + fn notify(&self) -> Result<(), Error>; + + /// Get the optional `EventFd` object to inject interrupt to the virtual machine. + fn notifier(&self) -> Option<&EventFd>; + + /// Clone a boxed dyn trait object. + fn clone_boxed(&self) -> Box; + + /// Convert `self` to `std::any::Any`. + fn as_any(&self) -> &dyn Any; +} + +#[cfg(feature = "legacy-irq")] +mod legacy { + use super::*; + + /// Struct to inject legacy interrupt to guest. 
+ #[derive(Clone)] + pub struct LegacyNotifier { + pub(crate) intr_group: Arc>, + pub(crate) intr_status: Arc, + pub(crate) status_bits: u32, + } + + impl LegacyNotifier { + /// Create a legacy notifier. + pub fn new( + intr_group: Arc>, + intr_status: Arc, + status_bits: u32, + ) -> Self { + Self { + intr_group, + intr_status, + status_bits, + } + } + } + + impl InterruptNotifier for LegacyNotifier { + fn notify(&self) -> Result<(), Error> { + self.intr_status.set_bits(self.status_bits); + self.intr_group.trigger(0) + } + + fn notifier(&self) -> Option<&EventFd> { + self.intr_group.notifier(0) + } + + fn clone_boxed(&self) -> Box { + Box::new(self.clone()) + } + + fn as_any(&self) -> &dyn Any { + self + } + } +} + +#[cfg(feature = "msi-irq")] +mod msi { + use super::*; + + /// Struct to inject message signalled interrupt to guest. + #[derive(Clone)] + pub struct MsiNotifier { + pub(crate) intr_group: Arc>, + pub(crate) intr_index: InterruptIndex, + } + + impl MsiNotifier { + /// Create a notifier to inject message signalled interrupt to guest. + pub fn new( + intr_group: Arc>, + intr_index: InterruptIndex, + ) -> Self { + MsiNotifier { + intr_group, + intr_index, + } + } + } + + impl InterruptNotifier for MsiNotifier { + fn notify(&self) -> Result<(), Error> { + self.intr_group.trigger(self.intr_index) + } + + fn notifier(&self) -> Option<&EventFd> { + self.intr_group.notifier(self.intr_index) + } + + fn clone_boxed(&self) -> Box { + Box::new(self.clone()) + } + + fn as_any(&self) -> &dyn Any { + self + } + } +} + +/// Struct to discard interrupts. +#[derive(Copy, Clone, Debug, Default)] +pub struct NoopNotifier {} + +impl NoopNotifier { + /// Create a noop notifier to discard interrupts. 
+ pub fn new() -> Self { + NoopNotifier {} + } +} + +impl InterruptNotifier for NoopNotifier { + fn notify(&self) -> Result<(), Error> { + Ok(()) + } + + fn notifier(&self) -> Option<&EventFd> { + None + } + + fn clone_boxed(&self) -> Box { + Box::new(*self) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Clone a boxed interrupt notifier object. +pub fn clone_notifier(notifier: &dyn InterruptNotifier) -> Box { + notifier.clone_boxed() +} + +#[cfg(test)] +mod tests { + #![allow(unused_imports)] + #![allow(dead_code)] + use super::*; + + use crate::{InterruptManager, InterruptSourceType}; + + const VIRTIO_INTR_VRING: u32 = 0x01; + const VIRTIO_INTR_CONFIG: u32 = 0x02; + + #[test] + fn create_virtio_null_notifier() { + let notifier = NoopNotifier::new(); + + notifier.notify().unwrap(); + assert!(notifier.notifier().is_none()); + } + + #[cfg(feature = "kvm-legacy-irq")] + #[test] + fn test_create_legacy_notifier() { + let (_vmfd, irq_manager) = crate::kvm::tests::create_kvm_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::LegacyIrq, 0, 1) + .unwrap(); + let status = Arc::new(InterruptStatusRegister32::new()); + assert_eq!(status.read(), 0); + + let notifer = LegacyNotifier::new(group.clone(), status.clone(), VIRTIO_INTR_CONFIG); + notifer.notify().unwrap(); + assert!(notifer.notifier().is_some()); + assert_eq!(notifer.status_bits, VIRTIO_INTR_CONFIG); + assert_eq!(status.read_and_clear(), VIRTIO_INTR_CONFIG); + assert_eq!(status.read(), 0); + + let notifier = LegacyNotifier::new(group.clone(), status.clone(), VIRTIO_INTR_VRING); + notifier.notify().unwrap(); + assert!(notifier.notifier().is_some()); + assert_eq!(status.read(), VIRTIO_INTR_VRING); + status.clear_bits(VIRTIO_INTR_VRING); + assert_eq!(status.read(), 0); + let eventfd = notifier.notifier().unwrap(); + assert_eq!(eventfd.read().unwrap(), 2); + + let clone = clone_notifier(¬ifier); + assert_eq!(clone.as_any().type_id(), notifier.as_any().type_id()); + } + + 
#[cfg(feature = "kvm-msi-irq")] + #[test] + fn test_virtio_msi_notifier() { + let (_vmfd, irq_manager) = crate::kvm::tests::create_kvm_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::MsiIrq, 0, 3) + .unwrap(); + let notifier1 = MsiNotifier::new(group.clone(), 1); + let notifier2 = MsiNotifier::new(group.clone(), 2); + let notifier3 = MsiNotifier::new(group.clone(), 3); + assert!(notifier1.notifier().is_some()); + assert!(notifier2.notifier().is_some()); + assert!(notifier3.notifier().is_none()); + + notifier1.notify().unwrap(); + notifier1.notify().unwrap(); + notifier2.notify().unwrap(); + assert_eq!(notifier1.notifier().unwrap().read().unwrap(), 2); + assert_eq!(notifier2.notifier().unwrap().read().unwrap(), 1); + + let clone = clone_notifier(¬ifier1); + assert_eq!(clone.as_any().type_id(), notifier1.as_any().type_id()); + } +} diff --git a/src/dragonball/src/dbs_legacy_devices/Cargo.toml b/src/dragonball/src/dbs_legacy_devices/Cargo.toml new file mode 100644 index 000000000..8655783f7 --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "dbs-legacy-devices" +version = "0.1.1" +authors = ["Alibaba Dragonball Team"] +license = "Apache-2.0 AND BSD-3-Clause" +edition = "2018" +description = "dbs-legacy-devices provides emulation for legacy devices." 
+homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "secure-sandbox", "devices", "legacy"] +readme = "README.md" + +[dependencies] +dbs-device = { path = "../dbs_device" } +dbs-utils = { path = "../dbs_utils" } +libc = "0.2.39" +log = "0.4.14" +serde = { version = "1.0.27", features = ["derive", "rc"] } +vm-superio = "0.5.0" +vmm-sys-util = "0.11.0" + +[dev-dependencies] +libc = "0.2.39" diff --git a/src/dragonball/src/dbs_legacy_devices/LICENSE b/src/dragonball/src/dbs_legacy_devices/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_legacy_devices/README.md b/src/dragonball/src/dbs_legacy_devices/README.md new file mode 100644 index 000000000..e1271995d --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/README.md @@ -0,0 +1,26 @@ +# dbs-legacy-devices + +`dbs-legacy-devices` provides emulation for legacy devices. + +## Serial Devices + +Defined a wrapper over the Serial of [`vm-superio`](https://github.com/rust-vmm/vm-superio). +This wrapper is needed because [Orphan rules](https://doc.rust-lang.org/reference/items/implementations.html#orphan-rules), +which is one crate can not implement a trait for a struct defined in +another crate. This wrapper also contains the input field that is +missing from upstream implementation. + +## i8042 Devices + +Defined a wrapper over the `i8042 PS/2 Controller` of [`vm-superio`](https://github.com/rust-vmm/vm-superio). +The i8042 PS/2 controller emulates, at this point, only the CPU reset command which is needed for announcing the VMM about the guest's shutdown. + +### Acknowledgement + +Part of the code is derived from the [Firecracker](https://github.com/firecracker-microvm/firecracker) project. 
+And modified to use [`DeviceIoMut`](../dbs_device/src/lib.rs) to support serial port to Bus. + + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/src/dragonball/src/dbs_legacy_devices/THIRD-PARTY b/src/dragonball/src/dbs_legacy_devices/THIRD-PARTY new file mode 120000 index 000000000..301d0a498 --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/THIRD-PARTY @@ -0,0 +1 @@ +../../THIRD-PARTY \ No newline at end of file diff --git a/src/dragonball/src/dbs_legacy_devices/src/cmos.rs b/src/dragonball/src/dbs_legacy_devices/src/cmos.rs new file mode 100644 index 000000000..4ac59cdfa --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/src/cmos.rs @@ -0,0 +1,137 @@ +// Copyright 2023 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::cmp::min; +use std::mem; + +use libc::{clock_gettime, gmtime_r, timespec, tm, CLOCK_REALTIME}; +use vmm_sys_util::eventfd::EventFd; + +use dbs_device::{DeviceIoMut, PioAddress}; + +/// The value of index offset register is always guaranteed to be in range via INDEX_MASK. +const INDEX_MASK: u8 = 0x7f; +/// Offset of index offset register. +const INDEX_OFFSET: u16 = 0x0; +/// Offset of data offset register. +const DATA_OFFSET: u16 = 0x1; +/// Length of Cmos memory. +const DATA_LEN: usize = 128; + +/// A CMOS/RTC device commonly seen on x86 I/O port 0x70/0x71. +pub struct CmosDevice { + index: u8, + data: [u8; DATA_LEN], + reset_evt: EventFd, +} + +impl CmosDevice { + /// Constructs a CMOS/RTC device with initial data. + /// `mem_below_4g` is the size of memory in bytes below the 32-bit gap. + /// `mem_above_4g` is the size of memory in bytes above the 32-bit gap. 
+ pub fn new(mem_below_4g: u64, mem_above_4g: u64, reset_evt: EventFd) -> CmosDevice { + let mut data = [0u8; DATA_LEN]; + // Extended memory from 16 MB to 4 GB in units of 64 KB + let ext_mem = min( + 0xFFFF, + mem_below_4g.saturating_sub(16 * 1024 * 1024) / (64 * 1024), + ); + data[0x34] = ext_mem as u8; + data[0x35] = (ext_mem >> 8) as u8; + // High memory (> 4GB) in units of 64 KB + let high_mem = min(0x00FF_FFFF, mem_above_4g / (64 * 1024)); + data[0x5b] = high_mem as u8; + data[0x5c] = (high_mem >> 8) as u8; + data[0x5d] = (high_mem >> 16) as u8; + CmosDevice { + index: 0, + data, + reset_evt, + } + } +} +impl DeviceIoMut for CmosDevice { + fn pio_write(&mut self, _base: PioAddress, offset: PioAddress, data: &[u8]) { + if data.len() != 1 { + return; + } + match offset.raw_value() { + INDEX_OFFSET => self.index = data[0], + DATA_OFFSET => { + if self.index == 0x8f && data[0] == 0 { + self.reset_evt.write(1).unwrap(); + } else { + self.data[(self.index & INDEX_MASK) as usize] = data[0] + } + } + _ => {} + }; + } + fn pio_read(&mut self, _base: PioAddress, offset: PioAddress, data: &mut [u8]) { + fn to_bcd(v: u8) -> u8 { + assert!(v < 100); + ((v / 10) << 4) | (v % 10) + } + if data.len() != 1 { + return; + } + data[0] = match offset.raw_value() { + INDEX_OFFSET => self.index, + DATA_OFFSET => { + let seconds; + let minutes; + let hours; + let week_day; + let day; + let month; + let year; + // The clock_gettime and gmtime_r calls are safe as long as the structs they are + // given are large enough, and neither of them fail. It is safe to zero initialize + // the tm and timespec struct because it contains only plain data. + let update_in_progress = unsafe { + let mut timespec: timespec = mem::zeroed(); + clock_gettime(CLOCK_REALTIME, &mut timespec as *mut _); + let now = timespec.tv_sec; + let mut tm: tm = mem::zeroed(); + gmtime_r(&now, &mut tm as *mut _); + // The following lines of code are safe but depend on tm being in scope. 
+ seconds = tm.tm_sec; + minutes = tm.tm_min; + hours = tm.tm_hour; + week_day = tm.tm_wday + 1; + day = tm.tm_mday; + month = tm.tm_mon + 1; + year = tm.tm_year; + // Update in Progress bit held for last 224us of each second + const NANOSECONDS_PER_SECOND: i64 = 1_000_000_000; + const UIP_HOLD_LENGTH: i64 = 8 * NANOSECONDS_PER_SECOND / 32768; + timespec.tv_nsec >= (NANOSECONDS_PER_SECOND - UIP_HOLD_LENGTH) + }; + match self.index { + 0x00 => to_bcd(seconds as u8), + 0x02 => to_bcd(minutes as u8), + 0x04 => to_bcd(hours as u8), + 0x06 => to_bcd(week_day as u8), + 0x07 => to_bcd(day as u8), + 0x08 => to_bcd(month as u8), + 0x09 => to_bcd((year % 100) as u8), + // Bit 5 for 32kHz clock. Bit 7 for Update in Progress + 0x0a => 1 << 5 | (update_in_progress as u8) << 7, + // Bit 0-6 are reserved and must be 0. + // Bit 7 must be 1 (CMOS has power) + 0x0d => 1 << 7, + 0x32 => to_bcd(((year + 1900) / 100) as u8), + _ => { + // self.index is always guaranteed to be in range via INDEX_MASK. + self.data[(self.index & INDEX_MASK) as usize] + } + } + } + _ => 0, + } + } +} diff --git a/src/dragonball/src/dbs_legacy_devices/src/i8042.rs b/src/dragonball/src/dbs_legacy_devices/src/i8042.rs new file mode 100644 index 000000000..b3f8a859e --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/src/i8042.rs @@ -0,0 +1,136 @@ +// Copyright 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use dbs_device::{DeviceIoMut, PioAddress}; +use dbs_utils::metric::{IncMetric, SharedIncMetric}; +use log::error; +use serde::Serialize; +use vm_superio::{I8042Device as I8042Dev, Trigger}; + +use crate::EventFdTrigger; + +/// Metrics specific to the i8042 device. +#[derive(Default, Serialize)] +pub struct I8042DeviceMetrics { + /// Errors triggered while using the i8042 device. 
+ pub error_count: SharedIncMetric, + /// Number of superfluous read intents on this i8042 device. + pub missed_read_count: SharedIncMetric, + /// Number of superfluous read intents on this i8042 device. + pub missed_write_count: SharedIncMetric, + /// Bytes read by this device. + pub read_count: SharedIncMetric, + /// Bytes written by this device. + pub write_count: SharedIncMetric, +} + +pub type I8042Device = I8042Wrapper; + +pub struct I8042Wrapper { + pub device: I8042Dev, + pub metrics: Arc, +} + +impl I8042Device { + pub fn new(event: EventFdTrigger, metrics: Arc) -> Self { + Self { + device: I8042Dev::new(event), + metrics, + } + } +} + +impl DeviceIoMut for I8042Wrapper { + fn pio_read(&mut self, _base: PioAddress, offset: PioAddress, data: &mut [u8]) { + if data.len() != 1 { + self.metrics.missed_read_count.inc(); + return; + } + data[0] = self.device.read(offset.raw_value() as u8); + self.metrics.read_count.inc(); + } + + fn pio_write(&mut self, _base: PioAddress, offset: PioAddress, data: &[u8]) { + if data.len() != 1 { + self.metrics.missed_write_count.inc(); + return; + } + if let Err(e) = self.device.write(offset.raw_value() as u8, data[0]) { + self.metrics.error_count.inc(); + error!("Failed to trigger i8042 reset event: {:?}", e); + } else { + self.metrics.write_count.inc(); + } + } +} + +#[cfg(test)] +mod tests { + use std::os::unix::prelude::FromRawFd; + + use vmm_sys_util::eventfd::EventFd; + + use super::*; + + const COMMAND_OFFSET: u8 = 4; + const CMD_RESET_CPU: u8 = 0xFE; + + #[test] + fn test_i8042_valid_ops() { + let reset_evt = EventFdTrigger::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()); + let metrics = Arc::new(I8042DeviceMetrics::default()); + let mut i8042 = I8042Device::new(reset_evt.try_clone().unwrap(), metrics); + + let mut v = [0x00u8; 1]; + i8042.pio_read(PioAddress(0), PioAddress(0), &mut v); + assert_eq!(v[0], 0); + assert_eq!(i8042.metrics.read_count.count(), 1); + + // Check if reset works. 
+ i8042.pio_write( + PioAddress(0), + PioAddress(COMMAND_OFFSET as u16), + &[CMD_RESET_CPU], + ); + assert_eq!(i8042.metrics.write_count.count(), 1); + reset_evt.read().unwrap(); + } + + #[test] + fn test_i8042_invalid_ops() { + let reset_evt = EventFdTrigger::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()); + let metrics = Arc::new(I8042DeviceMetrics::default()); + let mut i8042 = I8042Device::new(reset_evt.try_clone().unwrap(), metrics); + + let mut v = [0x00u8; 2]; + i8042.pio_read(PioAddress(0), PioAddress(0), &mut v); + assert_eq!(v[0], 0); + assert_eq!(i8042.metrics.read_count.count(), 0); + assert_eq!(i8042.metrics.missed_read_count.count(), 1); + + i8042.pio_write( + PioAddress(0), + PioAddress(COMMAND_OFFSET as u16), + &[CMD_RESET_CPU, 0], + ); + assert_eq!(i8042.metrics.write_count.count(), 0); + assert_eq!(i8042.metrics.missed_write_count.count(), 1); + } + + #[test] + fn test_i8042_reset_err() { + let reset_evt = EventFdTrigger::new(unsafe { EventFd::from_raw_fd(i32::MAX) }); + let metrics = Arc::new(I8042DeviceMetrics::default()); + let mut i8042 = I8042Device::new(reset_evt, metrics); + i8042.pio_write( + PioAddress(0), + PioAddress(COMMAND_OFFSET as u16), + &[CMD_RESET_CPU], + ); + assert_eq!(i8042.metrics.write_count.count(), 0); + assert_eq!(i8042.metrics.error_count.count(), 1); + } +} diff --git a/src/dragonball/src/dbs_legacy_devices/src/lib.rs b/src/dragonball/src/dbs_legacy_devices/src/lib.rs new file mode 100644 index 000000000..40c865ea7 --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/src/lib.rs @@ -0,0 +1,76 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Emulates virtual and hardware devices. 
+mod serial; +pub use self::serial::*; + +#[cfg(target_arch = "x86_64")] +mod cmos; +#[cfg(target_arch = "x86_64")] +pub use self::cmos::*; +#[cfg(target_arch = "x86_64")] +mod i8042; +#[cfg(target_arch = "x86_64")] +pub use self::i8042::*; + +#[cfg(target_arch = "aarch64")] +mod rtc_pl031; +#[cfg(target_arch = "aarch64")] +pub use self::rtc_pl031::*; + +use vm_superio::Trigger; +use vmm_sys_util::eventfd::EventFd; +/// Newtype for implementing the trigger functionality for `EventFd`. +/// +/// The trigger is used for handling events in the legacy devices. +pub struct EventFdTrigger(EventFd); + +impl Trigger for EventFdTrigger { + type E = std::io::Error; + + fn trigger(&self) -> std::io::Result<()> { + self.write(1) + } +} +impl std::ops::Deref for EventFdTrigger { + type Target = EventFd; + fn deref(&self) -> &Self::Target { + &self.0 + } +} +impl EventFdTrigger { + pub fn try_clone(&self) -> std::io::Result { + Ok(EventFdTrigger((**self).try_clone()?)) + } + pub fn new(evt: EventFd) -> Self { + Self(evt) + } + + pub fn get_event(&self) -> EventFd { + self.0.try_clone().unwrap() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Deref; + + use vmm_sys_util::eventfd::EventFd; + + use super::*; + + #[test] + fn test_eventfd_trigger() { + let intr_evt = EventFdTrigger::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()); + intr_evt.trigger().unwrap(); + assert_eq!(intr_evt.get_event().read().unwrap(), 1); + intr_evt.try_clone().unwrap().trigger().unwrap(); + assert_eq!(intr_evt.deref().read().unwrap(), 1); + } +} diff --git a/src/dragonball/src/dbs_legacy_devices/src/rtc_pl031.rs b/src/dragonball/src/dbs_legacy_devices/src/rtc_pl031.rs new file mode 100644 index 000000000..3d2d04dae --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/src/rtc_pl031.rs @@ -0,0 +1,128 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! 
ARM PL031 Real Time Clock +//! +//! This module implements a PL031 Real Time Clock (RTC) that provides to provides long time base counter. +use std::convert::TryInto; + +use dbs_device::{DeviceIoMut, IoAddress}; +use dbs_utils::metric::{IncMetric, SharedIncMetric}; +use log::warn; +use vm_superio::rtc_pl031::{Rtc, RtcEvents}; + +/// Metrics specific to the RTC device +#[derive(Default)] +pub struct RTCDeviceMetrics { + /// Errors triggered while using the RTC device. + pub error_count: SharedIncMetric, + /// Number of superfluous read intents on this RTC device. + pub missed_read_count: SharedIncMetric, + /// Number of superfluous write intents on this RTC device. + pub missed_write_count: SharedIncMetric, +} + +impl RtcEvents for RTCDeviceMetrics { + fn invalid_read(&self) { + self.missed_read_count.inc(); + self.error_count.inc(); + } + + fn invalid_write(&self) { + self.missed_write_count.inc(); + self.error_count.inc(); + } +} + +/// The wrapper of Rtc struct to implement DeviceIoMut trait. 
+pub struct RTCDevice { + pub rtc: Rtc, +} + +impl Default for RTCDevice { + fn default() -> Self { + Self::new() + } +} + +impl RTCDevice { + pub fn new() -> Self { + let metrics = RTCDeviceMetrics::default(); + Self { + rtc: Rtc::with_events(metrics), + } + } +} + +impl DeviceIoMut for RTCDevice { + fn read(&mut self, _base: IoAddress, offset: IoAddress, data: &mut [u8]) { + if data.len() == 4 { + self.rtc + .read(offset.raw_value() as u16, data.try_into().unwrap()) + } else { + warn!( + "Invalid RTC PL031 read: offset {}, data length {}", + offset.raw_value(), + data.len() + ); + self.rtc.events().invalid_read(); + } + } + + fn write(&mut self, _base: IoAddress, offset: IoAddress, data: &[u8]) { + if data.len() == 4 { + self.rtc + .write(offset.raw_value() as u16, data.try_into().unwrap()) + } else { + warn!( + "Invalid RTC PL031 write: offset {}, data length {}", + offset.raw_value(), + data.len() + ); + self.rtc.events().invalid_write(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + impl RTCDevice { + fn read(&mut self, offset: u64, data: &mut [u8]) { + DeviceIoMut::read(self, IoAddress::from(0), IoAddress::from(offset), data) + } + + fn write(&mut self, offset: u64, data: &[u8]) { + DeviceIoMut::write(self, IoAddress::from(0), IoAddress::from(offset), data) + } + } + + #[test] + fn test_rtc_read_write_and_event() { + let mut rtc_device = RTCDevice::new(); + let data = [0; 4]; + + // Write to the DR register. Since this is a RO register, the write + // function should fail. 
+ let invalid_writes_before = rtc_device.rtc.events().missed_write_count.count(); + let error_count_before = rtc_device.rtc.events().error_count.count(); + rtc_device.rtc.write(0x000, &data); + let invalid_writes_after = rtc_device.rtc.events().missed_write_count.count(); + let error_count_after = rtc_device.rtc.events().error_count.count(); + assert_eq!(invalid_writes_after - invalid_writes_before, 1); + assert_eq!(error_count_after - error_count_before, 1); + + let write_data_good = 123u32.to_le_bytes(); + let mut data_bad = [0; 2]; + let mut read_data_good = [0; 4]; + + rtc_device.write(0x008, &write_data_good); + rtc_device.write(0x008, &data_bad); + rtc_device.read(0x008, &mut read_data_good); + rtc_device.read(0x008, &mut data_bad); + assert_eq!(u32::from_le_bytes(read_data_good), 123); + assert_eq!(u16::from_le_bytes(data_bad), 0); + } +} diff --git a/src/dragonball/src/dbs_legacy_devices/src/serial.rs b/src/dragonball/src/dbs_legacy_devices/src/serial.rs new file mode 100644 index 000000000..ba203e2b2 --- /dev/null +++ b/src/dragonball/src/dbs_legacy_devices/src/serial.rs @@ -0,0 +1,291 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +use std::io::Write; +use std::sync::{Arc, Mutex}; + +use dbs_device::{DeviceIoMut, IoAddress, PioAddress}; +use dbs_utils::metric::{IncMetric, SharedIncMetric}; +use log::error; +use serde::Serialize; +use vm_superio::{serial::SerialEvents, Serial, Trigger}; +use vmm_sys_util::eventfd::EventFd; + +use crate::EventFdTrigger; + +/// Trait for devices that handle raw non-blocking I/O requests. +pub trait ConsoleHandler { + /// Send raw input to this emulated device. 
+ fn raw_input(&mut self, _data: &[u8]) -> std::io::Result { + Ok(0) + } + + /// Set the stream to receive raw output from this emulated device. + fn set_output_stream(&mut self, out: Option>); +} + +/// Metrics specific to the UART device. +#[derive(Default, Serialize)] +pub struct SerialDeviceMetrics { + /// Errors triggered while using the UART device. + pub error_count: SharedIncMetric, + /// Number of flush operations. + pub flush_count: SharedIncMetric, + /// Number of read calls that did not trigger a read. + pub missed_read_count: SharedIncMetric, + /// Number of write calls that did not trigger a write. + pub missed_write_count: SharedIncMetric, + /// Number of succeeded read calls. + pub read_count: SharedIncMetric, + /// Number of succeeded write calls. + pub write_count: SharedIncMetric, +} + +pub struct SerialEventsWrapper { + pub metrics: Arc, + pub buffer_ready_event_fd: Option, +} + +impl SerialEvents for SerialEventsWrapper { + fn buffer_read(&self) { + self.metrics.read_count.inc(); + } + + fn out_byte(&self) { + self.metrics.write_count.inc(); + } + + fn tx_lost_byte(&self) { + self.metrics.missed_write_count.inc(); + } + + fn in_buffer_empty(&self) { + match self + .buffer_ready_event_fd + .as_ref() + .map_or(Ok(()), |buf_ready| buf_ready.write(1)) + { + Ok(_) => (), + Err(err) => error!( + "Could not signal that serial device buffer is ready: {:?}", + err + ), + } + } +} + +pub type SerialDevice = SerialWrapper; + +impl SerialDevice { + /// Creates a new SerialDevice instance. 
+ pub fn new(event: EventFd) -> Self { + let out = Arc::new(Mutex::new(None)); + Self { + serial: Serial::with_events( + EventFdTrigger::new(event), + SerialEventsWrapper { + metrics: Arc::new(SerialDeviceMetrics::default()), + buffer_ready_event_fd: None, + }, + AdapterWriter(out.clone()), + ), + out, + } + } +} + +pub struct SerialWrapper { + pub serial: Serial, + pub out: Arc>>>, +} + +impl ConsoleHandler for SerialWrapper { + fn raw_input(&mut self, data: &[u8]) -> std::io::Result { + self.serial + .enqueue_raw_bytes(data) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{e:?}"))) + } + + fn set_output_stream(&mut self, out: Option>) { + *self.out.lock().unwrap() = out; + } +} + +impl DeviceIoMut for SerialWrapper { + fn pio_read(&mut self, _base: PioAddress, offset: PioAddress, data: &mut [u8]) { + if data.len() != 1 { + self.serial.events().metrics.missed_read_count.inc(); + return; + } + data[0] = self.serial.read(offset.raw_value() as u8); + } + fn pio_write(&mut self, _base: PioAddress, offset: PioAddress, data: &[u8]) { + if data.len() != 1 { + self.serial.events().metrics.missed_write_count.inc(); + return; + } + if let Err(e) = self.serial.write(offset.raw_value() as u8, data[0]) { + error!("Failed the pio write to serial: {:?}", e); + self.serial.events().metrics.error_count.inc(); + } + } + + fn read(&mut self, _base: IoAddress, offset: IoAddress, data: &mut [u8]) { + if data.len() != 1 { + self.serial.events().metrics.missed_read_count.inc(); + return; + } + data[0] = self.serial.read(offset.raw_value() as u8); + } + fn write(&mut self, _base: IoAddress, offset: IoAddress, data: &[u8]) { + if data.len() != 1 { + self.serial.events().metrics.missed_write_count.inc(); + return; + } + if let Err(e) = self.serial.write(offset.raw_value() as u8, data[0]) { + error!("Failed the write to serial: {:?}", e); + self.serial.events().metrics.error_count.inc(); + } + } +} + +pub struct AdapterWriter(pub Arc>>>); + +impl Write for 
AdapterWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + if let Some(w) = self.0.lock().unwrap().as_mut() { + w.write(buf) + } else { + Ok(buf.len()) + } + } + + fn flush(&mut self) -> std::io::Result<()> { + if let Some(w) = self.0.lock().unwrap().as_mut() { + w.flush() + } else { + Ok(()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io; + use std::sync::{Arc, Mutex}; + + #[derive(Clone)] + struct SharedBuffer { + buf: Arc>>, + } + + impl io::Write for SharedBuffer { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.buf.lock().unwrap().write(buf) + } + fn flush(&mut self) -> io::Result<()> { + self.buf.lock().unwrap().flush() + } + } + + #[test] + fn test_serial_bus_write() { + let serial_out_buf = Arc::new(Mutex::new(Vec::new())); + let serial_out = Box::new(SharedBuffer { + buf: serial_out_buf.clone(), + }); + let intr_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); + + let mut serial = SerialDevice::new(intr_evt); + let metrics = serial.serial.events().metrics.clone(); + + serial.set_output_stream(Some(serial_out)); + + let invalid_writes_before = serial.serial.events().metrics.missed_write_count.count(); + ::pio_write(&mut serial, PioAddress(0), PioAddress(0), &[b'x', b'y']); + let writes_before = metrics.write_count.count(); + + let invalid_writes_after = metrics.missed_write_count.count(); + assert_eq!(invalid_writes_before + 1, invalid_writes_after); + + assert_eq!(serial_out_buf.lock().unwrap().as_slice().len(), 0); + ::write(&mut serial, IoAddress(0), IoAddress(0), &[b'x', b'y']); + assert_eq!(serial_out_buf.lock().unwrap().as_slice().len(), 0); + + let invalid_writes_after = metrics.missed_write_count.count(); + assert_eq!(invalid_writes_before + 2, invalid_writes_after); + + ::pio_write(&mut serial, PioAddress(0), PioAddress(0), &[b'a']); + ::pio_write(&mut serial, PioAddress(0), PioAddress(0), &[b'b']); + ::write(&mut serial, IoAddress(0), IoAddress(0), &[b'c']); + assert_eq!( + 
serial_out_buf.lock().unwrap().as_slice(), + &[b'a', b'b', b'c'] + ); + + let invalid_writes_after_2 = metrics.missed_write_count.count(); + let writes_after = metrics.write_count.count(); + // The `invalid_write_count` metric should be the same as before the one-byte writes. + assert_eq!(invalid_writes_after_2, invalid_writes_after); + assert_eq!(writes_after, writes_before + 3); + } + + #[test] + fn test_serial_bus_read() { + let intr_evt = EventFdTrigger::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()); + + let metrics = Arc::new(SerialDeviceMetrics::default()); + + let out: Arc>>> = + Arc::new(Mutex::new(Some(Box::new(std::io::sink())))); + let mut serial = SerialDevice { + serial: Serial::with_events( + intr_evt, + SerialEventsWrapper { + metrics: metrics.clone(), + buffer_ready_event_fd: None, + }, + AdapterWriter(out.clone()), + ), + out, + }; + serial + .serial + .enqueue_raw_bytes(&[b'a', b'b', b'c']) + .unwrap(); + + let invalid_reads_before = metrics.missed_read_count.count(); + + let mut v = [0x00; 2]; + ::pio_read(&mut serial, PioAddress(0), PioAddress(0), &mut v); + assert_eq!(v[0], b'\0'); + + let invalid_reads_after = metrics.missed_read_count.count(); + assert_eq!(invalid_reads_before + 1, invalid_reads_after); + + ::read(&mut serial, IoAddress(0), IoAddress(0), &mut v); + assert_eq!(v[0], b'\0'); + + let invalid_reads_after = metrics.missed_read_count.count(); + assert_eq!(invalid_reads_before + 2, invalid_reads_after); + + let mut v = [0x00; 1]; + ::pio_read(&mut serial, PioAddress(0), PioAddress(0), &mut v); + assert_eq!(v[0], b'a'); + + ::pio_read(&mut serial, PioAddress(0), PioAddress(0), &mut v); + assert_eq!(v[0], b'b'); + + ::read(&mut serial, IoAddress(0), IoAddress(0), &mut v); + assert_eq!(v[0], b'c'); + + let invalid_reads_after_2 = metrics.missed_read_count.count(); + // The `invalid_read_count` metric should be the same as before the one-byte reads. 
+ assert_eq!(invalid_reads_after_2, invalid_reads_after); + } +} diff --git a/src/dragonball/src/dbs_tdx/Cargo.toml b/src/dragonball/src/dbs_tdx/Cargo.toml new file mode 100644 index 000000000..2643c6e48 --- /dev/null +++ b/src/dragonball/src/dbs_tdx/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "dbs-tdx" +version = "0.1.0" +authors = ["Alibaba Dragonball Team"] +description = "helpers and utilities to create TDX VM" +license = "Apache-2.0 AND BSD-3-Clause" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "secure-sandbox", "TDX", "confidential container"] +readme = "README.md" + +[dependencies] +thiserror = "1.0" +kvm-bindings = { version = "0.6.0", features = ["fam-wrappers"] } +vmm-sys-util = "0.11.0" + +[dev-dependencies] +serde_json = "1.0.9" diff --git a/src/dragonball/src/dbs_tdx/README.md b/src/dragonball/src/dbs_tdx/README.md new file mode 100644 index 000000000..3bc735ab3 --- /dev/null +++ b/src/dragonball/src/dbs_tdx/README.md @@ -0,0 +1,14 @@ +# dbs-tdx + +This crate is a collection of modules that provides helpers and utilities to create a TDX Dragonball VM. + +Currently this crate involves: +- tdx-ioctls + +## Acknowledgement + +Part of the code is derived from the [Cloud Hypervisor](https://github.com/cloud-hypervisor/cloud-hypervisor) project. + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/src/dragonball/src/dbs_tdx/src/lib.rs b/src/dragonball/src/dbs_tdx/src/lib.rs new file mode 100644 index 000000000..6b67688e5 --- /dev/null +++ b/src/dragonball/src/dbs_tdx/src/lib.rs @@ -0,0 +1,5 @@ +// Copyright (C) 2023 Alibaba Cloud. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#[cfg(target_arch = "x86_64")] +pub mod tdx_ioctls; diff --git a/src/dragonball/src/dbs_tdx/src/tdx_ioctls.rs b/src/dragonball/src/dbs_tdx/src/tdx_ioctls.rs new file mode 100644 index 000000000..c9323fa42 --- /dev/null +++ b/src/dragonball/src/dbs_tdx/src/tdx_ioctls.rs @@ -0,0 +1,220 @@ +// Copyright © 2019 Intel Corporation +// +// Copyright (c) 2023 Alibaba Cloud. +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::os::unix::io::RawFd; + +use kvm_bindings::{CpuId, __IncompleteArrayField, KVMIO}; +use thiserror::Error; +use vmm_sys_util::fam::{FamStruct, FamStructWrapper}; +use vmm_sys_util::ioctl::ioctl_with_val; +use vmm_sys_util::{generate_fam_struct_impl, ioctl_ioc_nr, ioctl_iowr_nr}; + +/// Tdx capability list. +pub type TdxCaps = FamStructWrapper; + +/// Cpuid configs entry counts. +const TDX1_MAX_NR_CPUID_CONFIGS: usize = 6; + +generate_fam_struct_impl!( + TdxCapabilities, + TdxCpuidConfig, + cpuid_configs, + u32, + nr_cpuid_configs, + TDX1_MAX_NR_CPUID_CONFIGS +); + +#[repr(C)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] +/// Tdx cpuid config. +pub struct TdxCpuidConfig { + /// cpuid leaf + pub leaf: u32, + /// cpuid sub leaf + pub sub_leaf: u32, + /// eax + pub eax: u32, + /// ebx + pub ebx: u32, + /// ecx + pub ecx: u32, + /// edx + pub edx: u32, +} + +#[repr(C)] +#[derive(Default)] +/// Tdx capabilities. +pub struct TdxCapabilities { + /// cpuid bits need to be fixed to 0. + pub attrs_fixed0: u64, + /// cpuid bits need to be fixed to 1. + pub attrs_fixed1: u64, + /// xfam bits need to be fixed to 0. + pub xfam_fixed0: u64, + /// xfam bits need to be fixed to 1. + pub xfam_fixed1: u64, + /// cpuid configs entry number. + pub nr_cpuid_configs: u32, + /// padding. + pub padding: u32, + /// cpuid config list + pub cpuid_configs: __IncompleteArrayField, +} + +ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong); +/// TDX module related errors. 
+#[derive(Error, Debug)] +pub enum TdxIoctlError { + /// Failed to create TdxCaps + #[error("Failed to create TdxCaps")] + TdxCapabilitiesCreate, + /// Failed to get TDX Capbilities + #[error("Failed to get TDX Capbilities: {0}")] + TdxCapabilities(#[source] std::io::Error), + /// Failed to init TDX. + #[error("Failed to init TDX: {0}")] + TdxInit(#[source] std::io::Error), + /// Failed to finalize TDX. + #[error("Failed to finalize TDX: {0}")] + TdxFinalize(#[source] std::io::Error), + /// Failed to init TDX memory region. + #[error("Failed to init TDX memory region: {0}")] + TdxInitMemRegion(#[source] std::io::Error), + /// Failed to init TDX vcpu. + #[error("Failed to init TDX vcpu: {0}")] + TdxInitVcpu(#[source] std::io::Error), +} + +/// TDX related ioctl command +#[repr(u32)] +enum TdxCommand { + /// Get Capability + Capabilities = 0, + /// Init TD + InitVm = 1, + /// Init vcpu for TD + InitVcpu = 2, + /// Init memory region for TD + InitMemRegion = 3, + /// Finalize TD + Finalize = 4, +} + +/// TDX related ioctl command +fn tdx_command( + fd: &RawFd, + command: TdxCommand, + metadata: u32, + data: u64, +) -> std::result::Result<(), std::io::Error> { + #[repr(C)] + struct TdxIoctlCmd { + command: TdxCommand, + metadata: u32, + data: u64, + } + let cmd = TdxIoctlCmd { + command, + metadata, + data, + }; + let ret = unsafe { + ioctl_with_val( + fd, + KVM_MEMORY_ENCRYPT_OP(), + &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) +} + +/// Init TDX +pub fn tdx_init( + vm_fd: &RawFd, + cpu_id: &CpuId, + max_vcpus: u32, +) -> std::result::Result<(), TdxIoctlError> { + #[repr(C)] + struct TdxInitVm { + max_vcpus: u32, + tsc_khz: u32, + attributes: u64, + cpuid: u64, + mrconfigid: [u64; 6], + mrowner: [u64; 6], + mrownerconfig: [u64; 6], + reserved: [u64; 43], + } + let data = TdxInitVm { + max_vcpus, + tsc_khz: 0, + attributes: 0, // TDX1_TD_ATTRIBUTE_DEBUG, + cpuid: 
cpu_id.as_fam_struct_ptr() as u64, + mrconfigid: [0; 6], + mrowner: [0; 6], + mrownerconfig: [0; 6], + reserved: [0; 43], + }; + tdx_command(vm_fd, TdxCommand::InitVm, 0, &data as *const _ as u64) + .map_err(TdxIoctlError::TdxInit) +} + +/// Finalize the TDX setup for this VM +pub fn tdx_finalize(vm_fd: &RawFd) -> std::result::Result<(), TdxIoctlError> { + tdx_command(vm_fd, TdxCommand::Finalize, 0, 0).map_err(TdxIoctlError::TdxFinalize) +} + +/// Initialize TDX memory Region +pub fn tdx_init_memory_region( + vm_fd: &RawFd, + host_address: u64, + guest_address: u64, + size: u64, + measure: bool, +) -> std::result::Result<(), TdxIoctlError> { + #[repr(C)] + struct TdxInitMemRegion { + host_address: u64, + guest_address: u64, + pages: u64, + } + let data = TdxInitMemRegion { + host_address, + guest_address, + pages: size / 4096, + }; + tdx_command( + vm_fd, + TdxCommand::InitMemRegion, + if measure { 1 } else { 0 }, + &data as *const _ as u64, + ) + .map_err(TdxIoctlError::TdxInitMemRegion) +} + +/// Initialize TDX vcpu +pub fn tdx_init_vcpu(vcpu_fd: &RawFd, hob_address: u64) -> std::result::Result<(), TdxIoctlError> { + tdx_command(vcpu_fd, TdxCommand::InitVcpu, 0, hob_address).map_err(TdxIoctlError::TdxInitVcpu) +} + +/// Get tdx capabilities. 
+pub fn tdx_get_caps(kvm_fd: &RawFd) -> std::result::Result { + let mut tdx_caps = TdxCaps::new(TDX1_MAX_NR_CPUID_CONFIGS) + .map_err(|_| TdxIoctlError::TdxCapabilitiesCreate)?; + tdx_command( + kvm_fd, + TdxCommand::Capabilities, + 0, + tdx_caps.as_mut_fam_struct_ptr() as *const _ as u64, + ) + .map_err(TdxIoctlError::TdxCapabilities)?; + Ok(tdx_caps) +} diff --git a/src/dragonball/src/dbs_upcall/Cargo.toml b/src/dragonball/src/dbs_upcall/Cargo.toml new file mode 100755 index 000000000..b65051092 --- /dev/null +++ b/src/dragonball/src/dbs_upcall/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "dbs-upcall" +version = "0.3.0" +authors = ["Alibaba Dragonball Team"] +license = "Apache-2.0" +edition = "2018" +description = "dbs-upcall is a direct communication tool between VMM and guest" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-virtio-devices" +keywords = ["dragonball", "secure-sandbox", "devices", "upcall", "virtio"] +readme = "README.md" + +[dependencies] +anyhow = "1" +log = "0.4.14" +thiserror = "1" +timerfd = "1.2.0" + +dbs-utils = { path = "../dbs_utils" } +dbs-virtio-devices = { path = "../dbs_virtio_devices", features = ["virtio-vsock"] } diff --git a/src/dragonball/src/dbs_upcall/LICENSE b/src/dragonball/src/dbs_upcall/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_upcall/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_upcall/README.md b/src/dragonball/src/dbs_upcall/README.md new file mode 100755 index 000000000..0dd7b4cf0 --- /dev/null +++ b/src/dragonball/src/dbs_upcall/README.md @@ -0,0 +1,69 @@ +# dbs-upcall + +`dbs-upcall` is a direct communication tool between VMM and guest developed upon VSOCK. 
The server side of the upcall is a driver in guest kernel (kernel patches are needed for this feature) and it'll start to serve the requests after the kernel starts. And the client side is in VMM, it'll be a thread that communicates with VSOCK through UDS. + +We have accomplished device hotplug / hot-unplug directly through upcall in order to avoid virtualization of ACPI to minimize virtual machines overhead. And there could be many other usages through this direct communication channel. + +## Design + +### Server Design + +The server side of upcall is a driver in guest kernel and the VSOCK port is 0xDB. +After the VSOCK is connected, upcall related service will be registered and a kernel thread providing corresponding service will be created. +The upcall service thread will first send a message with message type Connect to try to connect with the client side (VMM). After service successfully connects, the service thread will get into a loop for continuously receiving requests from the client side and processing the requests until the service stops. + +The service we currently support: +1. device manager: supports CPU hotplug / hot-unplug, virtio-mmio devices hotplug / hot-unplug + +### Client Design +The client side is in VMM and we abstract related logic into this crate `dbs-upcall`. + +The upcall state machine for the client side: +![Upcall State Machine](./images/upcall_state_machine.png) + +The client side's workflow: +1. [Current State: `WaitingServer`] Check the connection with VSOCK server. +2. [Current State: `WaitingService`] Check the connection with upcall server side in the guest kernel for message type Connect and magic version. +3. [Current State: `ServiceConnected`] The request could be sent through upcall in this state. + +If step 1 or 2 fails, upcall will try to reconnect. +If a request is sent in step 3, upcall state will change to `ServiceBusy` and upcall will not process other requests in this state. 
+ +### Message Design +There are two parts for the upcall request message: message header and message load. +And there are three parts for the upcall reply message: message header, result and message load. + +Message Header contains the following information and it remains the same for the request and the reply: +1. magic_version(u32): magic version for identifying upcall and the service type +2. msg_size(u32): size of the message load +3. msg_type(u32): type for the message to identify its usage (e.g. ADD_CPU) +4. msg_flags(u32): reserved + +For the upcall request message, message load currently contains two kinds of msg_load. +msg_load type 1: add_mmio_dev - for virtio-mmio hotplug / hot-unplug request: +1. mmio_base +2. mmio_size +3. mmio_irq + +msg_load type 2: `cpu_dev_info` - for CPU hotplug / hot-unplug request: +1. count +2. `apic_ver` +3. `apic_ids[256]` + +For the upcall reply message, reply contains result and two kinds of msg_load. +If result is 0, the operation is successful. +If result is not 0, result refers to the error code. + +msg_load type 1: add_mmio_dev - for virtio-mmio reply: +currently empty + +msg_load type 2: `cpu_dev_reply_info` - for CPU hotplug / hot-unplug reply: +1. `apic_index` + +## Kernel Patches + +Kernel patches are needed for dbs-upcall. You could go to [Upcall Kernel Patches](/tools/packaging/kernel/patches/5.10.x/dragonball-experimental) to get the upcall patches. + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 
\ No newline at end of file diff --git a/src/dragonball/src/dbs_upcall/images/upcall_state_machine.png b/src/dragonball/src/dbs_upcall/images/upcall_state_machine.png new file mode 100755 index 000000000..8f7256b30 Binary files /dev/null and b/src/dragonball/src/dbs_upcall/images/upcall_state_machine.png differ diff --git a/src/dragonball/src/dbs_upcall/src/dev_mgr_service.rs b/src/dragonball/src/dbs_upcall/src/dev_mgr_service.rs new file mode 100755 index 000000000..f61882810 --- /dev/null +++ b/src/dragonball/src/dbs_upcall/src/dev_mgr_service.rs @@ -0,0 +1,562 @@ +// Copyright 2022 Alibaba Corporation. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! # Upcall Device Manager Service. +//! +//! Provides basic operations for the upcall device manager, include: +//! - CPU / Mmio-Virtio Device's hot-plug +//! - CPU Device's hot-unplug + +use std::fmt; +use std::mem; + +use dbs_virtio_devices::vsock::backend::VsockStream; + +use crate::{ + Result, UpcallClientError, UpcallClientRequest, UpcallClientResponse, UpcallClientService, +}; + +const DEV_MGR_MSG_SIZE: usize = 0x400; +const DEV_MGR_MAGIC_VERSION: u32 = 0x444D0100; +const DEV_MGR_BYTE: &[u8; 1usize] = b"d"; + +/// Device manager's op code. +#[allow(dead_code)] +#[repr(u32)] +enum DevMgrMsgType { + Connect = 0x00000000, + AddCpu = 0x00000001, + DelCpu = 0x00000002, + AddMem = 0x00000003, + DelMem = 0x00000004, + AddMmio = 0x00000005, + DelMmio = 0x00000006, + AddPci = 0x00000007, + DelPci = 0x00000008, +} + +/// Device manager's header for messages. +#[repr(C)] +#[derive(Copy, Clone, Debug)] +struct DevMgrMsgHeader { + pub magic_version: u32, + pub msg_size: u32, + pub msg_type: u32, + pub msg_flags: u32, +} + +/// Command struct to add/del a MMIO Virtio Device. +#[repr(C)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct MmioDevRequest { + /// base address of the virtio MMIO configuration window. + pub mmio_base: u64, + /// size of the virtio MMIO configuration window. 
+ pub mmio_size: u64, + /// Interrupt number assigned to the MMIO virito device. + pub mmio_irq: u32, +} + +/// Command struct to add/del a vCPU. +#[repr(C)] +#[derive(Clone)] +pub struct CpuDevRequest { + /// hotplug or hot unplug cpu count + pub count: u8, + #[cfg(target_arch = "x86_64")] + /// apic version + pub apic_ver: u8, + #[cfg(target_arch = "x86_64")] + /// apic id array + pub apic_ids: [u8; 256], +} + +impl PartialEq for CpuDevRequest { + #[cfg(target_arch = "x86_64")] + fn eq(&self, other: &CpuDevRequest) -> bool { + self.count == other.count + && self.apic_ver == other.apic_ver + && self + .apic_ids + .iter() + .zip(other.apic_ids.iter()) + .all(|(s, o)| s == o) + } + + #[cfg(target_arch = "aarch64")] + fn eq(&self, other: &CpuDevRequest) -> bool { + self.count == other.count + } +} + +impl fmt::Debug for CpuDevRequest { + #[cfg(target_arch = "x86_64")] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use std::fmt::Write as _; + let mut apic_ids = String::from("[ "); + for apic_id in self.apic_ids.iter() { + if apic_id == &0 { + break; + } + let _ = write!(apic_ids, "{apic_id}"); + apic_ids.push(' '); + } + apic_ids.push_str(" ]"); + f.debug_struct("CpuDevRequest") + .field("count", &self.count) + .field("apic_ver", &self.apic_ver) + .field("apic_ids", &apic_ids) + .finish() + } + + #[cfg(target_arch = "aarch64")] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("CpuDevRequest") + .field("count", &self.count) + .finish() + } +} + +/// Device manager's request representation in client side. +#[derive(Clone, PartialEq, Debug)] +pub enum DevMgrRequest { + /// Add a MMIO virtio device + AddMmioDev(MmioDevRequest), + /// Del a MMIO device device + DelMmioDev(MmioDevRequest), + /// Add a VCPU + AddVcpu(CpuDevRequest), + /// Del a VCPU + DelVcpu(CpuDevRequest), +} + +impl DevMgrRequest { + /// Convert client side's representation into server side's representation. 
+ pub fn build(&self) -> Box<[u8; DEV_MGR_MSG_SIZE]> { + let buffer = Box::new([0; DEV_MGR_MSG_SIZE]); + let size_hdr = mem::size_of::(); + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + + msg_hdr.magic_version = DEV_MGR_MAGIC_VERSION; + msg_hdr.msg_flags = 0; + + match self { + DevMgrRequest::AddMmioDev(s) => { + msg_hdr.msg_type = DevMgrMsgType::AddMmio as u32; + msg_hdr.msg_size = mem::size_of::() as u32; + let mmio_dev = + unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut MmioDevRequest) }; + *mmio_dev = *s; + } + DevMgrRequest::DelMmioDev(s) => { + msg_hdr.msg_type = DevMgrMsgType::DelMmio as u32; + msg_hdr.msg_size = mem::size_of::() as u32; + let mmio_dev = + unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut MmioDevRequest) }; + *mmio_dev = *s; + } + DevMgrRequest::AddVcpu(s) => { + msg_hdr.msg_type = DevMgrMsgType::AddCpu as u32; + msg_hdr.msg_size = mem::size_of::() as u32; + let vcpu_dev = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut CpuDevRequest) }; + *vcpu_dev = s.clone(); + } + DevMgrRequest::DelVcpu(s) => { + msg_hdr.msg_type = DevMgrMsgType::DelCpu as u32; + msg_hdr.msg_size = mem::size_of::() as u32; + let vcpu_dev = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut CpuDevRequest) }; + *vcpu_dev = s.clone(); + } + } + + buffer + } +} + +/// Device manager's response from cpu device. +#[repr(C)] +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct CpuDevResponse { + #[cfg(target_arch = "x86_64")] + /// apic id index of last act cpu + pub apic_id_index: u32, + #[cfg(target_arch = "aarch64")] + /// cpu id of last act cpu + pub cpu_id: u32, +} + +/// Device manager's response inner message. +#[derive(Debug, Eq, PartialEq)] +pub struct DevMgrResponseInfo { + /// 0 means success and other result is the error code. + pub result: i32, + /// Additional info returned by device. + pub info: I, +} + +/// Device manager's response representation in client side. 
+#[derive(Debug, Eq, PartialEq)] +pub enum DevMgrResponse { + /// Add mmio device's response (no response body) + AddMmioDev(DevMgrResponseInfo<()>), + /// Add / Del cpu device's response + CpuDev(DevMgrResponseInfo), + /// Other response + Other(DevMgrResponseInfo<()>), +} + +impl DevMgrResponse { + /// Convert server side's representation into client side's representation. + fn make(buffer: &[u8]) -> Result { + let size_hdr = mem::size_of::(); + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + let result = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut i32) }; + + match msg_hdr.msg_type { + msg_type + if msg_type == DevMgrMsgType::AddCpu as u32 + || msg_type == DevMgrMsgType::DelCpu as u32 => + { + let response = unsafe { + &mut *(buffer[(size_hdr + mem::size_of::())..].as_ptr() + as *mut CpuDevResponse) + }; + Ok(DevMgrResponse::CpuDev(DevMgrResponseInfo { + result: *result, + info: response.clone(), + })) + } + msg_type if msg_type == DevMgrMsgType::AddMmio as u32 => { + Ok(DevMgrResponse::AddMmioDev(DevMgrResponseInfo { + result: *result, + info: (), + })) + } + _ => Ok(DevMgrResponse::Other(DevMgrResponseInfo { + result: *result, + info: (), + })), + } + } +} + +/// Device manager service, realized upcall client service. 
+#[derive(Default)] +pub struct DevMgrService {} + +impl UpcallClientService for DevMgrService { + fn connection_start(&self, stream: &mut Box) -> Result<()> { + stream + .write_all(DEV_MGR_BYTE) + .map_err(UpcallClientError::ServiceConnect) + } + + fn connection_check(&self, stream: &mut Box) -> Result<()> { + let mut buf = [0; DEV_MGR_MSG_SIZE]; + stream + .read_exact(&mut buf) + .map_err(UpcallClientError::ServiceConnect)?; + let hdr = unsafe { &*(buf.as_ptr() as *const DevMgrMsgHeader) }; + if hdr.magic_version == DEV_MGR_MAGIC_VERSION + && hdr.msg_size == 0 + && hdr.msg_flags == 0 + && hdr.msg_type == DevMgrMsgType::Connect as u32 + { + Ok(()) + } else { + Err(UpcallClientError::InvalidMessage(format!( + "upcall device manager expect msg_type {:?}, but received {}", + DevMgrMsgType::Connect as u32, + hdr.msg_type + ))) + } + } + + fn send_request( + &self, + stream: &mut Box, + request: UpcallClientRequest, + ) -> Result<()> { + let msg = match request { + UpcallClientRequest::DevMgr(req) => req.build(), + // we don't have other message type yet + #[cfg(test)] + UpcallClientRequest::FakeRequest => unimplemented!(), + }; + stream + .write_all(&*msg) + .map_err(UpcallClientError::SendRequest) + } + + fn handle_response(&self, stream: &mut Box) -> Result { + let mut buf = [0; DEV_MGR_MSG_SIZE]; + stream + .read_exact(&mut buf) + .map_err(UpcallClientError::GetResponse)?; + let response = DevMgrResponse::make(&buf)?; + + Ok(UpcallClientResponse::DevMgr(response)) + } +} + +#[cfg(test)] +mod tests { + use dbs_virtio_devices::vsock::backend::{VsockBackend, VsockInnerBackend}; + + use super::*; + + #[test] + fn test_build_dev_mgr_request() { + let size_hdr = mem::size_of::(); + // add mmio dev request + { + let add_mmio_dev_request = MmioDevRequest { + mmio_base: 0, + mmio_size: 1, + mmio_irq: 2, + }; + let dev_mgr_request = DevMgrRequest::AddMmioDev(add_mmio_dev_request); + let buffer = dev_mgr_request.build(); + + // valid total size + assert_eq!(buffer.len(), 
DEV_MGR_MSG_SIZE); + + // valid header + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + assert_eq!(msg_hdr.magic_version, DEV_MGR_MAGIC_VERSION); + assert_eq!(msg_hdr.msg_flags, 0); + assert_eq!(msg_hdr.msg_type, DevMgrMsgType::AddMmio as u32); + assert_eq!(msg_hdr.msg_size, mem::size_of::() as u32); + + // valid request + let mmio_dev_req = + unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut MmioDevRequest) }; + assert_eq!(mmio_dev_req, &add_mmio_dev_request); + } + + // add vcpu dev request + { + let cpu_dev_request = CpuDevRequest { + count: 1, + #[cfg(target_arch = "x86_64")] + apic_ver: 2, + #[cfg(target_arch = "x86_64")] + apic_ids: [3; 256], + }; + let dev_mgr_request = DevMgrRequest::AddVcpu(cpu_dev_request.clone()); + let buffer = dev_mgr_request.build(); + + // valid total size + assert_eq!(buffer.len(), DEV_MGR_MSG_SIZE); + + // valid header + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + assert_eq!(msg_hdr.magic_version, DEV_MGR_MAGIC_VERSION); + assert_eq!(msg_hdr.msg_flags, 0); + assert_eq!(msg_hdr.msg_type, DevMgrMsgType::AddCpu as u32); + assert_eq!(msg_hdr.msg_size, mem::size_of::() as u32); + + // valid request + let cpu_dev_req = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut CpuDevRequest) }; + assert_eq!(cpu_dev_req, &cpu_dev_request); + } + + // del vcpu dev request + { + let cpu_dev_request = CpuDevRequest { + count: 1, + #[cfg(target_arch = "x86_64")] + apic_ver: 2, + #[cfg(target_arch = "x86_64")] + apic_ids: [3; 256], + }; + let dev_mgr_request = DevMgrRequest::DelVcpu(cpu_dev_request.clone()); + let buffer = dev_mgr_request.build(); + + // valid total size + assert_eq!(buffer.len(), DEV_MGR_MSG_SIZE); + + // valid header + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + assert_eq!(msg_hdr.magic_version, DEV_MGR_MAGIC_VERSION); + assert_eq!(msg_hdr.msg_flags, 0); + assert_eq!(msg_hdr.msg_type, DevMgrMsgType::DelCpu as u32); + 
assert_eq!(msg_hdr.msg_size, mem::size_of::() as u32); + + // valid request + let cpu_dev_req = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut CpuDevRequest) }; + assert_eq!(cpu_dev_req, &cpu_dev_request); + } + } + + #[test] + fn test_make_dev_mgr_response() { + let size_hdr = mem::size_of::(); + + // test cpu response + { + let buffer = [0; DEV_MGR_MSG_SIZE]; + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + + msg_hdr.magic_version = DEV_MGR_MAGIC_VERSION; + + msg_hdr.msg_type = DevMgrMsgType::AddCpu as u32; + msg_hdr.msg_size = mem::size_of::() as u32; + msg_hdr.msg_flags = 0; + + let result = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut i32) }; + *result = 0; + + let vcpu_result = unsafe { + &mut *(buffer[(size_hdr + mem::size_of::())..].as_ptr() as *mut CpuDevResponse) + }; + + #[cfg(target_arch = "x86_64")] + { + vcpu_result.apic_id_index = 1; + } + #[cfg(target_arch = "aarch64")] + { + vcpu_result.cpu_id = 1; + } + + match DevMgrResponse::make(&buffer).unwrap() { + DevMgrResponse::CpuDev(resp) => { + assert_eq!(resp.result, 0); + #[cfg(target_arch = "x86_64")] + assert_eq!(resp.info.apic_id_index, 1); + #[cfg(target_arch = "aarch64")] + assert_eq!(resp.info.cpu_id, 1); + } + _ => unreachable!(), + } + } + + // test add mmio response + { + let buffer = [0; DEV_MGR_MSG_SIZE]; + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + + msg_hdr.magic_version = DEV_MGR_MAGIC_VERSION; + + msg_hdr.msg_type = DevMgrMsgType::AddMmio as u32; + msg_hdr.msg_size = 0; + msg_hdr.msg_flags = 0; + + let result = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut i32) }; + *result = 0; + + match DevMgrResponse::make(&buffer).unwrap() { + DevMgrResponse::AddMmioDev(resp) => { + assert_eq!(resp.result, 0); + } + _ => unreachable!(), + } + } + + // test result error + { + let buffer = [0; DEV_MGR_MSG_SIZE]; + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + + msg_hdr.magic_version = 
DEV_MGR_MAGIC_VERSION; + + msg_hdr.msg_type = DevMgrMsgType::AddMmio as u32; + msg_hdr.msg_size = 0; + msg_hdr.msg_flags = 0; + + let result = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut i32) }; + *result = 1; + + match DevMgrResponse::make(&buffer).unwrap() { + DevMgrResponse::AddMmioDev(resp) => { + assert_eq!(resp.result, 1); + } + _ => unreachable!(), + } + } + + // test invalid unknown msg flag + { + let buffer = [0; DEV_MGR_MSG_SIZE]; + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + + msg_hdr.magic_version = DEV_MGR_MAGIC_VERSION; + + msg_hdr.msg_type = 0xabcd1234; + msg_hdr.msg_size = 0; + msg_hdr.msg_flags = 0; + + let result = unsafe { &mut *(buffer[size_hdr..].as_ptr() as *mut i32) }; + *result = 1; + + match DevMgrResponse::make(&buffer).unwrap() { + DevMgrResponse::Other(resp) => { + assert_eq!(resp.result, 1); + } + _ => unreachable!(), + } + } + } + + fn get_vsock_inner_backend_stream_pair() -> (Box, Box) { + let mut vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = vsock_backend.get_connector(); + let outer_stream = connector.connect().unwrap(); + let inner_stream = vsock_backend.accept().unwrap(); + + (inner_stream, outer_stream) + } + + #[test] + fn test_dev_mgr_service_connection_start() { + let (mut inner_stream, mut outer_stream) = get_vsock_inner_backend_stream_pair(); + let dev_mgr_service = DevMgrService {}; + + assert!(dev_mgr_service.connection_start(&mut inner_stream).is_ok()); + let mut reader_buf = [0; 1]; + outer_stream.read_exact(&mut reader_buf).unwrap(); + assert_eq!(reader_buf, [b'd']); + } + + #[test] + fn test_dev_mgr_service_send_request() { + let (mut inner_stream, mut outer_stream) = get_vsock_inner_backend_stream_pair(); + let dev_mgr_service = DevMgrService {}; + + let add_mmio_dev_request = DevMgrRequest::AddMmioDev(MmioDevRequest { + mmio_base: 0, + mmio_size: 1, + mmio_irq: 2, + }); + let request = UpcallClientRequest::DevMgr(add_mmio_dev_request.clone()); + + 
assert!(dev_mgr_service + .send_request(&mut outer_stream, request) + .is_ok()); + + let mut reader_buf = [0; DEV_MGR_MSG_SIZE]; + inner_stream.read_exact(&mut reader_buf).unwrap(); + + assert!(add_mmio_dev_request + .build() + .iter() + .zip(reader_buf.iter()) + .all(|(req, buf)| req == buf)); + } + + #[test] + fn test_dev_mgr_service_handle_response() { + let (mut inner_stream, mut outer_stream) = get_vsock_inner_backend_stream_pair(); + let dev_mgr_service = DevMgrService {}; + + let buffer = [0; DEV_MGR_MSG_SIZE]; + let msg_hdr = unsafe { &mut *(buffer.as_ptr() as *mut DevMgrMsgHeader) }; + msg_hdr.magic_version = DEV_MGR_MAGIC_VERSION; + msg_hdr.msg_type = DevMgrMsgType::AddMmio as u32; + msg_hdr.msg_size = 0; + + inner_stream.write_all(&buffer).unwrap(); + assert!(dev_mgr_service.handle_response(&mut outer_stream).is_ok()); + } +} diff --git a/src/dragonball/src/dbs_upcall/src/lib.rs b/src/dragonball/src/dbs_upcall/src/lib.rs new file mode 100755 index 000000000..8e03c4e01 --- /dev/null +++ b/src/dragonball/src/dbs_upcall/src/lib.rs @@ -0,0 +1,1141 @@ +// Copyright 2022 Alibaba Corporation. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(missing_docs)] + +//! # Upcall Client's Implementation +//! +//! Provides basic operations for upcall client, include: +//! - Connect to upcall server and service +//! - Send data to server +//! 
- Receive data from server + +mod dev_mgr_service; + +use std::io::Write; +use std::os::unix::io::AsRawFd; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +use dbs_utils::epoll_manager::{EpollManager, EventOps, EventSet, Events, MutEventSubscriber}; +use dbs_virtio_devices::vsock::backend::{VsockInnerConnector, VsockStream}; +use log::{debug, error, info, trace, warn}; +use timerfd::{SetTimeFlags, TimerFd, TimerState}; + +pub use crate::dev_mgr_service::{ + CpuDevRequest, DevMgrRequest, DevMgrResponse, DevMgrService, MmioDevRequest, +}; + +const SERVER_PORT: u32 = 0xDB; +const SERVER_RECONNECT_DURATION_MS: u64 = 10; +const SERVER_MAX_RECONNECT_TIME: u32 = 500; + +/// Upcall client error. +#[derive(Debug, thiserror::Error)] +pub enum UpcallClientError { + /// Received invalid upcall message. + #[error("received invalid upcall message: {0}")] + InvalidMessage(String), + /// Upcall server connect error. + #[error("upcall server connect error: {0}")] + ServerConnect(#[source] std::io::Error), + /// Upcall service connect error. + #[error("upcall service connect error: {0}")] + ServiceConnect(#[source] std::io::Error), + /// Upcall send request error. + #[error("upcall send request error: {0}")] + SendRequest(#[source] std::io::Error), + /// Upcall get response error. + #[error("upcall get response error: {0}")] + GetResponse(#[source] std::io::Error), + /// Errors with timerfd. + #[error("timerfd error: {0}")] + TimerFd(#[source] std::io::Error), + /// Upcall is not connected. + #[error("upcall is not connected")] + UpcallIsNotConnected, + /// Upcall is busy now. + #[error("upcall is busy now")] + UpcallIsBusy, +} + +/// Upcall client result. +pub type Result = std::result::Result; + +/// Upcall client state, used by upcall client state machine. +/// +// NOTE: here's not a state like `ServerDisconnect`, because we always connect +// to server immediately when constructing the connection or disconnected from +// server. 
+#[derive(Clone, Eq, PartialEq, Debug)] +pub enum UpcallClientState { + /// There are two possible scenarios for a connection in this state: + /// - Server's connection is broken, waiting for reconnect. + /// - Server connection request sent, waiting for server's response. + WaitingServer, + /// Service connection request sent, waiting for service's response. + WaitingService, + /// The upcall service is connected. + ServiceConnected, + /// The upcall channl is busy (request has been sent, but response has not + /// been received). + ServiceBusy, + /// Error state that cannot just reconnect to server. + ReconnectError, +} + +#[allow(clippy::large_enum_variant)] +/// Upcall client request of different services. +pub enum UpcallClientRequest { + /// Device manager's request. + DevMgr(DevMgrRequest), + #[cfg(test)] + /// Fake service's request. + FakeRequest, +} + +/// Upcall client response of different services. +#[derive(Debug, Eq, PartialEq)] +pub enum UpcallClientResponse { + /// Device manager's response. + DevMgr(DevMgrResponse), + /// Upcall client disconnected, and need to reconnect. + UpcallReset, + #[cfg(test)] + /// Fake service's response + FakeResponse, +} + +/// Shared info between upcall client and upcall epoll handler. 
+struct UpcallClientInfo { + service: S, + connector: VsockInnerConnector, + stream: Option>, + state: UpcallClientState, + result_callback: Option>, +} + +impl UpcallClientInfo { + fn server_connection_start(&mut self) -> Result<()> { + let mut stream = self + .connector + .connect() + .map_err(UpcallClientError::ServerConnect)?; + stream + .set_nonblocking(true) + .map_err(UpcallClientError::ServerConnect)?; + + let cmd = format!("CONNECT {SERVER_PORT}\n"); + stream + .write_all(&cmd.into_bytes()) + .map_err(UpcallClientError::ServerConnect)?; + + // drop the old stream + let _ = self.stream.replace(stream); + + Ok(()) + } + + fn server_connection_check(&mut self) -> Result<()> { + let mut buffer = [0; 50]; + let len = self + .stream + .as_mut() + .unwrap() + .read(&mut buffer) + .map_err(UpcallClientError::ServerConnect)?; + + if !(len > 2 && buffer[0..2] == [b'O', b'K']) { + return Err(UpcallClientError::InvalidMessage(format!( + "upcall server expect ok, but received {}", + String::from_utf8_lossy(&buffer[0..2]), + ))); + } + + Ok(()) + } + + fn service_connection_start(&mut self) -> Result<()> { + self.service.connection_start(self.stream.as_mut().unwrap()) + } + + fn service_connection_check(&mut self) -> Result<()> { + self.service.connection_check(self.stream.as_mut().unwrap()) + } + + fn send_request(&mut self, request: UpcallClientRequest) -> Result<()> { + self.service + .send_request(self.stream.as_mut().unwrap(), request) + } + + fn handle_response(&mut self) -> Result { + self.service.handle_response(self.stream.as_mut().unwrap()) + } + + fn set_state(&mut self, state: UpcallClientState) { + self.state = state; + } + + fn set_callback(&mut self, callback: Box) { + self.result_callback.replace(callback); + } + + fn consume_callback(&mut self, response: UpcallClientResponse) { + if let Some(cb) = self.result_callback.take() { + cb(response) + }; + } +} + +/// Upcall client's Implementation. 
+pub struct UpcallClient { + epoll_manager: EpollManager, + info: Arc>>, +} + +impl UpcallClient { + /// Create a new Upcall Client instance. + pub fn new( + connector: VsockInnerConnector, + epoll_manager: EpollManager, + service: S, + ) -> Result { + let info = UpcallClientInfo { + connector, + stream: None, + state: UpcallClientState::WaitingServer, + service, + result_callback: None, + }; + Ok(UpcallClient { + epoll_manager, + info: Arc::new(Mutex::new(info)), + }) + } + + /// Connect upcall client to upcall server. + pub fn connect(&mut self) -> Result<()> { + let handler = Box::new(UpcallEpollHandler::new(self.info.clone())?); + self.epoll_manager.add_subscriber(handler); + + Ok(()) + } + + fn send_request_inner( + &self, + request: UpcallClientRequest, + callback: Option>, + ) -> Result<()> { + let mut info = self.info.lock().unwrap(); + match info.state { + UpcallClientState::WaitingServer + | UpcallClientState::WaitingService + | UpcallClientState::ReconnectError => Err(UpcallClientError::UpcallIsNotConnected), + UpcallClientState::ServiceBusy => Err(UpcallClientError::UpcallIsBusy), + UpcallClientState::ServiceConnected => { + info.send_request(request)?; + info.set_state(UpcallClientState::ServiceBusy); + if let Some(cb) = callback { + info.set_callback(cb) + }; + Ok(()) + } + } + } + + /// Send request to upcall server, and get the response from callback + /// function. + pub fn send_request( + &self, + request: UpcallClientRequest, + callback: Box, + ) -> Result<()> { + self.send_request_inner(request, Some(callback)) + } + + /// Only send request to upcall server, and discard the response. + pub fn send_request_without_result(&self, request: UpcallClientRequest) -> Result<()> { + self.send_request_inner(request, None) + } + + /// Get the link state of upcall client. + pub fn get_state(&self) -> UpcallClientState { + self.info.lock().unwrap().state.clone() + } + + /// The upcall client is ready to send request to upcall server or not. 
+ pub fn is_ready(&self) -> bool { + self.get_state() == UpcallClientState::ServiceConnected + } +} + +/// Event handler of upcall client. +pub struct UpcallEpollHandler { + info: Arc>>, + reconnect_timer: TimerFd, + reconnect_time: u32, + in_reconnect: bool, +} + +impl UpcallEpollHandler { + fn new(info: Arc>>) -> Result { + let handler = UpcallEpollHandler { + info, + reconnect_timer: TimerFd::new().map_err(UpcallClientError::TimerFd)?, + reconnect_time: 0, + in_reconnect: false, + }; + let info = handler.info.clone(); + info.lock().unwrap().server_connection_start()?; + + Ok(handler) + } + + fn set_reconnect(&mut self) -> Result<()> { + if self.in_reconnect { + info!("upcall server is waiting for reconnect"); + return Ok(()); + } + self.in_reconnect = true; + + self.reconnect_timer + .set_state(TimerState::Disarmed, SetTimeFlags::Default); + + if self.reconnect_time > SERVER_MAX_RECONNECT_TIME { + error!("upcall server's max reconnect time exceed"); + return Ok(()); + } + + self.reconnect_timer.set_state( + TimerState::Oneshot(Duration::from_millis(SERVER_RECONNECT_DURATION_MS)), + SetTimeFlags::Default, + ); + + self.reconnect_time += 1; + Ok(()) + } + + fn handle_stream_event(&mut self, ops: &mut EventOps) { + let info = self.info.clone(); + let mut info = info.lock().unwrap(); + match info.state { + UpcallClientState::WaitingServer => { + if let Err(e) = info.server_connection_check() { + debug!("upcall connect server check failed, {}", e); + info.set_state(UpcallClientState::WaitingServer); + if let Err(e) = self.set_reconnect() { + error!("set reconnect error: {}", e); + info.set_state(UpcallClientState::ReconnectError); + } + } else { + info!("upcall connect server success"); + // It's time to connect to service when server is connected. 
+ if let Err(e) = info.service_connection_start() { + warn!("upcall connect service start failed {}", e); + info.set_state(UpcallClientState::WaitingServer); + if let Err(e) = self.set_reconnect() { + error!("set reconnect error: {}", e); + info.set_state(UpcallClientState::ReconnectError); + } + } else { + // only if both server connection check and service connection start are ok, change to next state + info.state = UpcallClientState::WaitingService; + } + } + } + UpcallClientState::WaitingService => { + if let Err(e) = info.service_connection_check() { + warn!("upcall connect service check failed, {}", e); + info.set_state(UpcallClientState::WaitingServer); + if let Err(e) = self.set_reconnect() { + error!("set reconnect error: {}", e); + info.set_state(UpcallClientState::ReconnectError); + } + } else { + info!("upcall connect service success"); + info.set_state(UpcallClientState::ServiceConnected); + } + } + UpcallClientState::ServiceBusy => match info.handle_response() { + Ok(response) => { + trace!("upcall handle response success"); + info.set_state(UpcallClientState::ServiceConnected); + info.consume_callback(response); + } + Err(e) => { + warn!("upcall response failed {}", e); + info.set_state(UpcallClientState::WaitingServer); + if let Err(e) = self.set_reconnect() { + error!("set reconnect error: {}", e); + info.set_state(UpcallClientState::ReconnectError); + } + } + }, + UpcallClientState::ServiceConnected | UpcallClientState::ReconnectError => { + error!("we should get message from event handler when connection state is `ServiceConnected`"); + } + } + + if self.in_reconnect { + // remove the old stream's fd in epoll and drop the old stream + if let Some(stream) = info.stream.as_ref() { + ops.remove(Events::new_raw(stream.as_raw_fd(), EventSet::IN)) + .unwrap(); + } + let _ = info.stream.take(); + + // consume the result callback before reconnect + info.consume_callback(UpcallClientResponse::UpcallReset); + } + } + + fn handle_reconnect_event(&mut self, 
ops: &mut EventOps) { + // we should clear the reconnect timer and flag first + self.in_reconnect = false; + self.reconnect_timer + .set_state(TimerState::Disarmed, SetTimeFlags::Default); + + let info = self.info.clone(); + let mut info = info.lock().unwrap(); + // reconnect to server + if let Err(e) = info.server_connection_start() { + warn!("upcall reconnect server /failed: {}", e); + if let Err(e) = self.set_reconnect() { + error!("set reconnect error: {}", e); + } + } + debug!("upcall reconnect server..."); + // add new stream's fn to epoll + if let Some(stream) = info.stream.as_ref() { + ops.add(Events::new_raw(stream.as_raw_fd(), EventSet::IN)) + .unwrap(); + } + } +} + +impl MutEventSubscriber for UpcallEpollHandler +where + S: UpcallClientService + Send + 'static, +{ + fn process(&mut self, events: Events, ops: &mut EventOps) { + trace!("UpcallEpollHandler: process"); + + let info = self.info.lock().unwrap(); + let stream_fd = info.stream.as_ref().map(|s| s.as_raw_fd()); + drop(info); + + let reconnect_fd = self.reconnect_timer.as_raw_fd(); + match events.fd() { + fd if Some(fd) == stream_fd => self.handle_stream_event(ops), + fd if fd == reconnect_fd => { + self.handle_reconnect_event(ops); + } + _ => error!("upcall epoll handler: unknown event"), + } + } + + fn init(&mut self, ops: &mut EventOps) { + trace!("UpcallEpollHandler: init"); + // add the reconnect time fd into epoll manager + ops.add(Events::new(&self.reconnect_timer, EventSet::IN)) + .unwrap(); + // add the first stream into epoll manager + let info = self.info.lock().unwrap(); + ops.add(Events::new_raw( + info.stream.as_ref().unwrap().as_raw_fd(), + EventSet::IN, + )) + .unwrap(); + } +} + +/// The definition of upcall client service. +pub trait UpcallClientService { + /// Start to connect to service. + fn connection_start(&self, stream: &mut Box) -> Result<()>; + /// Check service's connection callback. 
+ fn connection_check(&self, stream: &mut Box) -> Result<()>; + /// Send request to service. + fn send_request( + &self, + stream: &mut Box, + request: UpcallClientRequest, + ) -> Result<()>; + /// Service's response callback. + fn handle_response(&self, stream: &mut Box) -> Result; +} + +#[cfg(test)] +mod tests { + use dbs_utils::epoll_manager::SubscriberOps; + use dbs_virtio_devices::vsock::backend::{VsockBackend, VsockInnerBackend}; + + use super::*; + + #[derive(Default)] + struct FakeService { + connection_start_err: bool, + connection_check_err: bool, + handle_response_err: bool, + } + + impl UpcallClientService for FakeService { + fn connection_start(&self, stream: &mut Box) -> Result<()> { + if self.connection_start_err { + return Err(UpcallClientError::InvalidMessage(String::from( + "test failed", + ))); + } + stream + .write_all(&String::from("CONN START").into_bytes()) + .unwrap(); + Ok(()) + } + fn connection_check(&self, stream: &mut Box) -> Result<()> { + if self.connection_check_err { + return Err(UpcallClientError::InvalidMessage(String::from( + "test failed", + ))); + } + let mut buffer = [0; 10]; + stream.read_exact(&mut buffer).unwrap(); + assert_eq!(buffer, String::from("CONN CHECK").into_bytes().as_slice()); + Ok(()) + } + fn send_request( + &self, + stream: &mut Box, + _request: UpcallClientRequest, + ) -> Result<()> { + stream + .write_all(&String::from("TEST REQ").into_bytes()) + .unwrap(); + Ok(()) + } + + fn handle_response( + &self, + stream: &mut Box, + ) -> Result { + if self.handle_response_err { + return Err(UpcallClientError::InvalidMessage(String::from( + "test failed", + ))); + } + let mut buffer = [0; 9]; + stream.read_exact(&mut buffer).unwrap(); + assert_eq!(buffer, String::from("TEST RESP").into_bytes().as_slice()); + Ok(UpcallClientResponse::FakeResponse) + } + } + + fn get_upcall_client_info() -> (VsockInnerBackend, UpcallClientInfo) { + let vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = 
vsock_backend.get_connector(); + let upcall_client_info = UpcallClientInfo { + service: FakeService::default(), + connector, + stream: None, + state: UpcallClientState::WaitingServer, + result_callback: None, + }; + (vsock_backend, upcall_client_info) + } + + #[test] + fn test_upcall_client_info_server_connection_start_and_check() { + let (mut vsock_backend, mut info) = get_upcall_client_info(); + + assert!(info.server_connection_start().is_ok()); + assert!(info.stream.is_some()); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + assert_eq!( + read_buffer, + format!("CONNECT {SERVER_PORT}\n",).into_bytes() + ); + + let writer_buffer = String::from("ERR").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + assert!(info.server_connection_check().is_err()); + + let writer_buffer = String::from("OK 1024\n").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + assert!(info.server_connection_check().is_ok()); + } + + #[test] + fn test_upcall_client_info_service_connection() { + let (mut vsock_backend, mut info) = get_upcall_client_info(); + info.server_connection_start().unwrap(); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + assert!(info.service_connection_start().is_ok()); + let mut read_buffer = vec![0; 10]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + assert_eq!( + read_buffer, + String::from("CONN START").into_bytes().as_slice() + ); + + let writer_buffer = String::from("CONN CHECK").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + assert!(info.service_connection_check().is_ok()); + } + + #[test] + fn test_upcall_client_info_request_and_response() { + let (mut vsock_backend, mut info) = get_upcall_client_info(); + info.server_connection_start().unwrap(); + + let mut 
inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + assert!(info.send_request(UpcallClientRequest::FakeRequest).is_ok()); + let mut read_buffer = vec![0; 8]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + assert_eq!( + read_buffer, + String::from("TEST REQ").into_bytes().as_slice() + ); + + let writer_buffer = String::from("TEST RESP").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + assert!(info.handle_response().is_ok()); + } + + #[test] + fn test_upcall_client_info_set_state() { + let (_, mut info) = get_upcall_client_info(); + + info.set_state(UpcallClientState::WaitingServer); + assert_eq!(info.state, UpcallClientState::WaitingServer); + + info.set_state(UpcallClientState::ReconnectError); + assert_eq!(info.state, UpcallClientState::ReconnectError); + } + + #[test] + fn test_upcall_client_info_callback() { + let (_, mut info) = get_upcall_client_info(); + assert!(info.result_callback.is_none()); + + let callbacked = Arc::new(Mutex::new(None)); + let callbacked_ = callbacked.clone(); + info.set_callback(Box::new(move |resp| { + *callbacked_.lock().unwrap() = Some(resp); + })); + assert!(info.result_callback.is_some()); + + info.consume_callback(UpcallClientResponse::FakeResponse); + assert!(info.result_callback.is_none()); + assert_eq!( + *callbacked.lock().unwrap(), + Some(UpcallClientResponse::FakeResponse) + ); + } + + fn get_upcall_client() -> (VsockInnerBackend, UpcallClient) { + let vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = vsock_backend.get_connector(); + let epoll_manager = EpollManager::default(); + let upcall_client = + UpcallClient::new(connector, epoll_manager, FakeService::default()).unwrap(); + + (vsock_backend, upcall_client) + } + + #[test] + fn test_upcall_client_connect() { + let (mut vsock_backend, mut upcall_client) = get_upcall_client(); + + assert!(upcall_client.connect().is_ok()); 
+ + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + assert_eq!(read_buffer, format!("CONNECT {SERVER_PORT}\n").into_bytes()); + } + + #[allow(clippy::mutex_atomic)] + #[allow(clippy::redundant_clone)] + #[test] + fn test_upcall_client_send_request() { + let (mut vsock_backend, upcall_client) = get_upcall_client(); + let info = upcall_client.info.clone(); + let connector = vsock_backend.get_connector(); + let outer_stream = connector.connect().unwrap(); + info.lock().unwrap().stream = Some(outer_stream); + let mut inner_stream = vsock_backend.accept().unwrap(); + + let got_response = Arc::new(Mutex::new(false)); + // assume service is connected + { + let mut i = info.lock().unwrap(); + i.set_state(UpcallClientState::ServiceConnected); + } + + let got_response_ = got_response.clone(); + assert!(upcall_client + .send_request( + UpcallClientRequest::FakeRequest, + Box::new(move |_| { + *got_response_.lock().unwrap() = true; + }), + ) + .is_ok()); + assert!(info.lock().unwrap().result_callback.is_some()); + + let mut read_buffer = vec![0; 8]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + let writer_buffer = String::from("TEST RESP").into_bytes(); + assert!(inner_stream.write_all(writer_buffer.as_slice()).is_ok()); + let response = info.lock().unwrap().handle_response().unwrap(); + info.lock().unwrap().consume_callback(response); + assert!(info.lock().unwrap().result_callback.is_none()); + + assert!(*got_response.lock().unwrap()); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn test_upcall_client_send_request_without_result() { + let (mut vsock_backend, upcall_client) = get_upcall_client(); + let info = upcall_client.info.clone(); + let connector = vsock_backend.get_connector(); + let outer_stream = connector.connect().unwrap(); + info.lock().unwrap().stream = Some(outer_stream); + let mut inner_stream = 
vsock_backend.accept().unwrap(); + + // assume service is connected + { + let mut i = info.lock().unwrap(); + i.set_state(UpcallClientState::ServiceConnected); + } + + assert!(upcall_client + .send_request_without_result(UpcallClientRequest::FakeRequest) + .is_ok()); + assert!(info.lock().unwrap().result_callback.is_none()); + + let mut read_buffer = vec![0; 8]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + let writer_buffer = String::from("TEST RESP").into_bytes(); + assert!(inner_stream.write_all(writer_buffer.as_slice()).is_ok()); + assert!(info.lock().unwrap().handle_response().is_ok()); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn test_upcall_client_send_request_error() { + let (_, upcall_client) = get_upcall_client(); + let info = upcall_client.info.clone(); + + let do_test = || { + assert!(upcall_client + .send_request_inner(UpcallClientRequest::FakeRequest, None) + .is_err()); + }; + + { + let mut i = info.lock().unwrap(); + i.set_state(UpcallClientState::WaitingServer); + } + do_test(); + + { + let mut i = info.lock().unwrap(); + i.set_state(UpcallClientState::WaitingService); + } + do_test(); + + { + let mut i = info.lock().unwrap(); + i.set_state(UpcallClientState::ReconnectError); + } + do_test(); + + { + let mut i = info.lock().unwrap(); + i.set_state(UpcallClientState::ServiceBusy); + } + do_test(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn test_upcall_client_get_state() { + let (_, upcall_client) = get_upcall_client(); + + assert_eq!(upcall_client.get_state(), UpcallClientState::WaitingServer); + + let info = upcall_client.info.clone(); + info.lock().unwrap().state = UpcallClientState::ServiceBusy; + assert_eq!(upcall_client.get_state(), UpcallClientState::ServiceBusy); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn test_upcall_client_is_ready() { + let (_, upcall_client) = get_upcall_client(); + + assert!(!upcall_client.is_ready()); + + let info = upcall_client.info.clone(); + 
info.lock().unwrap().state = UpcallClientState::ServiceConnected; + assert!(upcall_client.is_ready()); + } + + fn get_upcall_epoll_handler() -> (VsockInnerBackend, UpcallEpollHandler) { + let (inner_backend, info) = get_upcall_client_info(); + let epoll_handler = UpcallEpollHandler::new(Arc::new(Mutex::new(info))).unwrap(); + + (inner_backend, epoll_handler) + } + + #[test] + fn test_upcall_epoll_handler_set_reconnect() { + let (_, mut epoll_handler) = get_upcall_epoll_handler(); + + assert!(epoll_handler.set_reconnect().is_ok()); + assert_eq!(epoll_handler.reconnect_time, 1); + assert!(epoll_handler.in_reconnect); + match epoll_handler.reconnect_timer.get_state() { + TimerState::Oneshot(dur) => { + assert!(dur.as_millis() < 10 && dur.as_millis() > 5); + } + _ => unreachable!(), + } + } + + #[test] + fn test_upcall_epoll_handler_stream_event() { + // Waiting Server state, server connection check error + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::WaitingServer); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!(info.lock().unwrap().state, UpcallClientState::WaitingServer); + assert_eq!(epoll_handler.reconnect_time, 1); + assert!(epoll_handler.in_reconnect); + } + + // Waiting Server state, server connection check success, but 
service + // connection start error + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::WaitingServer); + info.lock().unwrap().service.connection_start_err = true; + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + let writer_buffer = String::from("OK 1024\n").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!(info.lock().unwrap().state, UpcallClientState::WaitingServer); + assert_eq!(epoll_handler.reconnect_time, 1); + assert!(epoll_handler.in_reconnect); + } + + // Waiting Server state, server connection check success, and service + // connection start success, too + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::WaitingServer); + + let mut 
inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + let writer_buffer = String::from("OK 1024\n").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!( + info.lock().unwrap().state, + UpcallClientState::WaitingService + ); + } + + // Waiting Service state, service connection check error + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::WaitingService); + info.lock().unwrap().service.connection_check_err = true; + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!(info.lock().unwrap().state, UpcallClientState::WaitingServer); + assert_eq!(epoll_handler.reconnect_time, 1); + assert!(epoll_handler.in_reconnect); + } + + // Waiting Service state, service connection check ok + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = 
info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::WaitingService); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + let writer_buffer = String::from("CONN CHECK").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!( + info.lock().unwrap().state, + UpcallClientState::ServiceConnected + ); + } + + // Service Busy state, handle response err + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::ServiceBusy); + info.lock().unwrap().service.handle_response_err = true; + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!(info.lock().unwrap().state, UpcallClientState::WaitingServer); + assert_eq!(epoll_handler.reconnect_time, 1); + assert!(epoll_handler.in_reconnect); + } + + // Service Busy state, handle response ok + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let 
mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::ServiceBusy); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + let writer_buffer = String::from("TEST RESP").into_bytes(); + inner_stream.write_all(&writer_buffer).unwrap(); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!( + info.lock().unwrap().state, + UpcallClientState::ServiceConnected + ); + } + + // Service Connected state + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::ServiceConnected); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!( + info.lock().unwrap().state, + UpcallClientState::ServiceConnected + ); + } + + // Reconnect Error state + { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = 
EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (mut vsock_backend, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + let info = epoll_handler.info.clone(); + info.lock() + .unwrap() + .set_state(UpcallClientState::ReconnectError); + + let mut inner_stream = vsock_backend.accept().unwrap(); + let mut read_buffer = vec![0; 12]; + assert!(inner_stream.read_exact(&mut read_buffer).is_ok()); + + epoll_handler.handle_stream_event(&mut event_ops); + assert_eq!( + info.lock().unwrap().state, + UpcallClientState::ReconnectError + ); + } + } + + #[test] + fn test_upcall_epoll_handler_reconnect_event() { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (_, mut epoll_handler) = get_upcall_epoll_handler(); + + epoll_handler.handle_reconnect_event(&mut event_ops); + } + + #[test] + fn test_upcall_epoll_handler_process() { + let (_, epoll_handler) = get_upcall_epoll_handler(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(epoll_handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_ops = inner_mgr.event_ops(id).unwrap(); + let (_, mut epoll_handler) = get_upcall_epoll_handler(); + let info = epoll_handler.info.clone(); + let stream_fd = info.lock().unwrap().stream.as_ref().unwrap().as_raw_fd(); + let reconnect_fd = epoll_handler.reconnect_timer.as_raw_fd(); + let event_set = EventSet::EDGE_TRIGGERED; + event_ops + .add(Events::new_raw(stream_fd, EventSet::IN)) + .unwrap(); + + // test for stream 
event + let events = Events::new_raw(stream_fd, event_set); + epoll_handler.process(events, &mut event_ops); + + // test for reconnect event + let events = Events::new_raw(reconnect_fd, event_set); + epoll_handler.process(events, &mut event_ops); + } +} diff --git a/src/dragonball/src/dbs_utils/Cargo.toml b/src/dragonball/src/dbs_utils/Cargo.toml new file mode 100644 index 000000000..ae2267ffc --- /dev/null +++ b/src/dragonball/src/dbs_utils/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "dbs-utils" +version = "0.2.1" +authors = ["Alibaba Dragonball Team"] +description = "helpers and utilities used by dragonball-sandbox components" +license = "Apache-2.0 AND BSD-3-Clause" +edition = "2018" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox" +keywords = ["dragonball", "secure-sandbox", "utils"] +readme = "README.md" + +[dependencies] +anyhow = "1.0" +event-manager = { version = "0.2.1", features = [ "remote_endpoint" ] } +libc = "0.2.39" +log = "0.4.14" +serde = { version = "1.0.27", features = ["derive", "rc"] } +thiserror = "1.0" +timerfd = "1.0" +vmm-sys-util = "0.11.0" + +[dev-dependencies] +serde_json = "1.0.9" diff --git a/src/dragonball/src/dbs_utils/LICENSE b/src/dragonball/src/dbs_utils/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_utils/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_utils/README.md b/src/dragonball/src/dbs_utils/README.md new file mode 100644 index 000000000..fae004e49 --- /dev/null +++ b/src/dragonball/src/dbs_utils/README.md @@ -0,0 +1,13 @@ +# dbs-utils + +This crate is a collection of modules that provides helpers and utilities used by multiple `dragonball-sandbox` components. + +And also provides some wrappers for [`vmm-sys-util`](https://github.com/rust-vmm/vmm-sys-util). 
+ +## Acknowledgement + +Part of the code is derived from the [Firecracker](https://github.com/firecracker-microvm/firecracker) project. + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/src/dragonball/src/dbs_utils/THIRD-PARTY b/src/dragonball/src/dbs_utils/THIRD-PARTY new file mode 120000 index 000000000..301d0a498 --- /dev/null +++ b/src/dragonball/src/dbs_utils/THIRD-PARTY @@ -0,0 +1 @@ +../../THIRD-PARTY \ No newline at end of file diff --git a/src/dragonball/src/dbs_utils/src/epoll_manager.rs b/src/dragonball/src/dbs_utils/src/epoll_manager.rs new file mode 100644 index 000000000..b27c523af --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/epoll_manager.rs @@ -0,0 +1,174 @@ +// Copyright 2020 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! A simple wrapper over event_manager::EventManager to solve possible deadlock. + +use anyhow::{anyhow, Result}; +use std::sync::{Arc, Mutex}; + +pub use event_manager::{ + Error, EventManager, EventOps, EventSet, Events, MutEventSubscriber, RemoteEndpoint, + SubscriberId, SubscriberOps, +}; + +/// Type of epoll subscriber. +pub type EpollSubscriber = Box; + +type EpollManagerImpl = Arc>>; + +/// A wrapper struct over EventManager to solve possible deadlock. +/// +/// It's a rather tough topic to deal with the epoll event manager in rust way. +/// The event_manager::EventManager is designed for single-threaded environment and it leaves +/// the task for concurrent access to the clients. +/// There are two types of threads involved, epoll worker thread and vCPU threads. +/// To reduce overhead, the epoll worker thread calls epoll::wait() without timeout, so the +/// worker thread will hold the EpollManagerImpl::Mutex for undetermined periods. When the vCPU +/// threads tries to activate virtio devices, they need to acquire the same EpollManagerImpl::Mutex. 
+/// Thus the vCPU threads may block for an undetermined time. To solve this issue, we perform +/// an kick()/try_lock() loop to wake up the epoll worker thread from sleeping. +#[derive(Clone)] +pub struct EpollManager { + pub mgr: EpollManagerImpl, + endpoint: Arc>>, +} + +impl EpollManager { + /// Add a new epoll event subscriber. + pub fn add_subscriber(&self, handler: EpollSubscriber) -> SubscriberId { + let _ = self.endpoint.lock().unwrap().kick(); + if let Ok(mut mgr) = self.mgr.try_lock() { + mgr.add_subscriber(handler) + } else { + return self + .endpoint + .lock() + .unwrap() + .call_blocking::<_, _, Error>(move |mgr| Ok(mgr.add_subscriber(handler))) + .unwrap(); + } + } + + /// Remove a given epoll event subscriber. + pub fn remove_subscriber(&mut self, subscriber_id: SubscriberId) -> Result { + let mut mgr = self + .mgr + .lock() + .map_err(|e| anyhow!("EventManager lock fail. {:?}", e))?; + mgr.remove_subscriber(subscriber_id) + .map_err(|e| anyhow!("remove subscriber err. {:?}", e)) + } + + /// Add an epoll event to be monitored. + pub fn add_event( + &self, + subscriber_id: SubscriberId, + events: Events, + ) -> std::result::Result<(), Error> { + loop { + let _ = self.endpoint.lock().unwrap().kick(); + if let Ok(mut mgr) = self.mgr.try_lock() { + let mut ops = mgr.event_ops(subscriber_id)?; + return ops.add(events); + } + } + } + + /// Run the epoll polling loop. + pub fn handle_events(&self, timeout: i32) -> std::result::Result { + // Do not expect poisoned lock. + let mut guard = self.mgr.lock().unwrap(); + + guard.run_with_timeout(timeout) + } +} + +impl Default for EpollManager { + /// Create a new epoll manager. 
+ fn default() -> Self { + let mgr = EventManager::new().expect("epoll_manager: failed create new instance"); + let endpoint = Arc::new(Mutex::new(mgr.remote_endpoint())); + + EpollManager { + mgr: Arc::new(Mutex::new(mgr)), + endpoint, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::os::unix::io::AsRawFd; + use vmm_sys_util::{epoll::EventSet, eventfd::EventFd}; + + struct DummySubscriber { + pub event: EventFd, + } + + impl DummySubscriber { + fn new() -> Self { + Self { + event: EventFd::new(0).unwrap(), + } + } + } + + impl MutEventSubscriber for DummySubscriber { + fn process(&mut self, events: Events, _ops: &mut EventOps) { + let source = events.fd(); + let event_set = events.event_set(); + assert_ne!(source, self.event.as_raw_fd()); + match event_set { + EventSet::IN => { + unreachable!() + } + EventSet::OUT => { + self.event.read().unwrap(); + } + _ => { + unreachable!() + } + } + } + + fn init(&mut self, _ops: &mut EventOps) {} + } + + #[test] + fn test_epoll_manager() { + let mut epoll_manager = EpollManager::default(); + let epoll_manager_clone = epoll_manager.clone(); + let thread = std::thread::spawn(move || loop { + let count = epoll_manager_clone.handle_events(-1).unwrap(); + if count == 0 { + continue; + } + assert_eq!(count, 1); + break; + }); + let handler = DummySubscriber::new(); + let event = handler.event.try_clone().unwrap(); + let id = epoll_manager.add_subscriber(Box::new(handler)); + + thread.join().unwrap(); + + epoll_manager + .add_event(id, Events::new(&event, EventSet::OUT)) + .unwrap(); + event.write(1).unwrap(); + + let epoll_manager_clone = epoll_manager.clone(); + let thread = std::thread::spawn(move || loop { + let count = epoll_manager_clone.handle_events(-1).unwrap(); + if count == 0 { + continue; + } + assert_eq!(count, 2); + break; + }); + + thread.join().unwrap(); + epoll_manager.remove_subscriber(id).unwrap(); + } +} diff --git a/src/dragonball/src/dbs_utils/src/lib.rs 
b/src/dragonball/src/dbs_utils/src/lib.rs new file mode 100644 index 000000000..a3013e2d2 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/lib.rs @@ -0,0 +1,9 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod epoll_manager; +pub mod metric; +pub mod net; +pub mod rate_limiter; +pub mod time; diff --git a/src/dragonball/src/dbs_utils/src/metric.rs b/src/dragonball/src/dbs_utils/src/metric.rs new file mode 100644 index 000000000..cfef025f0 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/metric.rs @@ -0,0 +1,199 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Defines the public components of the metric system. +//! +//! # Design +//! The main design goals of this system are: +//! * Use lockless operations, preferably ones that don't require anything other than +//! simple reads/writes being atomic. +//! * Exploit interior mutability and atomics being Sync to allow all methods (including the ones +//! which are effectively mutable) to be callable on a global non-mut static. +//! * Rely on `serde` to provide the actual serialization for writing the metrics. +//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, +//! to avoid having to initialize everything by hand. +//! +//! The system implements 2 types of metrics: +//! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter +//! (i.e the number of times an API request failed). These metrics are reset upon flush. +//! * Shared Store Metrics (SharedStoreMetrics) - are targeted at keeping a persistent value, it is not +//! intended to act as a counter (i.e for measure the process start up time for example). +//! +//! 
The current approach for the `SharedIncMetrics` type is to store two values (current and previous) +//! and compute the delta between them each time we do a flush (i.e by serialization). There are a number of advantages +//! to this approach, including: +//! * We don't have to introduce an additional write (to reset the value) from the thread which +//! does to actual writing, so less synchronization effort is required. +//! * We don't have to worry at all that much about losing some data if writing fails for a while +//! (this could be a concern, I guess). +//! If if turns out this approach is not really what we want, it's pretty easy to resort to +//! something else, while working behind the same interface. + +use std::sync::atomic::{AtomicUsize, Ordering}; + +use serde::{Serialize, Serializer}; + +/// Used for defining new types of metrics that act as a counter (i.e they are continuously updated by +/// incrementing their value). +pub trait IncMetric { + /// Adds `value` to the current counter. + fn add(&self, value: usize); + /// Increments by 1 unit the current counter. + fn inc(&self) { + self.add(1); + } + /// Returns current value of the counter. + fn count(&self) -> usize; +} + +/// Representation of a metric that is expected to be incremented from more than one thread, so more +/// synchronization is necessary. +// It's currently used for vCPU metrics. An alternative here would be +// to have one instance of every metric for each thread, and to +// aggregate them when writing. However this probably overkill unless we have a lot of vCPUs +// incrementing metrics very often. Still, it's there if we ever need it :-s +// We will be keeping two values for each metric for being able to reset +// counters on each metric. 
+// 1st member - current value being updated +// 2nd member - old value that gets the current value whenever metrics is flushed to disk +#[derive(Default)] +pub struct SharedIncMetric(AtomicUsize, AtomicUsize); + +impl IncMetric for SharedIncMetric { + // While the order specified for this operation is still Relaxed, the actual instruction will + // be an asm "LOCK; something" and thus atomic across multiple threads, simply because of the + // fetch_and_add (as opposed to "store(load() + 1)") implementation for atomics. + // TODO: would a stronger ordering make a difference here? + fn add(&self, value: usize) { + self.0.fetch_add(value, Ordering::Relaxed); + } + + fn count(&self) -> usize { + self.0.load(Ordering::Relaxed) + } +} + +impl Serialize for SharedIncMetric { + /// Reset counters of each metrics. Here we suppose that Serialize's goal is to help with the + /// flushing of metrics. + /// !!! Any print of the metrics will also reset them. Use with caution !!! + fn serialize(&self, serializer: S) -> Result { + // There's no serializer.serialize_usize() for some reason :( + let snapshot = self.0.load(Ordering::Relaxed); + let res = serializer.serialize_u64(snapshot as u64 - self.1.load(Ordering::Relaxed) as u64); + + if res.is_ok() { + self.1.store(snapshot, Ordering::Relaxed); + } + res + } +} + +/// Used for defining new types of metrics that do not need a counter and act as a persistent indicator. +pub trait StoreMetric { + /// Returns current value of the counter. + fn fetch(&self) -> usize; + /// Stores `value` to the current counter. + fn store(&self, value: usize); +} + +/// Representation of a metric that is expected to hold a value that can be accessed +/// from more than one thread, so more synchronization is necessary. 
+#[derive(Default)] +pub struct SharedStoreMetric(AtomicUsize); + +impl StoreMetric for SharedStoreMetric { + fn fetch(&self) -> usize { + self.0.load(Ordering::Relaxed) + } + + fn store(&self, value: usize) { + self.0.store(value, Ordering::Relaxed); + } +} + +impl IncMetric for SharedStoreMetric { + fn add(&self, value: usize) { + // This operation wraps around on overflow. + self.0.fetch_add(value, Ordering::Relaxed); + } + + fn count(&self) -> usize { + self.0.load(Ordering::Relaxed) + } +} + +impl Serialize for SharedStoreMetric { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_u64(self.0.load(Ordering::Relaxed) as u64) + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::fence; + use std::sync::Arc; + use std::thread; + + use super::*; + + #[test] + fn test_shared_inc_metric() { + let metric = Arc::new(SharedIncMetric::default()); + + // We're going to create a number of threads that will attempt to increase this metric + // in parallel. If everything goes fine we still can't be sure the synchronization works, + // but if something fails, then we definitely have a problem :-s + + const NUM_THREADS_TO_SPAWN: usize = 4; + const NUM_INCREMENTS_PER_THREAD: usize = 10_0000; + const M2_INITIAL_COUNT: usize = 123; + + metric.add(M2_INITIAL_COUNT); + + let mut v = Vec::with_capacity(NUM_THREADS_TO_SPAWN); + + for _ in 0..NUM_THREADS_TO_SPAWN { + let r = metric.clone(); + v.push(thread::spawn(move || { + for _ in 0..NUM_INCREMENTS_PER_THREAD { + r.inc(); + } + })); + } + + for handle in v { + handle.join().unwrap(); + } + + assert_eq!( + metric.count(), + M2_INITIAL_COUNT + NUM_THREADS_TO_SPAWN * NUM_INCREMENTS_PER_THREAD + ); + } + + #[test] + fn test_shared_store_metric() { + let m1 = Arc::new(SharedStoreMetric::default()); + m1.store(1); + fence(Ordering::SeqCst); + assert_eq!(1, m1.fetch()); + } + + #[test] + fn test_serialize() { + let s = serde_json::to_string(&SharedIncMetric( + AtomicUsize::new(123), + AtomicUsize::new(111), + 
)); + assert!(s.is_ok()); + } + + #[test] + fn test_wraps_around() { + let m = SharedStoreMetric(AtomicUsize::new(usize::MAX)); + m.add(1); + assert_eq!(m.count(), 0); + } +} diff --git a/src/dragonball/src/dbs_utils/src/net/mac.rs b/src/dragonball/src/dbs_utils/src/net/mac.rs new file mode 100644 index 000000000..1c618694f --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/mac.rs @@ -0,0 +1,161 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::fmt; +use std::result::Result; + +use serde::de::{Deserialize, Deserializer, Error}; +use serde::ser::{Serialize, Serializer}; + +/// Segments of MAC address separated by ":". +pub const MAC_ADDR_LEN: usize = 6; + +#[derive(Debug)] +pub enum MacError { + MacLengthError(usize), +} + +/// MAC address for ethernet NIC. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct MacAddr { + bytes: [u8; MAC_ADDR_LEN], +} + +impl MacAddr { + /// Parse a string into an MacAddr object. + /// The error contains the str that failed to be parsed, for nicer error message generation. + pub fn parse_str(s: &S) -> Result + where + S: AsRef + ?Sized, + { + let v: Vec<&str> = s.as_ref().split(':').collect(); + let mut bytes = [0u8; MAC_ADDR_LEN]; + + if v.len() != MAC_ADDR_LEN { + return Err(s.as_ref()); + } + + for i in 0..MAC_ADDR_LEN { + if v[i].len() != 2 { + return Err(s.as_ref()); + } + bytes[i] = u8::from_str_radix(v[i], 16).map_err(|_| s.as_ref())?; + } + + Ok(MacAddr { bytes }) + } + + /// Create a MacAddr object from raw bytes unchecked. + /// + /// Does not check whether src.len() == MAC_ADDR_LEN. 
+ #[inline] + pub fn from_bytes_unchecked(src: &[u8]) -> MacAddr { + let mut bytes = [0u8; MAC_ADDR_LEN]; + let _ = &bytes[..].copy_from_slice(src); + + MacAddr { bytes } + } + + /// Create a MacAddr object from raw bytes. + #[inline] + pub fn from_bytes(src: &[u8]) -> Result { + if src.len() != MAC_ADDR_LEN { + return Err(MacError::MacLengthError(src.len())); + } + Ok(MacAddr::from_bytes_unchecked(src)) + } + + /// Get raw bytes of the MacAddr object. + #[inline] + pub fn get_bytes(&self) -> &[u8] { + &self.bytes + } +} + +impl fmt::Display for MacAddr { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let b = &self.bytes; + write!( + f, + "{:02x}:{:02x}:{:02x}:{:02x}:{:02x}:{:02x}", + b[0], b[1], b[2], b[3], b[4], b[5] + ) + } +} + +impl Serialize for MacAddr { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + self.to_string().serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for MacAddr { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + MacAddr::parse_str(&s).map_err(|_| D::Error::custom("The provided MAC address is invalid.")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_mac_addr() { + // too long + assert!(MacAddr::parse_str("aa:aa:aa:aa:aa:aa:aa").is_err()); + + // invalid hex + assert!(MacAddr::parse_str("aa:aa:aa:aa:aa:ax").is_err()); + + // single digit mac address component should be invalid + assert!(MacAddr::parse_str("aa:aa:aa:aa:aa:b").is_err()); + + // components with more than two digits should also be invalid + assert!(MacAddr::parse_str("aa:aa:aa:aa:aa:bbb").is_err()); + + let mac = MacAddr::parse_str("12:34:56:78:9a:BC").unwrap(); + + println!("parsed MAC address: {mac}"); + + let bytes = mac.get_bytes(); + assert_eq!(bytes, [0x12u8, 0x34, 0x56, 0x78, 0x9a, 0xbc]); + } + + #[test] + fn test_from_bytes() { + let src1 = [0x01, 0x02, 0x03, 0x04, 0x05]; + let src2 = [0x01, 0x02, 0x03, 0x04, 0x05, 
0x06]; + let src3 = [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]; + + assert!(MacAddr::from_bytes(&src1[..]).is_err()); + + let x = MacAddr::from_bytes(&src2[..]).unwrap(); + assert_eq!(x.to_string(), String::from("01:02:03:04:05:06")); + + assert!(MacAddr::from_bytes(&src3[..]).is_err()); + } + + #[cfg(feature = "with-serde")] + #[test] + fn test_mac_addr_serialization_and_deserialization() { + let mac: MacAddr = + serde_json::from_str("\"12:34:56:78:9a:bc\"").expect("MacAddr deserialization failed."); + + let bytes = mac.get_bytes(); + assert_eq!(bytes, [0x12u8, 0x34, 0x56, 0x78, 0x9a, 0xbc]); + + let s = serde_json::to_string(&mac).expect("MacAddr serialization failed."); + assert_eq!(s, "\"12:34:56:78:9a:bc\""); + } +} diff --git a/src/dragonball/src/dbs_utils/src/net/mod.rs b/src/dragonball/src/dbs_utils/src/net/mod.rs new file mode 100644 index 000000000..5260f0e59 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/mod.rs @@ -0,0 +1,20 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +#![deny(missing_docs)] +//! # Network-related utilities +//! +//! Provides tools for representing and handling network related concepts like MAC addresses and +//! network interfaces. + +mod mac; +pub use self::mac::{MacAddr, MAC_ADDR_LEN}; + +mod tap; +pub use self::tap::{Error as TapError, Tap}; + +pub mod net_gen; diff --git a/src/dragonball/src/dbs_utils/src/net/net_gen/if_tun.rs b/src/dragonball/src/dbs_utils/src/net/net_gen/if_tun.rs new file mode 100644 index 000000000..c5ce74e17 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/net_gen/if_tun.rs @@ -0,0 +1,603 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. +// Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +// SPDX-License-Identifier: Apache-2.0 + +/* automatically generated by rust-bindgen */ + +#[repr(C)] +#[derive(Default)] +pub struct __IncompleteArrayField(::std::marker::PhantomData); +impl __IncompleteArrayField { + #[inline] + pub fn new() -> Self { + __IncompleteArrayField(::std::marker::PhantomData) + } + #[inline] + pub unsafe fn as_ptr(&self) -> *const T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_mut_ptr(&mut self) -> *mut T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_slice(&self, len: usize) -> &[T] { + ::std::slice::from_raw_parts(self.as_ptr(), len) + } + #[inline] + pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { + ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) + } +} +impl ::std::fmt::Debug for __IncompleteArrayField { + fn fmt(&self, fmt: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + fmt.write_str("__IncompleteArrayField") + } +} +impl ::std::clone::Clone for __IncompleteArrayField { + #[inline] + fn clone(&self) -> Self { + Self::new() + } +} +impl ::std::marker::Copy for __IncompleteArrayField {} +pub const __BITS_PER_LONG: ::std::os::raw::c_uint = 64; +pub const __FD_SETSIZE: ::std::os::raw::c_uint = 1024; +pub const ETH_ALEN: ::std::os::raw::c_uint = 6; +pub const ETH_HLEN: ::std::os::raw::c_uint = 14; +pub const ETH_ZLEN: ::std::os::raw::c_uint = 60; +pub const ETH_DATA_LEN: ::std::os::raw::c_uint = 1500; +pub const ETH_FRAME_LEN: ::std::os::raw::c_uint = 1514; +pub const ETH_FCS_LEN: ::std::os::raw::c_uint = 4; +pub const ETH_P_LOOP: ::std::os::raw::c_uint = 96; +pub const ETH_P_PUP: ::std::os::raw::c_uint = 512; +pub const ETH_P_PUPAT: ::std::os::raw::c_uint = 513; +pub const ETH_P_TSN: ::std::os::raw::c_uint = 8944; +pub const ETH_P_IP: ::std::os::raw::c_uint = 2048; +pub const ETH_P_X25: ::std::os::raw::c_uint = 2053; +pub const ETH_P_ARP: 
::std::os::raw::c_uint = 2054; +pub const ETH_P_BPQ: ::std::os::raw::c_uint = 2303; +pub const ETH_P_IEEEPUP: ::std::os::raw::c_uint = 2560; +pub const ETH_P_IEEEPUPAT: ::std::os::raw::c_uint = 2561; +pub const ETH_P_BATMAN: ::std::os::raw::c_uint = 17157; +pub const ETH_P_DEC: ::std::os::raw::c_uint = 24576; +pub const ETH_P_DNA_DL: ::std::os::raw::c_uint = 24577; +pub const ETH_P_DNA_RC: ::std::os::raw::c_uint = 24578; +pub const ETH_P_DNA_RT: ::std::os::raw::c_uint = 24579; +pub const ETH_P_LAT: ::std::os::raw::c_uint = 24580; +pub const ETH_P_DIAG: ::std::os::raw::c_uint = 24581; +pub const ETH_P_CUST: ::std::os::raw::c_uint = 24582; +pub const ETH_P_SCA: ::std::os::raw::c_uint = 24583; +pub const ETH_P_TEB: ::std::os::raw::c_uint = 25944; +pub const ETH_P_RARP: ::std::os::raw::c_uint = 32821; +pub const ETH_P_ATALK: ::std::os::raw::c_uint = 32923; +pub const ETH_P_AARP: ::std::os::raw::c_uint = 33011; +pub const ETH_P_8021Q: ::std::os::raw::c_uint = 33024; +pub const ETH_P_IPX: ::std::os::raw::c_uint = 33079; +pub const ETH_P_IPV6: ::std::os::raw::c_uint = 34525; +pub const ETH_P_PAUSE: ::std::os::raw::c_uint = 34824; +pub const ETH_P_SLOW: ::std::os::raw::c_uint = 34825; +pub const ETH_P_WCCP: ::std::os::raw::c_uint = 34878; +pub const ETH_P_MPLS_UC: ::std::os::raw::c_uint = 34887; +pub const ETH_P_MPLS_MC: ::std::os::raw::c_uint = 34888; +pub const ETH_P_ATMMPOA: ::std::os::raw::c_uint = 34892; +pub const ETH_P_PPP_DISC: ::std::os::raw::c_uint = 34915; +pub const ETH_P_PPP_SES: ::std::os::raw::c_uint = 34916; +pub const ETH_P_LINK_CTL: ::std::os::raw::c_uint = 34924; +pub const ETH_P_ATMFATE: ::std::os::raw::c_uint = 34948; +pub const ETH_P_PAE: ::std::os::raw::c_uint = 34958; +pub const ETH_P_AOE: ::std::os::raw::c_uint = 34978; +pub const ETH_P_8021AD: ::std::os::raw::c_uint = 34984; +pub const ETH_P_802_EX1: ::std::os::raw::c_uint = 34997; +pub const ETH_P_TIPC: ::std::os::raw::c_uint = 35018; +pub const ETH_P_8021AH: ::std::os::raw::c_uint = 35047; +pub 
const ETH_P_MVRP: ::std::os::raw::c_uint = 35061; +pub const ETH_P_1588: ::std::os::raw::c_uint = 35063; +pub const ETH_P_PRP: ::std::os::raw::c_uint = 35067; +pub const ETH_P_FCOE: ::std::os::raw::c_uint = 35078; +pub const ETH_P_TDLS: ::std::os::raw::c_uint = 35085; +pub const ETH_P_FIP: ::std::os::raw::c_uint = 35092; +pub const ETH_P_80221: ::std::os::raw::c_uint = 35095; +pub const ETH_P_LOOPBACK: ::std::os::raw::c_uint = 36864; +pub const ETH_P_QINQ1: ::std::os::raw::c_uint = 37120; +pub const ETH_P_QINQ2: ::std::os::raw::c_uint = 37376; +pub const ETH_P_QINQ3: ::std::os::raw::c_uint = 37632; +pub const ETH_P_EDSA: ::std::os::raw::c_uint = 56026; +pub const ETH_P_AF_IUCV: ::std::os::raw::c_uint = 64507; +pub const ETH_P_802_3_MIN: ::std::os::raw::c_uint = 1536; +pub const ETH_P_802_3: ::std::os::raw::c_uint = 1; +pub const ETH_P_AX25: ::std::os::raw::c_uint = 2; +pub const ETH_P_ALL: ::std::os::raw::c_uint = 3; +pub const ETH_P_802_2: ::std::os::raw::c_uint = 4; +pub const ETH_P_SNAP: ::std::os::raw::c_uint = 5; +pub const ETH_P_DDCMP: ::std::os::raw::c_uint = 6; +pub const ETH_P_WAN_PPP: ::std::os::raw::c_uint = 7; +pub const ETH_P_PPP_MP: ::std::os::raw::c_uint = 8; +pub const ETH_P_LOCALTALK: ::std::os::raw::c_uint = 9; +pub const ETH_P_CAN: ::std::os::raw::c_uint = 12; +pub const ETH_P_CANFD: ::std::os::raw::c_uint = 13; +pub const ETH_P_PPPTALK: ::std::os::raw::c_uint = 16; +pub const ETH_P_TR_802_2: ::std::os::raw::c_uint = 17; +pub const ETH_P_MOBITEX: ::std::os::raw::c_uint = 21; +pub const ETH_P_CONTROL: ::std::os::raw::c_uint = 22; +pub const ETH_P_IRDA: ::std::os::raw::c_uint = 23; +pub const ETH_P_ECONET: ::std::os::raw::c_uint = 24; +pub const ETH_P_HDLC: ::std::os::raw::c_uint = 25; +pub const ETH_P_ARCNET: ::std::os::raw::c_uint = 26; +pub const ETH_P_DSA: ::std::os::raw::c_uint = 27; +pub const ETH_P_TRAILER: ::std::os::raw::c_uint = 28; +pub const ETH_P_PHONET: ::std::os::raw::c_uint = 245; +pub const ETH_P_IEEE802154: ::std::os::raw::c_uint 
= 246; +pub const ETH_P_CAIF: ::std::os::raw::c_uint = 247; +pub const ETH_P_XDSA: ::std::os::raw::c_uint = 248; +pub const BPF_LD: ::std::os::raw::c_uint = 0; +pub const BPF_LDX: ::std::os::raw::c_uint = 1; +pub const BPF_ST: ::std::os::raw::c_uint = 2; +pub const BPF_STX: ::std::os::raw::c_uint = 3; +pub const BPF_ALU: ::std::os::raw::c_uint = 4; +pub const BPF_JMP: ::std::os::raw::c_uint = 5; +pub const BPF_RET: ::std::os::raw::c_uint = 6; +pub const BPF_MISC: ::std::os::raw::c_uint = 7; +pub const BPF_W: ::std::os::raw::c_uint = 0; +pub const BPF_H: ::std::os::raw::c_uint = 8; +pub const BPF_B: ::std::os::raw::c_uint = 16; +pub const BPF_IMM: ::std::os::raw::c_uint = 0; +pub const BPF_ABS: ::std::os::raw::c_uint = 32; +pub const BPF_IND: ::std::os::raw::c_uint = 64; +pub const BPF_MEM: ::std::os::raw::c_uint = 96; +pub const BPF_LEN: ::std::os::raw::c_uint = 128; +pub const BPF_MSH: ::std::os::raw::c_uint = 160; +pub const BPF_ADD: ::std::os::raw::c_uint = 0; +pub const BPF_SUB: ::std::os::raw::c_uint = 16; +pub const BPF_MUL: ::std::os::raw::c_uint = 32; +pub const BPF_DIV: ::std::os::raw::c_uint = 48; +pub const BPF_OR: ::std::os::raw::c_uint = 64; +pub const BPF_AND: ::std::os::raw::c_uint = 80; +pub const BPF_LSH: ::std::os::raw::c_uint = 96; +pub const BPF_RSH: ::std::os::raw::c_uint = 112; +pub const BPF_NEG: ::std::os::raw::c_uint = 128; +pub const BPF_MOD: ::std::os::raw::c_uint = 144; +pub const BPF_XOR: ::std::os::raw::c_uint = 160; +pub const BPF_JA: ::std::os::raw::c_uint = 0; +pub const BPF_JEQ: ::std::os::raw::c_uint = 16; +pub const BPF_JGT: ::std::os::raw::c_uint = 32; +pub const BPF_JGE: ::std::os::raw::c_uint = 48; +pub const BPF_JSET: ::std::os::raw::c_uint = 64; +pub const BPF_K: ::std::os::raw::c_uint = 0; +pub const BPF_X: ::std::os::raw::c_uint = 8; +pub const BPF_MAXINSNS: ::std::os::raw::c_uint = 4096; +pub const BPF_MAJOR_VERSION: ::std::os::raw::c_uint = 1; +pub const BPF_MINOR_VERSION: ::std::os::raw::c_uint = 1; +pub const BPF_A: 
::std::os::raw::c_uint = 16; +pub const BPF_TAX: ::std::os::raw::c_uint = 0; +pub const BPF_TXA: ::std::os::raw::c_uint = 128; +pub const BPF_MEMWORDS: ::std::os::raw::c_uint = 16; +pub const SKF_AD_OFF: ::std::os::raw::c_int = -4096; +pub const SKF_AD_PROTOCOL: ::std::os::raw::c_uint = 0; +pub const SKF_AD_PKTTYPE: ::std::os::raw::c_uint = 4; +pub const SKF_AD_IFINDEX: ::std::os::raw::c_uint = 8; +pub const SKF_AD_NLATTR: ::std::os::raw::c_uint = 12; +pub const SKF_AD_NLATTR_NEST: ::std::os::raw::c_uint = 16; +pub const SKF_AD_MARK: ::std::os::raw::c_uint = 20; +pub const SKF_AD_QUEUE: ::std::os::raw::c_uint = 24; +pub const SKF_AD_HATYPE: ::std::os::raw::c_uint = 28; +pub const SKF_AD_RXHASH: ::std::os::raw::c_uint = 32; +pub const SKF_AD_CPU: ::std::os::raw::c_uint = 36; +pub const SKF_AD_ALU_XOR_X: ::std::os::raw::c_uint = 40; +pub const SKF_AD_VLAN_TAG: ::std::os::raw::c_uint = 44; +pub const SKF_AD_VLAN_TAG_PRESENT: ::std::os::raw::c_uint = 48; +pub const SKF_AD_PAY_OFFSET: ::std::os::raw::c_uint = 52; +pub const SKF_AD_RANDOM: ::std::os::raw::c_uint = 56; +pub const SKF_AD_VLAN_TPID: ::std::os::raw::c_uint = 60; +pub const SKF_AD_MAX: ::std::os::raw::c_uint = 64; +pub const SKF_NET_OFF: ::std::os::raw::c_int = -1048576; +pub const SKF_LL_OFF: ::std::os::raw::c_int = -2097152; +pub const BPF_NET_OFF: ::std::os::raw::c_int = -1048576; +pub const BPF_LL_OFF: ::std::os::raw::c_int = -2097152; +pub const TUN_READQ_SIZE: ::std::os::raw::c_uint = 500; +pub const TUN_TYPE_MASK: ::std::os::raw::c_uint = 15; +pub const IFF_TUN: ::std::os::raw::c_uint = 1; +pub const IFF_TAP: ::std::os::raw::c_uint = 2; +pub const IFF_NO_PI: ::std::os::raw::c_uint = 4096; +pub const IFF_ONE_QUEUE: ::std::os::raw::c_uint = 8192; +pub const IFF_VNET_HDR: ::std::os::raw::c_uint = 16384; +pub const IFF_TUN_EXCL: ::std::os::raw::c_uint = 32768; +pub const IFF_MULTI_QUEUE: ::std::os::raw::c_uint = 256; +pub const IFF_ATTACH_QUEUE: ::std::os::raw::c_uint = 512; +pub const IFF_DETACH_QUEUE: 
::std::os::raw::c_uint = 1024; +pub const IFF_PERSIST: ::std::os::raw::c_uint = 2048; +pub const IFF_NOFILTER: ::std::os::raw::c_uint = 4096; +pub const TUN_TX_TIMESTAMP: ::std::os::raw::c_uint = 1; +pub const TUN_F_CSUM: ::std::os::raw::c_uint = 1; +pub const TUN_F_TSO4: ::std::os::raw::c_uint = 2; +pub const TUN_F_TSO6: ::std::os::raw::c_uint = 4; +pub const TUN_F_TSO_ECN: ::std::os::raw::c_uint = 8; +pub const TUN_F_UFO: ::std::os::raw::c_uint = 16; +pub const TUN_PKT_STRIP: ::std::os::raw::c_uint = 1; +pub const TUN_FLT_ALLMULTI: ::std::os::raw::c_uint = 1; +pub type __s8 = ::std::os::raw::c_schar; +pub type __u8 = ::std::os::raw::c_uchar; +pub type __s16 = ::std::os::raw::c_short; +pub type __u16 = ::std::os::raw::c_ushort; +pub type __s32 = ::std::os::raw::c_int; +pub type __u32 = ::std::os::raw::c_uint; +pub type __s64 = ::std::os::raw::c_longlong; +pub type __u64 = ::std::os::raw::c_ulonglong; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __kernel_fd_set { + pub fds_bits: [::std::os::raw::c_ulong; 16usize], +} +#[test] +fn bindgen_test_layout___kernel_fd_set() { + assert_eq!( + ::std::mem::size_of::<__kernel_fd_set>(), + 128usize, + concat!("Size of: ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fd_set>(), + 8usize, + concat!("Alignment of ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_fd_set)).fds_bits as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_fd_set), + "::", + stringify!(fds_bits) + ) + ); +} +impl Clone for __kernel_fd_set { + fn clone(&self) -> Self { + *self + } +} +pub type __kernel_sighandler_t = + ::std::option::Option; +pub type __kernel_key_t = ::std::os::raw::c_int; +pub type __kernel_mqd_t = ::std::os::raw::c_int; +pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; +pub type __kernel_long_t 
= ::std::os::raw::c_long; +pub type __kernel_ulong_t = ::std::os::raw::c_ulong; +pub type __kernel_ino_t = __kernel_ulong_t; +pub type __kernel_mode_t = ::std::os::raw::c_uint; +pub type __kernel_pid_t = ::std::os::raw::c_int; +pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; +pub type __kernel_uid_t = ::std::os::raw::c_uint; +pub type __kernel_gid_t = ::std::os::raw::c_uint; +pub type __kernel_suseconds_t = __kernel_long_t; +pub type __kernel_daddr_t = ::std::os::raw::c_int; +pub type __kernel_uid32_t = ::std::os::raw::c_uint; +pub type __kernel_gid32_t = ::std::os::raw::c_uint; +pub type __kernel_size_t = __kernel_ulong_t; +pub type __kernel_ssize_t = __kernel_long_t; +pub type __kernel_ptrdiff_t = __kernel_long_t; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __kernel_fsid_t { + pub val: [::std::os::raw::c_int; 2usize], +} +#[test] +fn bindgen_test_layout___kernel_fsid_t() { + assert_eq!( + ::std::mem::size_of::<__kernel_fsid_t>(), + 8usize, + concat!("Size of: ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fsid_t>(), + 4usize, + concat!("Alignment of ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_fsid_t)).val as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_fsid_t), + "::", + stringify!(val) + ) + ); +} +impl Clone for __kernel_fsid_t { + fn clone(&self) -> Self { + *self + } +} +pub type __kernel_off_t = __kernel_long_t; +pub type __kernel_loff_t = ::std::os::raw::c_longlong; +pub type __kernel_time_t = __kernel_long_t; +pub type __kernel_clock_t = __kernel_long_t; +pub type __kernel_timer_t = ::std::os::raw::c_int; +pub type __kernel_clockid_t = ::std::os::raw::c_int; +pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; +pub type __kernel_uid16_t = ::std::os::raw::c_ushort; +pub type __kernel_gid16_t = ::std::os::raw::c_ushort; +pub type __le16 = __u16; +pub type __be16 = __u16; +pub type __le32 = __u32; +pub type 
__be32 = __u32; +pub type __le64 = __u64; +pub type __be64 = __u64; +pub type __sum16 = __u16; +pub type __wsum = __u32; +#[repr(C, packed)] +#[derive(Debug, Default, Copy)] +pub struct ethhdr { + pub h_dest: [::std::os::raw::c_uchar; 6usize], + pub h_source: [::std::os::raw::c_uchar; 6usize], + pub h_proto: __be16, +} +#[test] +fn bindgen_test_layout_ethhdr() { + assert_eq!( + ::std::mem::size_of::(), + 14usize, + concat!("Size of: ", stringify!(ethhdr)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(ethhdr)) + ); + let ethhdr_test = ethhdr::default(); + let p_ethhdr_test = ðhdr_test as *const ethhdr as usize; + assert_eq!( + std::ptr::addr_of!(ethhdr_test.h_dest) as usize - p_ethhdr_test, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ethhdr), + "::", + stringify!(h_dest) + ) + ); + assert_eq!( + std::ptr::addr_of!(ethhdr_test.h_source) as usize - p_ethhdr_test, + 6usize, + concat!( + "Alignment of field: ", + stringify!(ethhdr), + "::", + stringify!(h_source) + ) + ); + assert_eq!( + std::ptr::addr_of!(ethhdr_test.h_proto) as usize - p_ethhdr_test, + 12usize, + concat!( + "Alignment of field: ", + stringify!(ethhdr), + "::", + stringify!(h_proto) + ) + ); +} +impl Clone for ethhdr { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct sock_filter { + pub code: __u16, + pub jt: __u8, + pub jf: __u8, + pub k: __u32, +} +#[test] +fn bindgen_test_layout_sock_filter() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(sock_filter)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(sock_filter)) + ); + assert_eq!( + unsafe { &(*(0 as *const sock_filter)).code as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(sock_filter), + "::", + stringify!(code) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sock_filter)).jt as *const _ as usize }, + 2usize, + 
concat!( + "Alignment of field: ", + stringify!(sock_filter), + "::", + stringify!(jt) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sock_filter)).jf as *const _ as usize }, + 3usize, + concat!( + "Alignment of field: ", + stringify!(sock_filter), + "::", + stringify!(jf) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sock_filter)).k as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(sock_filter), + "::", + stringify!(k) + ) + ); +} +impl Clone for sock_filter { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Copy)] +pub struct sock_fprog { + pub len: ::std::os::raw::c_ushort, + pub filter: *mut sock_filter, +} +#[test] +fn bindgen_test_layout_sock_fprog() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(sock_fprog)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(sock_fprog)) + ); + assert_eq!( + unsafe { &(*(0 as *const sock_fprog)).len as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(sock_fprog), + "::", + stringify!(len) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sock_fprog)).filter as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(sock_fprog), + "::", + stringify!(filter) + ) + ); +} +impl Clone for sock_fprog { + fn clone(&self) -> Self { + *self + } +} +impl Default for sock_fprog { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct tun_pi { + pub flags: __u16, + pub proto: __be16, +} +#[test] +fn bindgen_test_layout_tun_pi() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(tun_pi)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(tun_pi)) + ); + assert_eq!( + unsafe { &(*(0 as *const tun_pi)).flags as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + 
stringify!(tun_pi), + "::", + stringify!(flags) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const tun_pi)).proto as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(tun_pi), + "::", + stringify!(proto) + ) + ); +} +impl Clone for tun_pi { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct tun_filter { + pub flags: __u16, + pub count: __u16, + pub addr: __IncompleteArrayField<[__u8; 6usize]>, +} +#[test] +fn bindgen_test_layout_tun_filter() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(tun_filter)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(tun_filter)) + ); + assert_eq!( + unsafe { &(*(0 as *const tun_filter)).flags as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(tun_filter), + "::", + stringify!(flags) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const tun_filter)).count as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(tun_filter), + "::", + stringify!(count) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const tun_filter)).addr as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(tun_filter), + "::", + stringify!(addr) + ) + ); +} +impl Clone for tun_filter { + fn clone(&self) -> Self { + *self + } +} diff --git a/src/dragonball/src/dbs_utils/src/net/net_gen/iff.rs b/src/dragonball/src/dbs_utils/src/net/net_gen/iff.rs new file mode 100644 index 000000000..9043cc6cb --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/net_gen/iff.rs @@ -0,0 +1,3266 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+// SPDX-License-Identifier: Apache-2.0 + +/* automatically generated by rust-bindgen */ + +#[repr(C)] +#[derive(Default)] +pub struct __IncompleteArrayField(::std::marker::PhantomData); +impl __IncompleteArrayField { + #[inline] + pub fn new() -> Self { + __IncompleteArrayField(::std::marker::PhantomData) + } + #[inline] + pub unsafe fn as_ptr(&self) -> *const T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_mut_ptr(&mut self) -> *mut T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_slice(&self, len: usize) -> &[T] { + ::std::slice::from_raw_parts(self.as_ptr(), len) + } + #[inline] + pub unsafe fn as_mut_slice(&mut self, len: usize) -> &mut [T] { + ::std::slice::from_raw_parts_mut(self.as_mut_ptr(), len) + } +} +impl ::std::fmt::Debug for __IncompleteArrayField { + fn fmt(&self, fmt: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + fmt.write_str("__IncompleteArrayField") + } +} +impl ::std::clone::Clone for __IncompleteArrayField { + #[inline] + fn clone(&self) -> Self { + Self::new() + } +} +impl ::std::marker::Copy for __IncompleteArrayField {} +#[repr(C)] +pub struct __BindgenUnionField(::std::marker::PhantomData); +impl __BindgenUnionField { + #[inline] + pub fn new() -> Self { + __BindgenUnionField(::std::marker::PhantomData) + } + #[inline] + pub unsafe fn as_ref(&self) -> &T { + ::std::mem::transmute(self) + } + #[inline] + pub unsafe fn as_mut(&mut self) -> &mut T { + ::std::mem::transmute(self) + } +} +impl ::std::default::Default for __BindgenUnionField { + #[inline] + fn default() -> Self { + Self::new() + } +} +impl ::std::clone::Clone for __BindgenUnionField { + #[inline] + fn clone(&self) -> Self { + Self::new() + } +} +impl ::std::marker::Copy for __BindgenUnionField {} +impl ::std::fmt::Debug for __BindgenUnionField { + fn fmt(&self, fmt: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + fmt.write_str("__BindgenUnionField") + } +} +pub const __UAPI_DEF_IN_ADDR: ::std::os::raw::c_uint = 1; +pub const 
__UAPI_DEF_IN_IPPROTO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN_PKTINFO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IP_MREQ: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_SOCKADDR_IN: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN_CLASS: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN6_ADDR: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN6_ADDR_ALT: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_SOCKADDR_IN6: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IPV6_MREQ: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IPPROTO_V6: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IPV6_OPTIONS: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN6_PKTINFO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IP6_MTUINFO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_XATTR: ::std::os::raw::c_uint = 1; +pub const __BITS_PER_LONG: ::std::os::raw::c_uint = 64; +pub const __FD_SETSIZE: ::std::os::raw::c_uint = 1024; +pub const _K_SS_MAXSIZE: ::std::os::raw::c_uint = 128; +pub const _SYS_SOCKET_H: ::std::os::raw::c_uint = 1; +pub const _FEATURES_H: ::std::os::raw::c_uint = 1; +pub const _DEFAULT_SOURCE: ::std::os::raw::c_uint = 1; +pub const __USE_ISOC11: ::std::os::raw::c_uint = 1; +pub const __USE_ISOC99: ::std::os::raw::c_uint = 1; +pub const __USE_ISOC95: ::std::os::raw::c_uint = 1; +pub const __USE_POSIX_IMPLICITLY: ::std::os::raw::c_uint = 1; +pub const _POSIX_SOURCE: ::std::os::raw::c_uint = 1; +pub const _POSIX_C_SOURCE: ::std::os::raw::c_uint = 200809; +pub const __USE_POSIX: ::std::os::raw::c_uint = 1; +pub const __USE_POSIX2: ::std::os::raw::c_uint = 1; +pub const __USE_POSIX199309: ::std::os::raw::c_uint = 1; +pub const __USE_POSIX199506: ::std::os::raw::c_uint = 1; +pub const __USE_XOPEN2K: ::std::os::raw::c_uint = 1; +pub const __USE_XOPEN2K8: ::std::os::raw::c_uint = 1; +pub const _ATFILE_SOURCE: ::std::os::raw::c_uint = 1; +pub const __USE_MISC: ::std::os::raw::c_uint = 1; +pub const __USE_ATFILE: 
::std::os::raw::c_uint = 1; +pub const __USE_FORTIFY_LEVEL: ::std::os::raw::c_uint = 0; +pub const _STDC_PREDEF_H: ::std::os::raw::c_uint = 1; +pub const __STDC_IEC_559__: ::std::os::raw::c_uint = 1; +pub const __STDC_IEC_559_COMPLEX__: ::std::os::raw::c_uint = 1; +pub const __STDC_ISO_10646__: ::std::os::raw::c_uint = 201505; +pub const __STDC_NO_THREADS__: ::std::os::raw::c_uint = 1; +pub const __GNU_LIBRARY__: ::std::os::raw::c_uint = 6; +pub const __GLIBC__: ::std::os::raw::c_uint = 2; +pub const __GLIBC_MINOR__: ::std::os::raw::c_uint = 23; +pub const _SYS_CDEFS_H: ::std::os::raw::c_uint = 1; +pub const __WORDSIZE: ::std::os::raw::c_uint = 64; +pub const __WORDSIZE_TIME64_COMPAT32: ::std::os::raw::c_uint = 1; +pub const __SYSCALL_WORDSIZE: ::std::os::raw::c_uint = 64; +pub const _SYS_UIO_H: ::std::os::raw::c_uint = 1; +pub const _SYS_TYPES_H: ::std::os::raw::c_uint = 1; +pub const _BITS_TYPES_H: ::std::os::raw::c_uint = 1; +pub const _BITS_TYPESIZES_H: ::std::os::raw::c_uint = 1; +pub const __OFF_T_MATCHES_OFF64_T: ::std::os::raw::c_uint = 1; +pub const __INO_T_MATCHES_INO64_T: ::std::os::raw::c_uint = 1; +pub const __clock_t_defined: ::std::os::raw::c_uint = 1; +pub const __time_t_defined: ::std::os::raw::c_uint = 1; +pub const __clockid_t_defined: ::std::os::raw::c_uint = 1; +pub const __timer_t_defined: ::std::os::raw::c_uint = 1; +pub const __BIT_TYPES_DEFINED__: ::std::os::raw::c_uint = 1; +pub const _ENDIAN_H: ::std::os::raw::c_uint = 1; +pub const __LITTLE_ENDIAN: ::std::os::raw::c_uint = 1234; +pub const __BIG_ENDIAN: ::std::os::raw::c_uint = 4321; +pub const __PDP_ENDIAN: ::std::os::raw::c_uint = 3412; +pub const __BYTE_ORDER: ::std::os::raw::c_uint = 1234; +pub const __FLOAT_WORD_ORDER: ::std::os::raw::c_uint = 1234; +pub const LITTLE_ENDIAN: ::std::os::raw::c_uint = 1234; +pub const BIG_ENDIAN: ::std::os::raw::c_uint = 4321; +pub const PDP_ENDIAN: ::std::os::raw::c_uint = 3412; +pub const BYTE_ORDER: ::std::os::raw::c_uint = 1234; +pub const 
_BITS_BYTESWAP_H: ::std::os::raw::c_uint = 1; +pub const _SYS_SELECT_H: ::std::os::raw::c_uint = 1; +pub const __FD_ZERO_STOS: &'static [u8; 6usize] = b"stosq\x00"; +pub const _SIGSET_H_types: ::std::os::raw::c_uint = 1; +pub const __timespec_defined: ::std::os::raw::c_uint = 1; +pub const _STRUCT_TIMEVAL: ::std::os::raw::c_uint = 1; +pub const FD_SETSIZE: ::std::os::raw::c_uint = 1024; +pub const _SYS_SYSMACROS_H: ::std::os::raw::c_uint = 1; +pub const _BITS_PTHREADTYPES_H: ::std::os::raw::c_uint = 1; +pub const __SIZEOF_PTHREAD_ATTR_T: ::std::os::raw::c_uint = 56; +pub const __SIZEOF_PTHREAD_MUTEX_T: ::std::os::raw::c_uint = 40; +pub const __SIZEOF_PTHREAD_MUTEXATTR_T: ::std::os::raw::c_uint = 4; +pub const __SIZEOF_PTHREAD_COND_T: ::std::os::raw::c_uint = 48; +pub const __SIZEOF_PTHREAD_CONDATTR_T: ::std::os::raw::c_uint = 4; +pub const __SIZEOF_PTHREAD_RWLOCK_T: ::std::os::raw::c_uint = 56; +pub const __SIZEOF_PTHREAD_RWLOCKATTR_T: ::std::os::raw::c_uint = 8; +pub const __SIZEOF_PTHREAD_BARRIER_T: ::std::os::raw::c_uint = 32; +pub const __SIZEOF_PTHREAD_BARRIERATTR_T: ::std::os::raw::c_uint = 4; +pub const __have_pthread_attr_t: ::std::os::raw::c_uint = 1; +pub const __PTHREAD_MUTEX_HAVE_PREV: ::std::os::raw::c_uint = 1; +pub const __PTHREAD_RWLOCK_INT_FLAGS_SHARED: ::std::os::raw::c_uint = 1; +pub const _BITS_UIO_H: ::std::os::raw::c_uint = 1; +pub const UIO_MAXIOV: ::std::os::raw::c_uint = 1024; +pub const PF_UNSPEC: ::std::os::raw::c_uint = 0; +pub const PF_LOCAL: ::std::os::raw::c_uint = 1; +pub const PF_UNIX: ::std::os::raw::c_uint = 1; +pub const PF_FILE: ::std::os::raw::c_uint = 1; +pub const PF_INET: ::std::os::raw::c_uint = 2; +pub const PF_AX25: ::std::os::raw::c_uint = 3; +pub const PF_IPX: ::std::os::raw::c_uint = 4; +pub const PF_APPLETALK: ::std::os::raw::c_uint = 5; +pub const PF_NETROM: ::std::os::raw::c_uint = 6; +pub const PF_BRIDGE: ::std::os::raw::c_uint = 7; +pub const PF_ATMPVC: ::std::os::raw::c_uint = 8; +pub const PF_X25: 
::std::os::raw::c_uint = 9; +pub const PF_INET6: ::std::os::raw::c_uint = 10; +pub const PF_ROSE: ::std::os::raw::c_uint = 11; +pub const PF_DECnet: ::std::os::raw::c_uint = 12; +pub const PF_NETBEUI: ::std::os::raw::c_uint = 13; +pub const PF_SECURITY: ::std::os::raw::c_uint = 14; +pub const PF_KEY: ::std::os::raw::c_uint = 15; +pub const PF_NETLINK: ::std::os::raw::c_uint = 16; +pub const PF_ROUTE: ::std::os::raw::c_uint = 16; +pub const PF_PACKET: ::std::os::raw::c_uint = 17; +pub const PF_ASH: ::std::os::raw::c_uint = 18; +pub const PF_ECONET: ::std::os::raw::c_uint = 19; +pub const PF_ATMSVC: ::std::os::raw::c_uint = 20; +pub const PF_RDS: ::std::os::raw::c_uint = 21; +pub const PF_SNA: ::std::os::raw::c_uint = 22; +pub const PF_IRDA: ::std::os::raw::c_uint = 23; +pub const PF_PPPOX: ::std::os::raw::c_uint = 24; +pub const PF_WANPIPE: ::std::os::raw::c_uint = 25; +pub const PF_LLC: ::std::os::raw::c_uint = 26; +pub const PF_IB: ::std::os::raw::c_uint = 27; +pub const PF_MPLS: ::std::os::raw::c_uint = 28; +pub const PF_CAN: ::std::os::raw::c_uint = 29; +pub const PF_TIPC: ::std::os::raw::c_uint = 30; +pub const PF_BLUETOOTH: ::std::os::raw::c_uint = 31; +pub const PF_IUCV: ::std::os::raw::c_uint = 32; +pub const PF_RXRPC: ::std::os::raw::c_uint = 33; +pub const PF_ISDN: ::std::os::raw::c_uint = 34; +pub const PF_PHONET: ::std::os::raw::c_uint = 35; +pub const PF_IEEE802154: ::std::os::raw::c_uint = 36; +pub const PF_CAIF: ::std::os::raw::c_uint = 37; +pub const PF_ALG: ::std::os::raw::c_uint = 38; +pub const PF_NFC: ::std::os::raw::c_uint = 39; +pub const PF_VSOCK: ::std::os::raw::c_uint = 40; +pub const PF_MAX: ::std::os::raw::c_uint = 41; +pub const AF_UNSPEC: ::std::os::raw::c_uint = 0; +pub const AF_LOCAL: ::std::os::raw::c_uint = 1; +pub const AF_UNIX: ::std::os::raw::c_uint = 1; +pub const AF_FILE: ::std::os::raw::c_uint = 1; +pub const AF_INET: ::std::os::raw::c_uint = 2; +pub const AF_AX25: ::std::os::raw::c_uint = 3; +pub const AF_IPX: 
::std::os::raw::c_uint = 4; +pub const AF_APPLETALK: ::std::os::raw::c_uint = 5; +pub const AF_NETROM: ::std::os::raw::c_uint = 6; +pub const AF_BRIDGE: ::std::os::raw::c_uint = 7; +pub const AF_ATMPVC: ::std::os::raw::c_uint = 8; +pub const AF_X25: ::std::os::raw::c_uint = 9; +pub const AF_INET6: ::std::os::raw::c_uint = 10; +pub const AF_ROSE: ::std::os::raw::c_uint = 11; +pub const AF_DECnet: ::std::os::raw::c_uint = 12; +pub const AF_NETBEUI: ::std::os::raw::c_uint = 13; +pub const AF_SECURITY: ::std::os::raw::c_uint = 14; +pub const AF_KEY: ::std::os::raw::c_uint = 15; +pub const AF_NETLINK: ::std::os::raw::c_uint = 16; +pub const AF_ROUTE: ::std::os::raw::c_uint = 16; +pub const AF_PACKET: ::std::os::raw::c_uint = 17; +pub const AF_ASH: ::std::os::raw::c_uint = 18; +pub const AF_ECONET: ::std::os::raw::c_uint = 19; +pub const AF_ATMSVC: ::std::os::raw::c_uint = 20; +pub const AF_RDS: ::std::os::raw::c_uint = 21; +pub const AF_SNA: ::std::os::raw::c_uint = 22; +pub const AF_IRDA: ::std::os::raw::c_uint = 23; +pub const AF_PPPOX: ::std::os::raw::c_uint = 24; +pub const AF_WANPIPE: ::std::os::raw::c_uint = 25; +pub const AF_LLC: ::std::os::raw::c_uint = 26; +pub const AF_IB: ::std::os::raw::c_uint = 27; +pub const AF_MPLS: ::std::os::raw::c_uint = 28; +pub const AF_CAN: ::std::os::raw::c_uint = 29; +pub const AF_TIPC: ::std::os::raw::c_uint = 30; +pub const AF_BLUETOOTH: ::std::os::raw::c_uint = 31; +pub const AF_IUCV: ::std::os::raw::c_uint = 32; +pub const AF_RXRPC: ::std::os::raw::c_uint = 33; +pub const AF_ISDN: ::std::os::raw::c_uint = 34; +pub const AF_PHONET: ::std::os::raw::c_uint = 35; +pub const AF_IEEE802154: ::std::os::raw::c_uint = 36; +pub const AF_CAIF: ::std::os::raw::c_uint = 37; +pub const AF_ALG: ::std::os::raw::c_uint = 38; +pub const AF_NFC: ::std::os::raw::c_uint = 39; +pub const AF_VSOCK: ::std::os::raw::c_uint = 40; +pub const AF_MAX: ::std::os::raw::c_uint = 41; +pub const SOL_RAW: ::std::os::raw::c_uint = 255; +pub const SOL_DECNET: 
::std::os::raw::c_uint = 261; +pub const SOL_X25: ::std::os::raw::c_uint = 262; +pub const SOL_PACKET: ::std::os::raw::c_uint = 263; +pub const SOL_ATM: ::std::os::raw::c_uint = 264; +pub const SOL_AAL: ::std::os::raw::c_uint = 265; +pub const SOL_IRDA: ::std::os::raw::c_uint = 266; +pub const SOMAXCONN: ::std::os::raw::c_uint = 128; +pub const _BITS_SOCKADDR_H: ::std::os::raw::c_uint = 1; +pub const _SS_SIZE: ::std::os::raw::c_uint = 128; +pub const FIOSETOWN: ::std::os::raw::c_uint = 35073; +pub const SIOCSPGRP: ::std::os::raw::c_uint = 35074; +pub const FIOGETOWN: ::std::os::raw::c_uint = 35075; +pub const SIOCGPGRP: ::std::os::raw::c_uint = 35076; +pub const SIOCATMARK: ::std::os::raw::c_uint = 35077; +pub const SIOCGSTAMP: ::std::os::raw::c_uint = 35078; +pub const SIOCGSTAMPNS: ::std::os::raw::c_uint = 35079; +pub const SOL_SOCKET: ::std::os::raw::c_uint = 1; +pub const SO_DEBUG: ::std::os::raw::c_uint = 1; +pub const SO_REUSEADDR: ::std::os::raw::c_uint = 2; +pub const SO_TYPE: ::std::os::raw::c_uint = 3; +pub const SO_ERROR: ::std::os::raw::c_uint = 4; +pub const SO_DONTROUTE: ::std::os::raw::c_uint = 5; +pub const SO_BROADCAST: ::std::os::raw::c_uint = 6; +pub const SO_SNDBUF: ::std::os::raw::c_uint = 7; +pub const SO_RCVBUF: ::std::os::raw::c_uint = 8; +pub const SO_SNDBUFFORCE: ::std::os::raw::c_uint = 32; +pub const SO_RCVBUFFORCE: ::std::os::raw::c_uint = 33; +pub const SO_KEEPALIVE: ::std::os::raw::c_uint = 9; +pub const SO_OOBINLINE: ::std::os::raw::c_uint = 10; +pub const SO_NO_CHECK: ::std::os::raw::c_uint = 11; +pub const SO_PRIORITY: ::std::os::raw::c_uint = 12; +pub const SO_LINGER: ::std::os::raw::c_uint = 13; +pub const SO_BSDCOMPAT: ::std::os::raw::c_uint = 14; +pub const SO_REUSEPORT: ::std::os::raw::c_uint = 15; +pub const SO_PASSCRED: ::std::os::raw::c_uint = 16; +pub const SO_PEERCRED: ::std::os::raw::c_uint = 17; +pub const SO_RCVLOWAT: ::std::os::raw::c_uint = 18; +pub const SO_SNDLOWAT: ::std::os::raw::c_uint = 19; +pub const 
SO_RCVTIMEO: ::std::os::raw::c_uint = 20; +pub const SO_SNDTIMEO: ::std::os::raw::c_uint = 21; +pub const SO_SECURITY_AUTHENTICATION: ::std::os::raw::c_uint = 22; +pub const SO_SECURITY_ENCRYPTION_TRANSPORT: ::std::os::raw::c_uint = 23; +pub const SO_SECURITY_ENCRYPTION_NETWORK: ::std::os::raw::c_uint = 24; +pub const SO_BINDTODEVICE: ::std::os::raw::c_uint = 25; +pub const SO_ATTACH_FILTER: ::std::os::raw::c_uint = 26; +pub const SO_DETACH_FILTER: ::std::os::raw::c_uint = 27; +pub const SO_GET_FILTER: ::std::os::raw::c_uint = 26; +pub const SO_PEERNAME: ::std::os::raw::c_uint = 28; +pub const SO_TIMESTAMP: ::std::os::raw::c_uint = 29; +pub const SCM_TIMESTAMP: ::std::os::raw::c_uint = 29; +pub const SO_ACCEPTCONN: ::std::os::raw::c_uint = 30; +pub const SO_PEERSEC: ::std::os::raw::c_uint = 31; +pub const SO_PASSSEC: ::std::os::raw::c_uint = 34; +pub const SO_TIMESTAMPNS: ::std::os::raw::c_uint = 35; +pub const SCM_TIMESTAMPNS: ::std::os::raw::c_uint = 35; +pub const SO_MARK: ::std::os::raw::c_uint = 36; +pub const SO_TIMESTAMPING: ::std::os::raw::c_uint = 37; +pub const SCM_TIMESTAMPING: ::std::os::raw::c_uint = 37; +pub const SO_PROTOCOL: ::std::os::raw::c_uint = 38; +pub const SO_DOMAIN: ::std::os::raw::c_uint = 39; +pub const SO_RXQ_OVFL: ::std::os::raw::c_uint = 40; +pub const SO_WIFI_STATUS: ::std::os::raw::c_uint = 41; +pub const SCM_WIFI_STATUS: ::std::os::raw::c_uint = 41; +pub const SO_PEEK_OFF: ::std::os::raw::c_uint = 42; +pub const SO_NOFCS: ::std::os::raw::c_uint = 43; +pub const SO_LOCK_FILTER: ::std::os::raw::c_uint = 44; +pub const SO_SELECT_ERR_QUEUE: ::std::os::raw::c_uint = 45; +pub const SO_BUSY_POLL: ::std::os::raw::c_uint = 46; +pub const SO_MAX_PACING_RATE: ::std::os::raw::c_uint = 47; +pub const SO_BPF_EXTENSIONS: ::std::os::raw::c_uint = 48; +pub const SO_INCOMING_CPU: ::std::os::raw::c_uint = 49; +pub const SO_ATTACH_BPF: ::std::os::raw::c_uint = 50; +pub const SO_DETACH_BPF: ::std::os::raw::c_uint = 27; +pub const IFNAMSIZ: 
::std::os::raw::c_uint = 16; +pub const IFALIASZ: ::std::os::raw::c_uint = 256; +pub const GENERIC_HDLC_VERSION: ::std::os::raw::c_uint = 4; +pub const CLOCK_DEFAULT: ::std::os::raw::c_uint = 0; +pub const CLOCK_EXT: ::std::os::raw::c_uint = 1; +pub const CLOCK_INT: ::std::os::raw::c_uint = 2; +pub const CLOCK_TXINT: ::std::os::raw::c_uint = 3; +pub const CLOCK_TXFROMRX: ::std::os::raw::c_uint = 4; +pub const ENCODING_DEFAULT: ::std::os::raw::c_uint = 0; +pub const ENCODING_NRZ: ::std::os::raw::c_uint = 1; +pub const ENCODING_NRZI: ::std::os::raw::c_uint = 2; +pub const ENCODING_FM_MARK: ::std::os::raw::c_uint = 3; +pub const ENCODING_FM_SPACE: ::std::os::raw::c_uint = 4; +pub const ENCODING_MANCHESTER: ::std::os::raw::c_uint = 5; +pub const PARITY_DEFAULT: ::std::os::raw::c_uint = 0; +pub const PARITY_NONE: ::std::os::raw::c_uint = 1; +pub const PARITY_CRC16_PR0: ::std::os::raw::c_uint = 2; +pub const PARITY_CRC16_PR1: ::std::os::raw::c_uint = 3; +pub const PARITY_CRC16_PR0_CCITT: ::std::os::raw::c_uint = 4; +pub const PARITY_CRC16_PR1_CCITT: ::std::os::raw::c_uint = 5; +pub const PARITY_CRC32_PR0_CCITT: ::std::os::raw::c_uint = 6; +pub const PARITY_CRC32_PR1_CCITT: ::std::os::raw::c_uint = 7; +pub const LMI_DEFAULT: ::std::os::raw::c_uint = 0; +pub const LMI_NONE: ::std::os::raw::c_uint = 1; +pub const LMI_ANSI: ::std::os::raw::c_uint = 2; +pub const LMI_CCITT: ::std::os::raw::c_uint = 3; +pub const LMI_CISCO: ::std::os::raw::c_uint = 4; +pub const IF_GET_IFACE: ::std::os::raw::c_uint = 1; +pub const IF_GET_PROTO: ::std::os::raw::c_uint = 2; +pub const IF_IFACE_V35: ::std::os::raw::c_uint = 4096; +pub const IF_IFACE_V24: ::std::os::raw::c_uint = 4097; +pub const IF_IFACE_X21: ::std::os::raw::c_uint = 4098; +pub const IF_IFACE_T1: ::std::os::raw::c_uint = 4099; +pub const IF_IFACE_E1: ::std::os::raw::c_uint = 4100; +pub const IF_IFACE_SYNC_SERIAL: ::std::os::raw::c_uint = 4101; +pub const IF_IFACE_X21D: ::std::os::raw::c_uint = 4102; +pub const IF_PROTO_HDLC: 
::std::os::raw::c_uint = 8192; +pub const IF_PROTO_PPP: ::std::os::raw::c_uint = 8193; +pub const IF_PROTO_CISCO: ::std::os::raw::c_uint = 8194; +pub const IF_PROTO_FR: ::std::os::raw::c_uint = 8195; +pub const IF_PROTO_FR_ADD_PVC: ::std::os::raw::c_uint = 8196; +pub const IF_PROTO_FR_DEL_PVC: ::std::os::raw::c_uint = 8197; +pub const IF_PROTO_X25: ::std::os::raw::c_uint = 8198; +pub const IF_PROTO_HDLC_ETH: ::std::os::raw::c_uint = 8199; +pub const IF_PROTO_FR_ADD_ETH_PVC: ::std::os::raw::c_uint = 8200; +pub const IF_PROTO_FR_DEL_ETH_PVC: ::std::os::raw::c_uint = 8201; +pub const IF_PROTO_FR_PVC: ::std::os::raw::c_uint = 8202; +pub const IF_PROTO_FR_ETH_PVC: ::std::os::raw::c_uint = 8203; +pub const IF_PROTO_RAW: ::std::os::raw::c_uint = 8204; +pub const IFHWADDRLEN: ::std::os::raw::c_uint = 6; +pub type __s8 = ::std::os::raw::c_schar; +pub type __u8 = ::std::os::raw::c_uchar; +pub type __s16 = ::std::os::raw::c_short; +pub type __u16 = ::std::os::raw::c_ushort; +pub type __s32 = ::std::os::raw::c_int; +pub type __u32 = ::std::os::raw::c_uint; +pub type __s64 = ::std::os::raw::c_longlong; +pub type __u64 = ::std::os::raw::c_ulonglong; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __kernel_fd_set { + pub fds_bits: [::std::os::raw::c_ulong; 16usize], +} +#[test] +fn bindgen_test_layout___kernel_fd_set() { + assert_eq!( + ::std::mem::size_of::<__kernel_fd_set>(), + 128usize, + concat!("Size of: ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fd_set>(), + 8usize, + concat!("Alignment of ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_fd_set)).fds_bits as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_fd_set), + "::", + stringify!(fds_bits) + ) + ); +} +impl Clone for __kernel_fd_set { + fn clone(&self) -> Self { + *self + } +} +pub type __kernel_sighandler_t = + ::std::option::Option; +pub type __kernel_key_t = ::std::os::raw::c_int; +pub 
type __kernel_mqd_t = ::std::os::raw::c_int; +pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; +pub type __kernel_long_t = ::std::os::raw::c_long; +pub type __kernel_ulong_t = ::std::os::raw::c_ulong; +pub type __kernel_ino_t = __kernel_ulong_t; +pub type __kernel_mode_t = ::std::os::raw::c_uint; +pub type __kernel_pid_t = ::std::os::raw::c_int; +pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; +pub type __kernel_uid_t = ::std::os::raw::c_uint; +pub type __kernel_gid_t = ::std::os::raw::c_uint; +pub type __kernel_suseconds_t = __kernel_long_t; +pub type __kernel_daddr_t = ::std::os::raw::c_int; +pub type __kernel_uid32_t = ::std::os::raw::c_uint; +pub type __kernel_gid32_t = ::std::os::raw::c_uint; +pub type __kernel_size_t = __kernel_ulong_t; +pub type __kernel_ssize_t = __kernel_long_t; +pub type __kernel_ptrdiff_t = __kernel_long_t; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __kernel_fsid_t { + pub val: [::std::os::raw::c_int; 2usize], +} +#[test] +fn bindgen_test_layout___kernel_fsid_t() { + assert_eq!( + ::std::mem::size_of::<__kernel_fsid_t>(), + 8usize, + concat!("Size of: ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fsid_t>(), + 4usize, + concat!("Alignment of ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_fsid_t)).val as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_fsid_t), + "::", + stringify!(val) + ) + ); +} +impl Clone for __kernel_fsid_t { + fn clone(&self) -> Self { + *self + } +} +pub type __kernel_off_t = __kernel_long_t; +pub type __kernel_loff_t = ::std::os::raw::c_longlong; +pub type __kernel_time_t = __kernel_long_t; +pub type __kernel_clock_t = __kernel_long_t; +pub type __kernel_timer_t = ::std::os::raw::c_int; +pub type __kernel_clockid_t = ::std::os::raw::c_int; +pub type 
__kernel_caddr_t = *mut ::std::os::raw::c_char; +pub type __kernel_uid16_t = ::std::os::raw::c_ushort; +pub type __kernel_gid16_t = ::std::os::raw::c_ushort; +pub type __le16 = __u16; +pub type __be16 = __u16; +pub type __le32 = __u32; +pub type __be32 = __u32; +pub type __le64 = __u64; +pub type __be64 = __u64; +pub type __sum16 = __u16; +pub type __wsum = __u32; +pub type __kernel_sa_family_t = ::std::os::raw::c_ushort; +#[repr(C)] +pub struct __kernel_sockaddr_storage { + pub ss_family: __kernel_sa_family_t, + pub __data: [::std::os::raw::c_char; 126usize], + pub __bindgen_align: [u64; 0usize], +} +#[test] +fn bindgen_test_layout___kernel_sockaddr_storage() { + assert_eq!( + ::std::mem::size_of::<__kernel_sockaddr_storage>(), + 128usize, + concat!("Size of: ", stringify!(__kernel_sockaddr_storage)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_sockaddr_storage>(), + 8usize, + concat!("Alignment of ", stringify!(__kernel_sockaddr_storage)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_sockaddr_storage)).ss_family as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_sockaddr_storage), + "::", + stringify!(ss_family) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_sockaddr_storage)).__data as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_sockaddr_storage), + "::", + stringify!(__data) + ) + ); +} +impl Default for __kernel_sockaddr_storage { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +pub type __u_char = ::std::os::raw::c_uchar; +pub type __u_short = ::std::os::raw::c_ushort; +pub type __u_int = ::std::os::raw::c_uint; +pub type __u_long = ::std::os::raw::c_ulong; +pub type __int8_t = ::std::os::raw::c_schar; +pub type __uint8_t = ::std::os::raw::c_uchar; +pub type __int16_t = ::std::os::raw::c_short; +pub type __uint16_t = ::std::os::raw::c_ushort; +pub type __int32_t = ::std::os::raw::c_int; +pub type __uint32_t = 
::std::os::raw::c_uint; +pub type __int64_t = ::std::os::raw::c_long; +pub type __uint64_t = ::std::os::raw::c_ulong; +pub type __quad_t = ::std::os::raw::c_long; +pub type __u_quad_t = ::std::os::raw::c_ulong; +pub type __dev_t = ::std::os::raw::c_ulong; +pub type __uid_t = ::std::os::raw::c_uint; +pub type __gid_t = ::std::os::raw::c_uint; +pub type __ino_t = ::std::os::raw::c_ulong; +pub type __ino64_t = ::std::os::raw::c_ulong; +pub type __mode_t = ::std::os::raw::c_uint; +pub type __nlink_t = ::std::os::raw::c_ulong; +pub type __off_t = ::std::os::raw::c_long; +pub type __off64_t = ::std::os::raw::c_long; +pub type __pid_t = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __fsid_t { + pub __val: [::std::os::raw::c_int; 2usize], +} +#[test] +fn bindgen_test_layout___fsid_t() { + assert_eq!( + ::std::mem::size_of::<__fsid_t>(), + 8usize, + concat!("Size of: ", stringify!(__fsid_t)) + ); + assert_eq!( + ::std::mem::align_of::<__fsid_t>(), + 4usize, + concat!("Alignment of ", stringify!(__fsid_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const __fsid_t)).__val as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__fsid_t), + "::", + stringify!(__val) + ) + ); +} +impl Clone for __fsid_t { + fn clone(&self) -> Self { + *self + } +} +pub type __clock_t = ::std::os::raw::c_long; +pub type __rlim_t = ::std::os::raw::c_ulong; +pub type __rlim64_t = ::std::os::raw::c_ulong; +pub type __id_t = ::std::os::raw::c_uint; +pub type __time_t = ::std::os::raw::c_long; +pub type __useconds_t = ::std::os::raw::c_uint; +pub type __suseconds_t = ::std::os::raw::c_long; +pub type __daddr_t = ::std::os::raw::c_int; +pub type __key_t = ::std::os::raw::c_int; +pub type __clockid_t = ::std::os::raw::c_int; +pub type __timer_t = *mut ::std::os::raw::c_void; +pub type __blksize_t = ::std::os::raw::c_long; +pub type __blkcnt_t = ::std::os::raw::c_long; +pub type __blkcnt64_t = ::std::os::raw::c_long; +pub type __fsblkcnt_t = 
::std::os::raw::c_ulong; +pub type __fsblkcnt64_t = ::std::os::raw::c_ulong; +pub type __fsfilcnt_t = ::std::os::raw::c_ulong; +pub type __fsfilcnt64_t = ::std::os::raw::c_ulong; +pub type __fsword_t = ::std::os::raw::c_long; +pub type __ssize_t = ::std::os::raw::c_long; +pub type __syscall_slong_t = ::std::os::raw::c_long; +pub type __syscall_ulong_t = ::std::os::raw::c_ulong; +pub type __loff_t = __off64_t; +pub type __qaddr_t = *mut __quad_t; +pub type __caddr_t = *mut ::std::os::raw::c_char; +pub type __intptr_t = ::std::os::raw::c_long; +pub type __socklen_t = ::std::os::raw::c_uint; +pub type u_char = __u_char; +pub type u_short = __u_short; +pub type u_int = __u_int; +pub type u_long = __u_long; +pub type quad_t = __quad_t; +pub type u_quad_t = __u_quad_t; +pub type fsid_t = __fsid_t; +pub type loff_t = __loff_t; +pub type ino_t = __ino_t; +pub type dev_t = __dev_t; +pub type gid_t = __gid_t; +pub type mode_t = __mode_t; +pub type nlink_t = __nlink_t; +pub type uid_t = __uid_t; +pub type off_t = __off_t; +pub type pid_t = __pid_t; +pub type id_t = __id_t; +pub type daddr_t = __daddr_t; +pub type caddr_t = __caddr_t; +pub type key_t = __key_t; +pub type clock_t = __clock_t; +pub type time_t = __time_t; +pub type clockid_t = __clockid_t; +pub type timer_t = __timer_t; +pub type ulong = ::std::os::raw::c_ulong; +pub type ushort = ::std::os::raw::c_ushort; +pub type uint = ::std::os::raw::c_uint; +pub type u_int8_t = ::std::os::raw::c_uchar; +pub type u_int16_t = ::std::os::raw::c_ushort; +pub type u_int32_t = ::std::os::raw::c_uint; +pub type u_int64_t = ::std::os::raw::c_ulong; +pub type register_t = ::std::os::raw::c_long; +pub type __sig_atomic_t = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __sigset_t { + pub __val: [::std::os::raw::c_ulong; 16usize], +} +#[test] +fn bindgen_test_layout___sigset_t() { + assert_eq!( + ::std::mem::size_of::<__sigset_t>(), + 128usize, + concat!("Size of: ", stringify!(__sigset_t)) + ); + 
assert_eq!( + ::std::mem::align_of::<__sigset_t>(), + 8usize, + concat!("Alignment of ", stringify!(__sigset_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const __sigset_t)).__val as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__sigset_t), + "::", + stringify!(__val) + ) + ); +} +impl Clone for __sigset_t { + fn clone(&self) -> Self { + *self + } +} +pub type sigset_t = __sigset_t; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct timespec { + pub tv_sec: __time_t, + pub tv_nsec: __syscall_slong_t, +} +#[test] +fn bindgen_test_layout_timespec() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(timespec)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(timespec)) + ); + assert_eq!( + unsafe { &(*(0 as *const timespec)).tv_sec as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(timespec), + "::", + stringify!(tv_sec) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const timespec)).tv_nsec as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(timespec), + "::", + stringify!(tv_nsec) + ) + ); +} +impl Clone for timespec { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct timeval { + pub tv_sec: __time_t, + pub tv_usec: __suseconds_t, +} +#[test] +fn bindgen_test_layout_timeval() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(timeval)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(timeval)) + ); + assert_eq!( + unsafe { &(*(0 as *const timeval)).tv_sec as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(timeval), + "::", + stringify!(tv_sec) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const timeval)).tv_usec as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(timeval), + "::", + 
stringify!(tv_usec) + ) + ); +} +impl Clone for timeval { + fn clone(&self) -> Self { + *self + } +} +pub type suseconds_t = __suseconds_t; +pub type __fd_mask = ::std::os::raw::c_long; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct fd_set { + pub __fds_bits: [__fd_mask; 16usize], +} +#[test] +fn bindgen_test_layout_fd_set() { + assert_eq!( + ::std::mem::size_of::(), + 128usize, + concat!("Size of: ", stringify!(fd_set)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(fd_set)) + ); + assert_eq!( + unsafe { &(*(0 as *const fd_set)).__fds_bits as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(fd_set), + "::", + stringify!(__fds_bits) + ) + ); +} +impl Clone for fd_set { + fn clone(&self) -> Self { + *self + } +} +pub type fd_mask = __fd_mask; +extern "C" { + pub fn select( + __nfds: ::std::os::raw::c_int, + __readfds: *mut fd_set, + __writefds: *mut fd_set, + __exceptfds: *mut fd_set, + __timeout: *mut timeval, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn pselect( + __nfds: ::std::os::raw::c_int, + __readfds: *mut fd_set, + __writefds: *mut fd_set, + __exceptfds: *mut fd_set, + __timeout: *const timespec, + __sigmask: *const __sigset_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gnu_dev_major(__dev: ::std::os::raw::c_ulonglong) -> ::std::os::raw::c_uint; +} +extern "C" { + pub fn gnu_dev_minor(__dev: ::std::os::raw::c_ulonglong) -> ::std::os::raw::c_uint; +} +extern "C" { + pub fn gnu_dev_makedev( + __major: ::std::os::raw::c_uint, + __minor: ::std::os::raw::c_uint, + ) -> ::std::os::raw::c_ulonglong; +} +pub type blksize_t = __blksize_t; +pub type blkcnt_t = __blkcnt_t; +pub type fsblkcnt_t = __fsblkcnt_t; +pub type fsfilcnt_t = __fsfilcnt_t; +pub type pthread_t = ::std::os::raw::c_ulong; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_attr_t { + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 56usize]>, + pub __align: 
__BindgenUnionField<::std::os::raw::c_long>, + pub bindgen_union_field: [u64; 7usize], +} +#[test] +fn bindgen_test_layout_pthread_attr_t() { + assert_eq!( + ::std::mem::size_of::(), + 56usize, + concat!("Size of: ", stringify!(pthread_attr_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_attr_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_attr_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_attr_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_attr_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_attr_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_attr_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Copy)] +pub struct __pthread_internal_list { + pub __prev: *mut __pthread_internal_list, + pub __next: *mut __pthread_internal_list, +} +#[test] +fn bindgen_test_layout___pthread_internal_list() { + assert_eq!( + ::std::mem::size_of::<__pthread_internal_list>(), + 16usize, + concat!("Size of: ", stringify!(__pthread_internal_list)) + ); + assert_eq!( + ::std::mem::align_of::<__pthread_internal_list>(), + 8usize, + concat!("Alignment of ", stringify!(__pthread_internal_list)) + ); + assert_eq!( + unsafe { &(*(0 as *const __pthread_internal_list)).__prev as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__pthread_internal_list), + "::", + stringify!(__prev) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const __pthread_internal_list)).__next as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(__pthread_internal_list), + "::", + stringify!(__next) + ) + ); +} +impl Clone for __pthread_internal_list { + fn clone(&self) -> Self { + *self + } +} +impl Default for __pthread_internal_list { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } 
+ } +} +pub type __pthread_list_t = __pthread_internal_list; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_mutex_t { + pub __data: __BindgenUnionField, + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 40usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_long>, + pub bindgen_union_field: [u64; 5usize], +} +#[repr(C)] +#[derive(Debug, Copy)] +pub struct pthread_mutex_t___pthread_mutex_s { + pub __lock: ::std::os::raw::c_int, + pub __count: ::std::os::raw::c_uint, + pub __owner: ::std::os::raw::c_int, + pub __nusers: ::std::os::raw::c_uint, + pub __kind: ::std::os::raw::c_int, + pub __spins: ::std::os::raw::c_short, + pub __elision: ::std::os::raw::c_short, + pub __list: __pthread_list_t, +} +#[test] +fn bindgen_test_layout_pthread_mutex_t___pthread_mutex_s() { + assert_eq!( + ::std::mem::size_of::(), + 40usize, + concat!("Size of: ", stringify!(pthread_mutex_t___pthread_mutex_s)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!( + "Alignment of ", + stringify!(pthread_mutex_t___pthread_mutex_s) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__lock as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__lock) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__count as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__count) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__owner as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__owner) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__nusers as *const _ as usize + }, + 12usize, + concat!( + "Alignment of field: ", + 
stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__nusers) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__kind as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__kind) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__spins as *const _ as usize }, + 20usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__spins) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__elision as *const _ as usize + }, + 22usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__elision) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t___pthread_mutex_s)).__list as *const _ as usize }, + 24usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t___pthread_mutex_s), + "::", + stringify!(__list) + ) + ); +} +impl Clone for pthread_mutex_t___pthread_mutex_s { + fn clone(&self) -> Self { + *self + } +} +impl Default for pthread_mutex_t___pthread_mutex_s { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[test] +fn bindgen_test_layout_pthread_mutex_t() { + assert_eq!( + ::std::mem::size_of::(), + 40usize, + concat!("Size of: ", stringify!(pthread_mutex_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_mutex_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t)).__data as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t), + "::", + stringify!(__data) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutex_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe 
{ &(*(0 as *const pthread_mutex_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutex_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_mutex_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_mutexattr_t { + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 4usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_int>, + pub bindgen_union_field: u32, +} +#[test] +fn bindgen_test_layout_pthread_mutexattr_t() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(pthread_mutexattr_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(pthread_mutexattr_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutexattr_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutexattr_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_mutexattr_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_mutexattr_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_mutexattr_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_cond_t { + pub __data: __BindgenUnionField, + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 48usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_longlong>, + pub bindgen_union_field: [u64; 6usize], +} +#[repr(C)] +#[derive(Debug, Copy)] +pub struct pthread_cond_t__bindgen_ty_1 { + pub __lock: ::std::os::raw::c_int, + pub __futex: ::std::os::raw::c_uint, + pub __total_seq: ::std::os::raw::c_ulonglong, + pub __wakeup_seq: ::std::os::raw::c_ulonglong, + pub __woken_seq: ::std::os::raw::c_ulonglong, + pub __mutex: *mut ::std::os::raw::c_void, + pub __nwaiters: ::std::os::raw::c_uint, + pub 
__broadcast_seq: ::std::os::raw::c_uint, +} +#[test] +fn bindgen_test_layout_pthread_cond_t__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 48usize, + concat!("Size of: ", stringify!(pthread_cond_t__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_cond_t__bindgen_ty_1)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__lock as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__lock) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__futex as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__futex) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__total_seq as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__total_seq) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__wakeup_seq as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__wakeup_seq) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__woken_seq as *const _ as usize }, + 24usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__woken_seq) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__mutex as *const _ as usize }, + 32usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__mutex) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__nwaiters as *const _ as usize }, + 40usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + 
stringify!(__nwaiters) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_cond_t__bindgen_ty_1)).__broadcast_seq as *const _ as usize + }, + 44usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t__bindgen_ty_1), + "::", + stringify!(__broadcast_seq) + ) + ); +} +impl Clone for pthread_cond_t__bindgen_ty_1 { + fn clone(&self) -> Self { + *self + } +} +impl Default for pthread_cond_t__bindgen_ty_1 { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[test] +fn bindgen_test_layout_pthread_cond_t() { + assert_eq!( + ::std::mem::size_of::(), + 48usize, + concat!("Size of: ", stringify!(pthread_cond_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_cond_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t)).__data as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t), + "::", + stringify!(__data) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_cond_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_cond_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_cond_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_condattr_t { + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 4usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_int>, + pub bindgen_union_field: u32, +} +#[test] +fn bindgen_test_layout_pthread_condattr_t() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(pthread_condattr_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(pthread_condattr_t)) + ); + assert_eq!( + 
unsafe { &(*(0 as *const pthread_condattr_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_condattr_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_condattr_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_condattr_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_condattr_t { + fn clone(&self) -> Self { + *self + } +} +pub type pthread_key_t = ::std::os::raw::c_uint; +pub type pthread_once_t = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_rwlock_t { + pub __data: __BindgenUnionField, + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 56usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_long>, + pub bindgen_union_field: [u64; 7usize], +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_rwlock_t__bindgen_ty_1 { + pub __lock: ::std::os::raw::c_int, + pub __nr_readers: ::std::os::raw::c_uint, + pub __readers_wakeup: ::std::os::raw::c_uint, + pub __writer_wakeup: ::std::os::raw::c_uint, + pub __nr_readers_queued: ::std::os::raw::c_uint, + pub __nr_writers_queued: ::std::os::raw::c_uint, + pub __writer: ::std::os::raw::c_int, + pub __shared: ::std::os::raw::c_int, + pub __rwelision: ::std::os::raw::c_schar, + pub __pad1: [::std::os::raw::c_uchar; 7usize], + pub __pad2: ::std::os::raw::c_ulong, + pub __flags: ::std::os::raw::c_uint, +} +#[test] +fn bindgen_test_layout_pthread_rwlock_t__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 56usize, + concat!("Size of: ", stringify!(pthread_rwlock_t__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_rwlock_t__bindgen_ty_1)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__lock as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + 
stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__lock) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__nr_readers as *const _ as usize + }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__nr_readers) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__readers_wakeup as *const _ as usize + }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__readers_wakeup) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__writer_wakeup as *const _ as usize + }, + 12usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__writer_wakeup) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__nr_readers_queued as *const _ + as usize + }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__nr_readers_queued) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__nr_writers_queued as *const _ + as usize + }, + 20usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__nr_writers_queued) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__writer as *const _ as usize }, + 24usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__writer) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__shared as *const _ as usize }, + 28usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__shared) + ) + ); + assert_eq!( + unsafe { + &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__rwelision as *const _ as usize + }, + 32usize, + 
concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__rwelision) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__pad1 as *const _ as usize }, + 33usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__pad1) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__pad2 as *const _ as usize }, + 40usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__pad2) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t__bindgen_ty_1)).__flags as *const _ as usize }, + 48usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t__bindgen_ty_1), + "::", + stringify!(__flags) + ) + ); +} +impl Clone for pthread_rwlock_t__bindgen_ty_1 { + fn clone(&self) -> Self { + *self + } +} +#[test] +fn bindgen_test_layout_pthread_rwlock_t() { + assert_eq!( + ::std::mem::size_of::(), + 56usize, + concat!("Size of: ", stringify!(pthread_rwlock_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_rwlock_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t)).__data as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t), + "::", + stringify!(__data) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlock_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlock_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_rwlock_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_rwlockattr_t { + 
pub __size: __BindgenUnionField<[::std::os::raw::c_char; 8usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_long>, + pub bindgen_union_field: u64, +} +#[test] +fn bindgen_test_layout_pthread_rwlockattr_t() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(pthread_rwlockattr_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_rwlockattr_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlockattr_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlockattr_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_rwlockattr_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_rwlockattr_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_rwlockattr_t { + fn clone(&self) -> Self { + *self + } +} +pub type pthread_spinlock_t = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_barrier_t { + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 32usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_long>, + pub bindgen_union_field: [u64; 4usize], +} +#[test] +fn bindgen_test_layout_pthread_barrier_t() { + assert_eq!( + ::std::mem::size_of::(), + 32usize, + concat!("Size of: ", stringify!(pthread_barrier_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(pthread_barrier_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_barrier_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_barrier_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_barrier_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_barrier_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for 
pthread_barrier_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct pthread_barrierattr_t { + pub __size: __BindgenUnionField<[::std::os::raw::c_char; 4usize]>, + pub __align: __BindgenUnionField<::std::os::raw::c_int>, + pub bindgen_union_field: u32, +} +#[test] +fn bindgen_test_layout_pthread_barrierattr_t() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(pthread_barrierattr_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(pthread_barrierattr_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_barrierattr_t)).__size as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_barrierattr_t), + "::", + stringify!(__size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const pthread_barrierattr_t)).__align as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(pthread_barrierattr_t), + "::", + stringify!(__align) + ) + ); +} +impl Clone for pthread_barrierattr_t { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Copy)] +pub struct iovec { + pub iov_base: *mut ::std::os::raw::c_void, + pub iov_len: usize, +} +#[test] +fn bindgen_test_layout_iovec() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(iovec)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(iovec)) + ); + assert_eq!( + unsafe { &(*(0 as *const iovec)).iov_base as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(iovec), + "::", + stringify!(iov_base) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const iovec)).iov_len as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(iovec), + "::", + stringify!(iov_len) + ) + ); +} +impl Clone for iovec { + fn clone(&self) -> Self { + *self + } +} +impl Default for iovec { + fn default() -> Self { + 
unsafe { ::std::mem::zeroed() } + } +} +extern "C" { + pub fn readv( + __fd: ::std::os::raw::c_int, + __iovec: *const iovec, + __count: ::std::os::raw::c_int, + ) -> isize; +} +extern "C" { + pub fn writev( + __fd: ::std::os::raw::c_int, + __iovec: *const iovec, + __count: ::std::os::raw::c_int, + ) -> isize; +} +extern "C" { + pub fn preadv( + __fd: ::std::os::raw::c_int, + __iovec: *const iovec, + __count: ::std::os::raw::c_int, + __offset: __off_t, + ) -> isize; +} +extern "C" { + pub fn pwritev( + __fd: ::std::os::raw::c_int, + __iovec: *const iovec, + __count: ::std::os::raw::c_int, + __offset: __off_t, + ) -> isize; +} +pub type socklen_t = __socklen_t; +pub const __socket_type_SOCK_STREAM: __socket_type = 1; +pub const __socket_type_SOCK_DGRAM: __socket_type = 2; +pub const __socket_type_SOCK_RAW: __socket_type = 3; +pub const __socket_type_SOCK_RDM: __socket_type = 4; +pub const __socket_type_SOCK_SEQPACKET: __socket_type = 5; +pub const __socket_type_SOCK_DCCP: __socket_type = 6; +pub const __socket_type_SOCK_PACKET: __socket_type = 10; +pub const __socket_type_SOCK_CLOEXEC: __socket_type = 524288; +pub const __socket_type_SOCK_NONBLOCK: __socket_type = 2048; +pub type __socket_type = ::std::os::raw::c_uint; +pub type sa_family_t = ::std::os::raw::c_ushort; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct sockaddr { + pub sa_family: sa_family_t, + pub sa_data: [::std::os::raw::c_char; 14usize], +} +#[test] +fn bindgen_test_layout_sockaddr() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(sockaddr)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(sockaddr)) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr)).sa_family as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr), + "::", + stringify!(sa_family) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr)).sa_data as *const _ as usize }, + 2usize, + concat!( + 
"Alignment of field: ", + stringify!(sockaddr), + "::", + stringify!(sa_data) + ) + ); +} +impl Clone for sockaddr { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +pub struct sockaddr_storage { + pub ss_family: sa_family_t, + pub __ss_padding: [::std::os::raw::c_char; 118usize], + pub __ss_align: ::std::os::raw::c_ulong, +} +#[test] +fn bindgen_test_layout_sockaddr_storage() { + assert_eq!( + ::std::mem::size_of::(), + 128usize, + concat!("Size of: ", stringify!(sockaddr_storage)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(sockaddr_storage)) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_storage)).ss_family as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_storage), + "::", + stringify!(ss_family) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_storage)).__ss_padding as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_storage), + "::", + stringify!(__ss_padding) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_storage)).__ss_align as *const _ as usize }, + 120usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_storage), + "::", + stringify!(__ss_align) + ) + ); +} +impl Default for sockaddr_storage { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +pub const MSG_OOB: _bindgen_ty_1 = 1; +pub const MSG_PEEK: _bindgen_ty_1 = 2; +pub const MSG_DONTROUTE: _bindgen_ty_1 = 4; +pub const MSG_CTRUNC: _bindgen_ty_1 = 8; +pub const MSG_PROXY: _bindgen_ty_1 = 16; +pub const MSG_TRUNC: _bindgen_ty_1 = 32; +pub const MSG_DONTWAIT: _bindgen_ty_1 = 64; +pub const MSG_EOR: _bindgen_ty_1 = 128; +pub const MSG_WAITALL: _bindgen_ty_1 = 256; +pub const MSG_FIN: _bindgen_ty_1 = 512; +pub const MSG_SYN: _bindgen_ty_1 = 1024; +pub const MSG_CONFIRM: _bindgen_ty_1 = 2048; +pub const MSG_RST: _bindgen_ty_1 = 4096; +pub const MSG_ERRQUEUE: _bindgen_ty_1 = 8192; +pub const MSG_NOSIGNAL: 
_bindgen_ty_1 = 16384; +pub const MSG_MORE: _bindgen_ty_1 = 32768; +pub const MSG_WAITFORONE: _bindgen_ty_1 = 65536; +pub const MSG_FASTOPEN: _bindgen_ty_1 = 536870912; +pub const MSG_CMSG_CLOEXEC: _bindgen_ty_1 = 1073741824; +pub type _bindgen_ty_1 = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Copy)] +pub struct msghdr { + pub msg_name: *mut ::std::os::raw::c_void, + pub msg_namelen: socklen_t, + pub msg_iov: *mut iovec, + pub msg_iovlen: usize, + pub msg_control: *mut ::std::os::raw::c_void, + pub msg_controllen: usize, + pub msg_flags: ::std::os::raw::c_int, +} +#[test] +fn bindgen_test_layout_msghdr() { + assert_eq!( + ::std::mem::size_of::(), + 56usize, + concat!("Size of: ", stringify!(msghdr)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(msghdr)) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_name as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + stringify!(msg_name) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_namelen as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + stringify!(msg_namelen) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_iov as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + stringify!(msg_iov) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_iovlen as *const _ as usize }, + 24usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + stringify!(msg_iovlen) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_control as *const _ as usize }, + 32usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + stringify!(msg_control) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_controllen as *const _ as usize }, + 40usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + 
stringify!(msg_controllen) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const msghdr)).msg_flags as *const _ as usize }, + 48usize, + concat!( + "Alignment of field: ", + stringify!(msghdr), + "::", + stringify!(msg_flags) + ) + ); +} +impl Clone for msghdr { + fn clone(&self) -> Self { + *self + } +} +impl Default for msghdr { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct cmsghdr { + pub cmsg_len: usize, + pub cmsg_level: ::std::os::raw::c_int, + pub cmsg_type: ::std::os::raw::c_int, + pub __cmsg_data: __IncompleteArrayField<::std::os::raw::c_uchar>, +} +#[test] +fn bindgen_test_layout_cmsghdr() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(cmsghdr)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(cmsghdr)) + ); +} +impl Clone for cmsghdr { + fn clone(&self) -> Self { + *self + } +} +extern "C" { + pub fn __cmsg_nxthdr(__mhdr: *mut msghdr, __cmsg: *mut cmsghdr) -> *mut cmsghdr; +} +pub const SCM_RIGHTS: _bindgen_ty_2 = 1; +pub type _bindgen_ty_2 = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct linger { + pub l_onoff: ::std::os::raw::c_int, + pub l_linger: ::std::os::raw::c_int, +} +#[test] +fn bindgen_test_layout_linger() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(linger)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(linger)) + ); + assert_eq!( + unsafe { &(*(0 as *const linger)).l_onoff as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(linger), + "::", + stringify!(l_onoff) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const linger)).l_linger as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(linger), + "::", + stringify!(l_linger) + ) + ); +} +impl Clone for linger { + fn clone(&self) -> Self { + *self + } +} 
+#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct osockaddr { + pub sa_family: ::std::os::raw::c_ushort, + pub sa_data: [::std::os::raw::c_uchar; 14usize], +} +#[test] +fn bindgen_test_layout_osockaddr() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(osockaddr)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(osockaddr)) + ); + assert_eq!( + unsafe { &(*(0 as *const osockaddr)).sa_family as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(osockaddr), + "::", + stringify!(sa_family) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const osockaddr)).sa_data as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(osockaddr), + "::", + stringify!(sa_data) + ) + ); +} +impl Clone for osockaddr { + fn clone(&self) -> Self { + *self + } +} +pub const SHUT_RD: _bindgen_ty_3 = 0; +pub const SHUT_WR: _bindgen_ty_3 = 1; +pub const SHUT_RDWR: _bindgen_ty_3 = 2; +pub type _bindgen_ty_3 = ::std::os::raw::c_uint; +extern "C" { + pub fn socket( + __domain: ::std::os::raw::c_int, + __type: ::std::os::raw::c_int, + __protocol: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn socketpair( + __domain: ::std::os::raw::c_int, + __type: ::std::os::raw::c_int, + __protocol: ::std::os::raw::c_int, + __fds: *mut ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn bind( + __fd: ::std::os::raw::c_int, + __addr: *const sockaddr, + __len: socklen_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn getsockname( + __fd: ::std::os::raw::c_int, + __addr: *mut sockaddr, + __len: *mut socklen_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn connect( + __fd: ::std::os::raw::c_int, + __addr: *const sockaddr, + __len: socklen_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn getpeername( + __fd: ::std::os::raw::c_int, + __addr: *mut sockaddr, + __len: *mut socklen_t, + ) -> 
::std::os::raw::c_int; +} +extern "C" { + pub fn send( + __fd: ::std::os::raw::c_int, + __buf: *const ::std::os::raw::c_void, + __n: usize, + __flags: ::std::os::raw::c_int, + ) -> isize; +} +extern "C" { + pub fn recv( + __fd: ::std::os::raw::c_int, + __buf: *mut ::std::os::raw::c_void, + __n: usize, + __flags: ::std::os::raw::c_int, + ) -> isize; +} +extern "C" { + pub fn sendto( + __fd: ::std::os::raw::c_int, + __buf: *const ::std::os::raw::c_void, + __n: usize, + __flags: ::std::os::raw::c_int, + __addr: *const sockaddr, + __addr_len: socklen_t, + ) -> isize; +} +extern "C" { + pub fn recvfrom( + __fd: ::std::os::raw::c_int, + __buf: *mut ::std::os::raw::c_void, + __n: usize, + __flags: ::std::os::raw::c_int, + __addr: *mut sockaddr, + __addr_len: *mut socklen_t, + ) -> isize; +} +extern "C" { + pub fn sendmsg( + __fd: ::std::os::raw::c_int, + __message: *const msghdr, + __flags: ::std::os::raw::c_int, + ) -> isize; +} +extern "C" { + pub fn recvmsg( + __fd: ::std::os::raw::c_int, + __message: *mut msghdr, + __flags: ::std::os::raw::c_int, + ) -> isize; +} +extern "C" { + pub fn getsockopt( + __fd: ::std::os::raw::c_int, + __level: ::std::os::raw::c_int, + __optname: ::std::os::raw::c_int, + __optval: *mut ::std::os::raw::c_void, + __optlen: *mut socklen_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn setsockopt( + __fd: ::std::os::raw::c_int, + __level: ::std::os::raw::c_int, + __optname: ::std::os::raw::c_int, + __optval: *const ::std::os::raw::c_void, + __optlen: socklen_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn listen(__fd: ::std::os::raw::c_int, __n: ::std::os::raw::c_int) + -> ::std::os::raw::c_int; +} +extern "C" { + pub fn accept( + __fd: ::std::os::raw::c_int, + __addr: *mut sockaddr, + __addr_len: *mut socklen_t, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn shutdown( + __fd: ::std::os::raw::c_int, + __how: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn sockatmark(__fd: 
::std::os::raw::c_int) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn isfdtype( + __fd: ::std::os::raw::c_int, + __fdtype: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct sync_serial_settings { + pub clock_rate: ::std::os::raw::c_uint, + pub clock_type: ::std::os::raw::c_uint, + pub loopback: ::std::os::raw::c_ushort, +} +#[test] +fn bindgen_test_layout_sync_serial_settings() { + assert_eq!( + ::std::mem::size_of::(), + 12usize, + concat!("Size of: ", stringify!(sync_serial_settings)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(sync_serial_settings)) + ); + assert_eq!( + unsafe { &(*(0 as *const sync_serial_settings)).clock_rate as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(sync_serial_settings), + "::", + stringify!(clock_rate) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sync_serial_settings)).clock_type as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(sync_serial_settings), + "::", + stringify!(clock_type) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sync_serial_settings)).loopback as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(sync_serial_settings), + "::", + stringify!(loopback) + ) + ); +} +impl Clone for sync_serial_settings { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct te1_settings { + pub clock_rate: ::std::os::raw::c_uint, + pub clock_type: ::std::os::raw::c_uint, + pub loopback: ::std::os::raw::c_ushort, + pub slot_map: ::std::os::raw::c_uint, +} +#[test] +fn bindgen_test_layout_te1_settings() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(te1_settings)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(te1_settings)) + ); + assert_eq!( + unsafe { &(*(0 as *const 
te1_settings)).clock_rate as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(te1_settings), + "::", + stringify!(clock_rate) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const te1_settings)).clock_type as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(te1_settings), + "::", + stringify!(clock_type) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const te1_settings)).loopback as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(te1_settings), + "::", + stringify!(loopback) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const te1_settings)).slot_map as *const _ as usize }, + 12usize, + concat!( + "Alignment of field: ", + stringify!(te1_settings), + "::", + stringify!(slot_map) + ) + ); +} +impl Clone for te1_settings { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct raw_hdlc_proto { + pub encoding: ::std::os::raw::c_ushort, + pub parity: ::std::os::raw::c_ushort, +} +#[test] +fn bindgen_test_layout_raw_hdlc_proto() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(raw_hdlc_proto)) + ); + assert_eq!( + ::std::mem::align_of::(), + 2usize, + concat!("Alignment of ", stringify!(raw_hdlc_proto)) + ); + assert_eq!( + unsafe { &(*(0 as *const raw_hdlc_proto)).encoding as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(raw_hdlc_proto), + "::", + stringify!(encoding) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const raw_hdlc_proto)).parity as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(raw_hdlc_proto), + "::", + stringify!(parity) + ) + ); +} +impl Clone for raw_hdlc_proto { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct fr_proto { + pub t391: ::std::os::raw::c_uint, + pub t392: ::std::os::raw::c_uint, + pub n391: ::std::os::raw::c_uint, + pub n392: ::std::os::raw::c_uint, + pub 
n393: ::std::os::raw::c_uint, + pub lmi: ::std::os::raw::c_ushort, + pub dce: ::std::os::raw::c_ushort, +} +#[test] +fn bindgen_test_layout_fr_proto() { + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(fr_proto)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(fr_proto)) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).t391 as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(t391) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).t392 as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(t392) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).n391 as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(n391) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).n392 as *const _ as usize }, + 12usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(n392) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).n393 as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(n393) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).lmi as *const _ as usize }, + 20usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(lmi) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto)).dce as *const _ as usize }, + 22usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto), + "::", + stringify!(dce) + ) + ); +} +impl Clone for fr_proto { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct fr_proto_pvc { + pub dlci: ::std::os::raw::c_uint, +} +#[test] +fn bindgen_test_layout_fr_proto_pvc() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", 
stringify!(fr_proto_pvc)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(fr_proto_pvc)) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto_pvc)).dlci as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto_pvc), + "::", + stringify!(dlci) + ) + ); +} +impl Clone for fr_proto_pvc { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct fr_proto_pvc_info { + pub dlci: ::std::os::raw::c_uint, + pub master: [::std::os::raw::c_char; 16usize], +} +#[test] +fn bindgen_test_layout_fr_proto_pvc_info() { + assert_eq!( + ::std::mem::size_of::(), + 20usize, + concat!("Size of: ", stringify!(fr_proto_pvc_info)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(fr_proto_pvc_info)) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto_pvc_info)).dlci as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto_pvc_info), + "::", + stringify!(dlci) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const fr_proto_pvc_info)).master as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(fr_proto_pvc_info), + "::", + stringify!(master) + ) + ); +} +impl Clone for fr_proto_pvc_info { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct cisco_proto { + pub interval: ::std::os::raw::c_uint, + pub timeout: ::std::os::raw::c_uint, +} +#[test] +fn bindgen_test_layout_cisco_proto() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(cisco_proto)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(cisco_proto)) + ); + assert_eq!( + unsafe { &(*(0 as *const cisco_proto)).interval as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(cisco_proto), + "::", + stringify!(interval) + ) + ); + assert_eq!( + unsafe { 
&(*(0 as *const cisco_proto)).timeout as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(cisco_proto), + "::", + stringify!(timeout) + ) + ); +} +impl Clone for cisco_proto { + fn clone(&self) -> Self { + *self + } +} +pub const net_device_flags_IFF_UP: net_device_flags = 1; +pub const net_device_flags_IFF_BROADCAST: net_device_flags = 2; +pub const net_device_flags_IFF_DEBUG: net_device_flags = 4; +pub const net_device_flags_IFF_LOOPBACK: net_device_flags = 8; +pub const net_device_flags_IFF_POINTOPOINT: net_device_flags = 16; +pub const net_device_flags_IFF_NOTRAILERS: net_device_flags = 32; +pub const net_device_flags_IFF_RUNNING: net_device_flags = 64; +pub const net_device_flags_IFF_NOARP: net_device_flags = 128; +pub const net_device_flags_IFF_PROMISC: net_device_flags = 256; +pub const net_device_flags_IFF_ALLMULTI: net_device_flags = 512; +pub const net_device_flags_IFF_MASTER: net_device_flags = 1024; +pub const net_device_flags_IFF_SLAVE: net_device_flags = 2048; +pub const net_device_flags_IFF_MULTICAST: net_device_flags = 4096; +pub const net_device_flags_IFF_PORTSEL: net_device_flags = 8192; +pub const net_device_flags_IFF_AUTOMEDIA: net_device_flags = 16384; +pub const net_device_flags_IFF_DYNAMIC: net_device_flags = 32768; +/** + * enum net_device_flags - &struct net_device flags + * + * These are the &struct net_device flags, they can be set by drivers, the + * kernel and some can be triggered by userspace. Userspace can query and + * set these flags using userspace utilities but there is also a sysfs + * entry available for all dev flags which can be queried and set. These flags + * are shared for all types of net_devices. The sysfs entries are available + * via /sys/class/net//flags. Flags which can be toggled through sysfs + * are annotated below, note that only a few flags can be toggled and some + * other flags are always preserved from the original net_device flags + * even if you try to set them via sysfs. 
Flags which are always preserved + * are kept under the flag grouping @IFF_VOLATILE. Flags which are __volatile__ + * are annotated below as such. + * + * You should have a pretty good reason to be extending these flags. + * + * @IFF_UP: interface is up. Can be toggled through sysfs. + * @IFF_BROADCAST: broadcast address valid. Volatile. + * @IFF_DEBUG: turn on debugging. Can be toggled through sysfs. + * @IFF_LOOPBACK: is a loopback net. Volatile. + * @IFF_POINTOPOINT: interface is has p-p link. Volatile. + * @IFF_NOTRAILERS: avoid use of trailers. Can be toggled through sysfs. + * Volatile. + * @IFF_RUNNING: interface RFC2863 OPER_UP. Volatile. + * @IFF_NOARP: no ARP protocol. Can be toggled through sysfs. Volatile. + * @IFF_PROMISC: receive all packets. Can be toggled through sysfs. + * @IFF_ALLMULTI: receive all multicast packets. Can be toggled through + * sysfs. + * @IFF_MASTER: master of a load balancer. Volatile. + * @IFF_SLAVE: slave of a load balancer. Volatile. + * @IFF_MULTICAST: Supports multicast. Can be toggled through sysfs. + * @IFF_PORTSEL: can set media type. Can be toggled through sysfs. + * @IFF_AUTOMEDIA: auto media select active. Can be toggled through sysfs. + * @IFF_DYNAMIC: dialup device with changing addresses. Can be toggled + * through sysfs. + * @IFF_LOWER_UP: driver signals L1 up. Volatile. + * @IFF_DORMANT: driver signals dormant. Volatile. + * @IFF_ECHO: echo sent packets. Volatile. 
+ */ +pub type net_device_flags = ::std::os::raw::c_uint; +pub const IF_OPER_UNKNOWN: _bindgen_ty_4 = 0; +pub const IF_OPER_NOTPRESENT: _bindgen_ty_4 = 1; +pub const IF_OPER_DOWN: _bindgen_ty_4 = 2; +pub const IF_OPER_LOWERLAYERDOWN: _bindgen_ty_4 = 3; +pub const IF_OPER_TESTING: _bindgen_ty_4 = 4; +pub const IF_OPER_DORMANT: _bindgen_ty_4 = 5; +pub const IF_OPER_UP: _bindgen_ty_4 = 6; +pub type _bindgen_ty_4 = ::std::os::raw::c_uint; +pub const IF_LINK_MODE_DEFAULT: _bindgen_ty_5 = 0; +pub const IF_LINK_MODE_DORMANT: _bindgen_ty_5 = 1; +pub type _bindgen_ty_5 = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ifmap { + pub mem_start: ::std::os::raw::c_ulong, + pub mem_end: ::std::os::raw::c_ulong, + pub base_addr: ::std::os::raw::c_ushort, + pub irq: ::std::os::raw::c_uchar, + pub dma: ::std::os::raw::c_uchar, + pub port: ::std::os::raw::c_uchar, +} +#[test] +fn bindgen_test_layout_ifmap() { + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(ifmap)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ifmap)) + ); + assert_eq!( + unsafe { &(*(0 as *const ifmap)).mem_start as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifmap), + "::", + stringify!(mem_start) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifmap)).mem_end as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(ifmap), + "::", + stringify!(mem_end) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifmap)).base_addr as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(ifmap), + "::", + stringify!(base_addr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifmap)).irq as *const _ as usize }, + 18usize, + concat!( + "Alignment of field: ", + stringify!(ifmap), + "::", + stringify!(irq) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifmap)).dma as *const _ as usize }, + 19usize, + concat!( + 
"Alignment of field: ", + stringify!(ifmap), + "::", + stringify!(dma) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifmap)).port as *const _ as usize }, + 20usize, + concat!( + "Alignment of field: ", + stringify!(ifmap), + "::", + stringify!(port) + ) + ); +} +impl Clone for ifmap { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct if_settings { + pub type_: ::std::os::raw::c_uint, + pub size: ::std::os::raw::c_uint, + pub ifs_ifsu: if_settings__bindgen_ty_1, +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct if_settings__bindgen_ty_1 { + pub raw_hdlc: __BindgenUnionField<*mut raw_hdlc_proto>, + pub cisco: __BindgenUnionField<*mut cisco_proto>, + pub fr: __BindgenUnionField<*mut fr_proto>, + pub fr_pvc: __BindgenUnionField<*mut fr_proto_pvc>, + pub fr_pvc_info: __BindgenUnionField<*mut fr_proto_pvc_info>, + pub sync: __BindgenUnionField<*mut sync_serial_settings>, + pub te1: __BindgenUnionField<*mut te1_settings>, + pub bindgen_union_field: u64, +} +#[test] +fn bindgen_test_layout_if_settings__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(if_settings__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(if_settings__bindgen_ty_1)) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).raw_hdlc as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(raw_hdlc) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).cisco as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(cisco) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).fr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(fr) + ) 
+ ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).fr_pvc as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(fr_pvc) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).fr_pvc_info as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(fr_pvc_info) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).sync as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(sync) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings__bindgen_ty_1)).te1 as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings__bindgen_ty_1), + "::", + stringify!(te1) + ) + ); +} +impl Clone for if_settings__bindgen_ty_1 { + fn clone(&self) -> Self { + *self + } +} +#[test] +fn bindgen_test_layout_if_settings() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(if_settings)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(if_settings)) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings)).type_ as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(if_settings), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings)).size as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(if_settings), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const if_settings)).ifs_ifsu as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(if_settings), + "::", + stringify!(ifs_ifsu) + ) + ); +} +impl Clone for if_settings { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ifreq { + 
pub ifr_ifrn: ifreq__bindgen_ty_1, + pub ifr_ifru: ifreq__bindgen_ty_2, +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ifreq__bindgen_ty_1 { + pub ifrn_name: __BindgenUnionField<[::std::os::raw::c_uchar; 16usize]>, + pub bindgen_union_field: [u8; 16usize], +} +#[test] +fn bindgen_test_layout_ifreq__bindgen_ty_1() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(ifreq__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 1usize, + concat!("Alignment of ", stringify!(ifreq__bindgen_ty_1)) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_1)).ifrn_name as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_1), + "::", + stringify!(ifrn_name) + ) + ); +} +impl Clone for ifreq__bindgen_ty_1 { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ifreq__bindgen_ty_2 { + pub ifru_addr: __BindgenUnionField, + pub ifru_dstaddr: __BindgenUnionField, + pub ifru_broadaddr: __BindgenUnionField, + pub ifru_netmask: __BindgenUnionField, + pub ifru_hwaddr: __BindgenUnionField, + pub ifru_flags: __BindgenUnionField<::std::os::raw::c_short>, + pub ifru_ivalue: __BindgenUnionField<::std::os::raw::c_int>, + pub ifru_mtu: __BindgenUnionField<::std::os::raw::c_int>, + pub ifru_map: __BindgenUnionField, + pub ifru_slave: __BindgenUnionField<[::std::os::raw::c_char; 16usize]>, + pub ifru_newname: __BindgenUnionField<[::std::os::raw::c_char; 16usize]>, + pub ifru_data: __BindgenUnionField<*mut ::std::os::raw::c_void>, + pub ifru_settings: __BindgenUnionField, + pub bindgen_union_field: [u64; 3usize], +} +#[test] +fn bindgen_test_layout_ifreq__bindgen_ty_2() { + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(ifreq__bindgen_ty_2)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ifreq__bindgen_ty_2)) + ); + assert_eq!( + unsafe { &(*(0 as 
*const ifreq__bindgen_ty_2)).ifru_addr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_addr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_dstaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_dstaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_broadaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_broadaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_netmask as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_netmask) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_hwaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_hwaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_flags as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_flags) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_ivalue as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_ivalue) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_mtu as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_mtu) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_map as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_map) + ) + ); + assert_eq!( + unsafe { 
&(*(0 as *const ifreq__bindgen_ty_2)).ifru_slave as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_slave) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_newname as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_newname) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_data as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_data) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq__bindgen_ty_2)).ifru_settings as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq__bindgen_ty_2), + "::", + stringify!(ifru_settings) + ) + ); +} +impl Clone for ifreq__bindgen_ty_2 { + fn clone(&self) -> Self { + *self + } +} +#[test] +fn bindgen_test_layout_ifreq() { + assert_eq!( + ::std::mem::size_of::(), + 40usize, + concat!("Size of: ", stringify!(ifreq)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ifreq)) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq)).ifr_ifrn as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ifreq), + "::", + stringify!(ifr_ifrn) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ifreq)).ifr_ifru as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(ifreq), + "::", + stringify!(ifr_ifru) + ) + ); +} +impl Clone for ifreq { + fn clone(&self) -> Self { + *self + } +} diff --git a/src/dragonball/src/dbs_utils/src/net/net_gen/inn.rs b/src/dragonball/src/dbs_utils/src/net/net_gen/inn.rs new file mode 100644 index 000000000..84401eafa --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/net_gen/inn.rs @@ -0,0 +1,845 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. 
+// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. +// SPDX-License-Identifier: Apache-2.0 + +/* automatically generated by rust-bindgen */ + +pub const __BITS_PER_LONG: ::std::os::raw::c_uint = 64; +pub const __FD_SETSIZE: ::std::os::raw::c_uint = 1024; +pub const __UAPI_DEF_IN_ADDR: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN_IPPROTO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN_PKTINFO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IP_MREQ: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_SOCKADDR_IN: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN_CLASS: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN6_ADDR: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN6_ADDR_ALT: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_SOCKADDR_IN6: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IPV6_MREQ: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IPPROTO_V6: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IPV6_OPTIONS: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IN6_PKTINFO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_IP6_MTUINFO: ::std::os::raw::c_uint = 1; +pub const __UAPI_DEF_XATTR: ::std::os::raw::c_uint = 1; +pub const _K_SS_MAXSIZE: ::std::os::raw::c_uint = 128; +pub const IP_TOS: ::std::os::raw::c_uint = 1; +pub const IP_TTL: ::std::os::raw::c_uint = 2; +pub const IP_HDRINCL: ::std::os::raw::c_uint = 3; +pub const IP_OPTIONS: ::std::os::raw::c_uint = 4; +pub const IP_ROUTER_ALERT: ::std::os::raw::c_uint = 5; +pub const IP_RECVOPTS: ::std::os::raw::c_uint = 6; +pub const IP_RETOPTS: ::std::os::raw::c_uint = 7; +pub const IP_PKTINFO: ::std::os::raw::c_uint = 8; +pub const IP_PKTOPTIONS: ::std::os::raw::c_uint = 9; +pub const IP_MTU_DISCOVER: ::std::os::raw::c_uint = 10; +pub const IP_RECVERR: ::std::os::raw::c_uint = 11; +pub const IP_RECVTTL: ::std::os::raw::c_uint = 12; +pub const 
IP_RECVTOS: ::std::os::raw::c_uint = 13; +pub const IP_MTU: ::std::os::raw::c_uint = 14; +pub const IP_FREEBIND: ::std::os::raw::c_uint = 15; +pub const IP_IPSEC_POLICY: ::std::os::raw::c_uint = 16; +pub const IP_XFRM_POLICY: ::std::os::raw::c_uint = 17; +pub const IP_PASSSEC: ::std::os::raw::c_uint = 18; +pub const IP_TRANSPARENT: ::std::os::raw::c_uint = 19; +pub const IP_RECVRETOPTS: ::std::os::raw::c_uint = 7; +pub const IP_ORIGDSTADDR: ::std::os::raw::c_uint = 20; +pub const IP_RECVORIGDSTADDR: ::std::os::raw::c_uint = 20; +pub const IP_MINTTL: ::std::os::raw::c_uint = 21; +pub const IP_NODEFRAG: ::std::os::raw::c_uint = 22; +pub const IP_CHECKSUM: ::std::os::raw::c_uint = 23; +pub const IP_BIND_ADDRESS_NO_PORT: ::std::os::raw::c_uint = 24; +pub const IP_RECVFRAGSIZE: ::std::os::raw::c_uint = 25; +pub const IP_PMTUDISC_DONT: ::std::os::raw::c_uint = 0; +pub const IP_PMTUDISC_WANT: ::std::os::raw::c_uint = 1; +pub const IP_PMTUDISC_DO: ::std::os::raw::c_uint = 2; +pub const IP_PMTUDISC_PROBE: ::std::os::raw::c_uint = 3; +pub const IP_PMTUDISC_INTERFACE: ::std::os::raw::c_uint = 4; +pub const IP_PMTUDISC_OMIT: ::std::os::raw::c_uint = 5; +pub const IP_MULTICAST_IF: ::std::os::raw::c_uint = 32; +pub const IP_MULTICAST_TTL: ::std::os::raw::c_uint = 33; +pub const IP_MULTICAST_LOOP: ::std::os::raw::c_uint = 34; +pub const IP_ADD_MEMBERSHIP: ::std::os::raw::c_uint = 35; +pub const IP_DROP_MEMBERSHIP: ::std::os::raw::c_uint = 36; +pub const IP_UNBLOCK_SOURCE: ::std::os::raw::c_uint = 37; +pub const IP_BLOCK_SOURCE: ::std::os::raw::c_uint = 38; +pub const IP_ADD_SOURCE_MEMBERSHIP: ::std::os::raw::c_uint = 39; +pub const IP_DROP_SOURCE_MEMBERSHIP: ::std::os::raw::c_uint = 40; +pub const IP_MSFILTER: ::std::os::raw::c_uint = 41; +pub const MCAST_JOIN_GROUP: ::std::os::raw::c_uint = 42; +pub const MCAST_BLOCK_SOURCE: ::std::os::raw::c_uint = 43; +pub const MCAST_UNBLOCK_SOURCE: ::std::os::raw::c_uint = 44; +pub const MCAST_LEAVE_GROUP: ::std::os::raw::c_uint = 45; +pub 
const MCAST_JOIN_SOURCE_GROUP: ::std::os::raw::c_uint = 46; +pub const MCAST_LEAVE_SOURCE_GROUP: ::std::os::raw::c_uint = 47; +pub const MCAST_MSFILTER: ::std::os::raw::c_uint = 48; +pub const IP_MULTICAST_ALL: ::std::os::raw::c_uint = 49; +pub const IP_UNICAST_IF: ::std::os::raw::c_uint = 50; +pub const MCAST_EXCLUDE: ::std::os::raw::c_uint = 0; +pub const MCAST_INCLUDE: ::std::os::raw::c_uint = 1; +pub const IP_DEFAULT_MULTICAST_TTL: ::std::os::raw::c_uint = 1; +pub const IP_DEFAULT_MULTICAST_LOOP: ::std::os::raw::c_uint = 1; +pub const __SOCK_SIZE__: ::std::os::raw::c_uint = 16; +pub const IN_CLASSA_NET: ::std::os::raw::c_uint = 4278190080; +pub const IN_CLASSA_NSHIFT: ::std::os::raw::c_uint = 24; +pub const IN_CLASSA_HOST: ::std::os::raw::c_uint = 16777215; +pub const IN_CLASSA_MAX: ::std::os::raw::c_uint = 128; +pub const IN_CLASSB_NET: ::std::os::raw::c_uint = 4294901760; +pub const IN_CLASSB_NSHIFT: ::std::os::raw::c_uint = 16; +pub const IN_CLASSB_HOST: ::std::os::raw::c_uint = 65535; +pub const IN_CLASSB_MAX: ::std::os::raw::c_uint = 65536; +pub const IN_CLASSC_NET: ::std::os::raw::c_uint = 4294967040; +pub const IN_CLASSC_NSHIFT: ::std::os::raw::c_uint = 8; +pub const IN_CLASSC_HOST: ::std::os::raw::c_uint = 255; +pub const IN_MULTICAST_NET: ::std::os::raw::c_uint = 4026531840; +pub const IN_LOOPBACKNET: ::std::os::raw::c_uint = 127; +pub const INADDR_LOOPBACK: ::std::os::raw::c_uint = 2130706433; +pub const INADDR_UNSPEC_GROUP: ::std::os::raw::c_uint = 3758096384; +pub const INADDR_ALLHOSTS_GROUP: ::std::os::raw::c_uint = 3758096385; +pub const INADDR_ALLRTRS_GROUP: ::std::os::raw::c_uint = 3758096386; +pub const INADDR_MAX_LOCAL_GROUP: ::std::os::raw::c_uint = 3758096639; +pub const __LITTLE_ENDIAN: ::std::os::raw::c_uint = 1234; +pub type __s8 = ::std::os::raw::c_schar; +pub type __u8 = ::std::os::raw::c_uchar; +pub type __s16 = ::std::os::raw::c_short; +pub type __u16 = ::std::os::raw::c_ushort; +pub type __s32 = ::std::os::raw::c_int; +pub type __u32 
= ::std::os::raw::c_uint; +pub type __s64 = ::std::os::raw::c_longlong; +pub type __u64 = ::std::os::raw::c_ulonglong; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct __kernel_fd_set { + pub fds_bits: [::std::os::raw::c_ulong; 16usize], +} +#[test] +fn bindgen_test_layout___kernel_fd_set() { + assert_eq!( + ::std::mem::size_of::<__kernel_fd_set>(), + 128usize, + concat!("Size of: ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fd_set>(), + 8usize, + concat!("Alignment of ", stringify!(__kernel_fd_set)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_fd_set)).fds_bits as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_fd_set), + "::", + stringify!(fds_bits) + ) + ); +} +impl Clone for __kernel_fd_set { + fn clone(&self) -> Self { + *self + } +} +pub type __kernel_sighandler_t = + ::std::option::Option; +pub type __kernel_key_t = ::std::os::raw::c_int; +pub type __kernel_mqd_t = ::std::os::raw::c_int; +pub type __kernel_old_uid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_gid_t = ::std::os::raw::c_ushort; +pub type __kernel_old_dev_t = ::std::os::raw::c_ulong; +pub type __kernel_long_t = ::std::os::raw::c_long; +pub type __kernel_ulong_t = ::std::os::raw::c_ulong; +pub type __kernel_ino_t = __kernel_ulong_t; +pub type __kernel_mode_t = ::std::os::raw::c_uint; +pub type __kernel_pid_t = ::std::os::raw::c_int; +pub type __kernel_ipc_pid_t = ::std::os::raw::c_int; +pub type __kernel_uid_t = ::std::os::raw::c_uint; +pub type __kernel_gid_t = ::std::os::raw::c_uint; +pub type __kernel_suseconds_t = __kernel_long_t; +pub type __kernel_daddr_t = ::std::os::raw::c_int; +pub type __kernel_uid32_t = ::std::os::raw::c_uint; +pub type __kernel_gid32_t = ::std::os::raw::c_uint; +pub type __kernel_size_t = __kernel_ulong_t; +pub type __kernel_ssize_t = __kernel_long_t; +pub type __kernel_ptrdiff_t = __kernel_long_t; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct 
__kernel_fsid_t { + pub val: [::std::os::raw::c_int; 2usize], +} +#[test] +fn bindgen_test_layout___kernel_fsid_t() { + assert_eq!( + ::std::mem::size_of::<__kernel_fsid_t>(), + 8usize, + concat!("Size of: ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_fsid_t>(), + 4usize, + concat!("Alignment of ", stringify!(__kernel_fsid_t)) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_fsid_t)).val as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_fsid_t), + "::", + stringify!(val) + ) + ); +} +impl Clone for __kernel_fsid_t { + fn clone(&self) -> Self { + *self + } +} +pub type __kernel_off_t = __kernel_long_t; +pub type __kernel_loff_t = ::std::os::raw::c_longlong; +pub type __kernel_time_t = __kernel_long_t; +pub type __kernel_clock_t = __kernel_long_t; +pub type __kernel_timer_t = ::std::os::raw::c_int; +pub type __kernel_clockid_t = ::std::os::raw::c_int; +pub type __kernel_caddr_t = *mut ::std::os::raw::c_char; +pub type __kernel_uid16_t = ::std::os::raw::c_ushort; +pub type __kernel_gid16_t = ::std::os::raw::c_ushort; +pub type __le16 = __u16; +pub type __be16 = __u16; +pub type __le32 = __u32; +pub type __be32 = __u32; +pub type __le64 = __u64; +pub type __be64 = __u64; +pub type __sum16 = __u16; +pub type __wsum = __u32; +pub type __kernel_sa_family_t = ::std::os::raw::c_ushort; +#[repr(C)] +pub struct __kernel_sockaddr_storage { + pub ss_family: __kernel_sa_family_t, + pub __data: [::std::os::raw::c_char; 126usize], + pub __bindgen_align: [u64; 0usize], +} +#[test] +fn bindgen_test_layout___kernel_sockaddr_storage() { + assert_eq!( + ::std::mem::size_of::<__kernel_sockaddr_storage>(), + 128usize, + concat!("Size of: ", stringify!(__kernel_sockaddr_storage)) + ); + assert_eq!( + ::std::mem::align_of::<__kernel_sockaddr_storage>(), + 8usize, + concat!("Alignment of ", stringify!(__kernel_sockaddr_storage)) + ); + assert_eq!( + unsafe { &(*(0 as *const 
__kernel_sockaddr_storage)).ss_family as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_sockaddr_storage), + "::", + stringify!(ss_family) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const __kernel_sockaddr_storage)).__data as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(__kernel_sockaddr_storage), + "::", + stringify!(__data) + ) + ); +} +impl Default for __kernel_sockaddr_storage { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +pub const IPPROTO_IP: _bindgen_ty_1 = 0; +pub const IPPROTO_ICMP: _bindgen_ty_1 = 1; +pub const IPPROTO_IGMP: _bindgen_ty_1 = 2; +pub const IPPROTO_IPIP: _bindgen_ty_1 = 4; +pub const IPPROTO_TCP: _bindgen_ty_1 = 6; +pub const IPPROTO_EGP: _bindgen_ty_1 = 8; +pub const IPPROTO_PUP: _bindgen_ty_1 = 12; +pub const IPPROTO_UDP: _bindgen_ty_1 = 17; +pub const IPPROTO_IDP: _bindgen_ty_1 = 22; +pub const IPPROTO_TP: _bindgen_ty_1 = 29; +pub const IPPROTO_DCCP: _bindgen_ty_1 = 33; +pub const IPPROTO_IPV6: _bindgen_ty_1 = 41; +pub const IPPROTO_RSVP: _bindgen_ty_1 = 46; +pub const IPPROTO_GRE: _bindgen_ty_1 = 47; +pub const IPPROTO_ESP: _bindgen_ty_1 = 50; +pub const IPPROTO_AH: _bindgen_ty_1 = 51; +pub const IPPROTO_MTP: _bindgen_ty_1 = 92; +pub const IPPROTO_BEETPH: _bindgen_ty_1 = 94; +pub const IPPROTO_ENCAP: _bindgen_ty_1 = 98; +pub const IPPROTO_PIM: _bindgen_ty_1 = 103; +pub const IPPROTO_COMP: _bindgen_ty_1 = 108; +pub const IPPROTO_SCTP: _bindgen_ty_1 = 132; +pub const IPPROTO_UDPLITE: _bindgen_ty_1 = 136; +pub const IPPROTO_MPLS: _bindgen_ty_1 = 137; +pub const IPPROTO_RAW: _bindgen_ty_1 = 255; +pub const IPPROTO_MAX: _bindgen_ty_1 = 256; +pub type _bindgen_ty_1 = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct in_addr { + pub s_addr: __be32, +} +#[test] +fn bindgen_test_layout_in_addr() { + assert_eq!( + ::std::mem::size_of::(), + 4usize, + concat!("Size of: ", stringify!(in_addr)) + ); + assert_eq!( + 
::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(in_addr)) + ); + assert_eq!( + unsafe { &(*(0 as *const in_addr)).s_addr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(in_addr), + "::", + stringify!(s_addr) + ) + ); +} +impl Clone for in_addr { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ip_mreq { + pub imr_multiaddr: in_addr, + pub imr_interface: in_addr, +} +#[test] +fn bindgen_test_layout_ip_mreq() { + assert_eq!( + ::std::mem::size_of::(), + 8usize, + concat!("Size of: ", stringify!(ip_mreq)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ip_mreq)) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreq)).imr_multiaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreq), + "::", + stringify!(imr_multiaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreq)).imr_interface as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreq), + "::", + stringify!(imr_interface) + ) + ); +} +impl Clone for ip_mreq { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ip_mreqn { + pub imr_multiaddr: in_addr, + pub imr_address: in_addr, + pub imr_ifindex: ::std::os::raw::c_int, +} +#[test] +fn bindgen_test_layout_ip_mreqn() { + assert_eq!( + ::std::mem::size_of::(), + 12usize, + concat!("Size of: ", stringify!(ip_mreqn)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ip_mreqn)) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreqn)).imr_multiaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreqn), + "::", + stringify!(imr_multiaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreqn)).imr_address as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + 
stringify!(ip_mreqn), + "::", + stringify!(imr_address) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreqn)).imr_ifindex as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreqn), + "::", + stringify!(imr_ifindex) + ) + ); +} +impl Clone for ip_mreqn { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ip_mreq_source { + pub imr_multiaddr: __be32, + pub imr_interface: __be32, + pub imr_sourceaddr: __be32, +} +#[test] +fn bindgen_test_layout_ip_mreq_source() { + assert_eq!( + ::std::mem::size_of::(), + 12usize, + concat!("Size of: ", stringify!(ip_mreq_source)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ip_mreq_source)) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreq_source)).imr_multiaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreq_source), + "::", + stringify!(imr_multiaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreq_source)).imr_interface as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreq_source), + "::", + stringify!(imr_interface) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_mreq_source)).imr_sourceaddr as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(ip_mreq_source), + "::", + stringify!(imr_sourceaddr) + ) + ); +} +impl Clone for ip_mreq_source { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct ip_msfilter { + pub imsf_multiaddr: __be32, + pub imsf_interface: __be32, + pub imsf_fmode: __u32, + pub imsf_numsrc: __u32, + pub imsf_slist: [__be32; 1usize], +} +#[test] +fn bindgen_test_layout_ip_msfilter() { + assert_eq!( + ::std::mem::size_of::(), + 20usize, + concat!("Size of: ", stringify!(ip_msfilter)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ip_msfilter)) + ); + 
assert_eq!( + unsafe { &(*(0 as *const ip_msfilter)).imsf_multiaddr as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(ip_msfilter), + "::", + stringify!(imsf_multiaddr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_msfilter)).imsf_interface as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(ip_msfilter), + "::", + stringify!(imsf_interface) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_msfilter)).imsf_fmode as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(ip_msfilter), + "::", + stringify!(imsf_fmode) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_msfilter)).imsf_numsrc as *const _ as usize }, + 12usize, + concat!( + "Alignment of field: ", + stringify!(ip_msfilter), + "::", + stringify!(imsf_numsrc) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const ip_msfilter)).imsf_slist as *const _ as usize }, + 16usize, + concat!( + "Alignment of field: ", + stringify!(ip_msfilter), + "::", + stringify!(imsf_slist) + ) + ); +} +impl Clone for ip_msfilter { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +pub struct group_req { + pub gr_interface: __u32, + pub gr_group: __kernel_sockaddr_storage, +} +#[test] +fn bindgen_test_layout_group_req() { + assert_eq!( + ::std::mem::size_of::(), + 136usize, + concat!("Size of: ", stringify!(group_req)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(group_req)) + ); + assert_eq!( + unsafe { &(*(0 as *const group_req)).gr_interface as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(group_req), + "::", + stringify!(gr_interface) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_req)).gr_group as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(group_req), + "::", + stringify!(gr_group) + ) + ); +} +impl Default for group_req { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} 
+#[repr(C)] +pub struct group_source_req { + pub gsr_interface: __u32, + pub gsr_group: __kernel_sockaddr_storage, + pub gsr_source: __kernel_sockaddr_storage, +} +#[test] +fn bindgen_test_layout_group_source_req() { + assert_eq!( + ::std::mem::size_of::(), + 264usize, + concat!("Size of: ", stringify!(group_source_req)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(group_source_req)) + ); + assert_eq!( + unsafe { &(*(0 as *const group_source_req)).gsr_interface as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(group_source_req), + "::", + stringify!(gsr_interface) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_source_req)).gsr_group as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(group_source_req), + "::", + stringify!(gsr_group) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_source_req)).gsr_source as *const _ as usize }, + 136usize, + concat!( + "Alignment of field: ", + stringify!(group_source_req), + "::", + stringify!(gsr_source) + ) + ); +} +impl Default for group_source_req { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[repr(C)] +pub struct group_filter { + pub gf_interface: __u32, + pub gf_group: __kernel_sockaddr_storage, + pub gf_fmode: __u32, + pub gf_numsrc: __u32, + pub gf_slist: [__kernel_sockaddr_storage; 1usize], +} +#[test] +fn bindgen_test_layout_group_filter() { + assert_eq!( + ::std::mem::size_of::(), + 272usize, + concat!("Size of: ", stringify!(group_filter)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(group_filter)) + ); + assert_eq!( + unsafe { &(*(0 as *const group_filter)).gf_interface as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(group_filter), + "::", + stringify!(gf_interface) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_filter)).gf_group as *const _ as usize }, + 8usize, + 
concat!( + "Alignment of field: ", + stringify!(group_filter), + "::", + stringify!(gf_group) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_filter)).gf_fmode as *const _ as usize }, + 136usize, + concat!( + "Alignment of field: ", + stringify!(group_filter), + "::", + stringify!(gf_fmode) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_filter)).gf_numsrc as *const _ as usize }, + 140usize, + concat!( + "Alignment of field: ", + stringify!(group_filter), + "::", + stringify!(gf_numsrc) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const group_filter)).gf_slist as *const _ as usize }, + 144usize, + concat!( + "Alignment of field: ", + stringify!(group_filter), + "::", + stringify!(gf_slist) + ) + ); +} +impl Default for group_filter { + fn default() -> Self { + unsafe { ::std::mem::zeroed() } + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct in_pktinfo { + pub ipi_ifindex: ::std::os::raw::c_int, + pub ipi_spec_dst: in_addr, + pub ipi_addr: in_addr, +} +#[test] +fn bindgen_test_layout_in_pktinfo() { + assert_eq!( + ::std::mem::size_of::(), + 12usize, + concat!("Size of: ", stringify!(in_pktinfo)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(in_pktinfo)) + ); + assert_eq!( + unsafe { &(*(0 as *const in_pktinfo)).ipi_ifindex as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(in_pktinfo), + "::", + stringify!(ipi_ifindex) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const in_pktinfo)).ipi_spec_dst as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(in_pktinfo), + "::", + stringify!(ipi_spec_dst) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const in_pktinfo)).ipi_addr as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(in_pktinfo), + "::", + stringify!(ipi_addr) + ) + ); +} +impl Clone for in_pktinfo { + fn clone(&self) -> Self { + *self + } +} +#[repr(C)] +#[derive(Debug, Default, Copy)] +pub struct 
sockaddr_in { + pub sin_family: __kernel_sa_family_t, + pub sin_port: __be16, + pub sin_addr: in_addr, + pub __pad: [::std::os::raw::c_uchar; 8usize], +} +#[test] +fn bindgen_test_layout_sockaddr_in() { + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(sockaddr_in)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(sockaddr_in)) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_in)).sin_family as *const _ as usize }, + 0usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_in), + "::", + stringify!(sin_family) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_in)).sin_port as *const _ as usize }, + 2usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_in), + "::", + stringify!(sin_port) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_in)).sin_addr as *const _ as usize }, + 4usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_in), + "::", + stringify!(sin_addr) + ) + ); + assert_eq!( + unsafe { &(*(0 as *const sockaddr_in)).__pad as *const _ as usize }, + 8usize, + concat!( + "Alignment of field: ", + stringify!(sockaddr_in), + "::", + stringify!(__pad) + ) + ); +} +impl Clone for sockaddr_in { + fn clone(&self) -> Self { + *self + } +} diff --git a/src/dragonball/src/dbs_utils/src/net/net_gen/mod.rs b/src/dragonball/src/dbs_utils/src/net/net_gen/mod.rs new file mode 100644 index 000000000..30758f351 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/net_gen/mod.rs @@ -0,0 +1,35 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. +// Copyright TUNTAP, 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+// SPDX-License-Identifier: Apache-2.0 + +#![allow(clippy::all)] +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(missing_docs)] +#![allow(deref_nullptr)] + +// generated with bindgen /usr/include/linux/if.h --no-unstable-rust +// --constified-enum '*' --with-derive-default -- -D __UAPI_DEF_IF_IFNAMSIZ -D +// __UAPI_DEF_IF_NET_DEVICE_FLAGS -D __UAPI_DEF_IF_IFREQ -D __UAPI_DEF_IF_IFMAP +// Name is "iff" to avoid conflicting with "if" keyword. +// Generated against Linux 4.11 to include fix "uapi: fix linux/if.h userspace +// compilation errors". +// Manual fixup of ifrn_name to be of type c_uchar instead of c_char. +pub mod iff; +// generated with bindgen /usr/include/linux/if_tun.h --no-unstable-rust +// --constified-enum '*' --with-derive-default +pub mod if_tun; +// generated with bindgen /usr/include/linux/in.h --no-unstable-rust +// --constified-enum '*' --with-derive-default +// Name is "inn" to avoid conflicting with "in" keyword. +pub mod inn; +// generated with bindgen /usr/include/linux/sockios.h --no-unstable-rust +// --constified-enum '*' --with-derive-default +pub mod sockios; +pub use if_tun::*; +pub use iff::*; +pub use inn::*; +pub use sockios::*; diff --git a/src/dragonball/src/dbs_utils/src/net/net_gen/sockios.rs b/src/dragonball/src/dbs_utils/src/net/net_gen/sockios.rs new file mode 100644 index 000000000..18037c353 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/net_gen/sockios.rs @@ -0,0 +1,91 @@ +// Copyright 2023 Alibaba Cloud. All Rights Reserved. +// Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+// SPDX-License-Identifier: Apache-2.0 + +/* automatically generated by rust-bindgen */ + +pub const FIOSETOWN: ::std::os::raw::c_uint = 35073; +pub const SIOCSPGRP: ::std::os::raw::c_uint = 35074; +pub const FIOGETOWN: ::std::os::raw::c_uint = 35075; +pub const SIOCGPGRP: ::std::os::raw::c_uint = 35076; +pub const SIOCATMARK: ::std::os::raw::c_uint = 35077; +pub const SIOCGSTAMP: ::std::os::raw::c_uint = 35078; +pub const SIOCGSTAMPNS: ::std::os::raw::c_uint = 35079; +pub const SOCK_IOC_TYPE: ::std::os::raw::c_uint = 137; +pub const SIOCADDRT: ::std::os::raw::c_uint = 35083; +pub const SIOCDELRT: ::std::os::raw::c_uint = 35084; +pub const SIOCRTMSG: ::std::os::raw::c_uint = 35085; +pub const SIOCGIFNAME: ::std::os::raw::c_uint = 35088; +pub const SIOCSIFLINK: ::std::os::raw::c_uint = 35089; +pub const SIOCGIFCONF: ::std::os::raw::c_uint = 35090; +pub const SIOCGIFFLAGS: ::std::os::raw::c_uint = 35091; +pub const SIOCSIFFLAGS: ::std::os::raw::c_uint = 35092; +pub const SIOCGIFADDR: ::std::os::raw::c_uint = 35093; +pub const SIOCSIFADDR: ::std::os::raw::c_uint = 35094; +pub const SIOCGIFDSTADDR: ::std::os::raw::c_uint = 35095; +pub const SIOCSIFDSTADDR: ::std::os::raw::c_uint = 35096; +pub const SIOCGIFBRDADDR: ::std::os::raw::c_uint = 35097; +pub const SIOCSIFBRDADDR: ::std::os::raw::c_uint = 35098; +pub const SIOCGIFNETMASK: ::std::os::raw::c_uint = 35099; +pub const SIOCSIFNETMASK: ::std::os::raw::c_uint = 35100; +pub const SIOCGIFMETRIC: ::std::os::raw::c_uint = 35101; +pub const SIOCSIFMETRIC: ::std::os::raw::c_uint = 35102; +pub const SIOCGIFMEM: ::std::os::raw::c_uint = 35103; +pub const SIOCSIFMEM: ::std::os::raw::c_uint = 35104; +pub const SIOCGIFMTU: ::std::os::raw::c_uint = 35105; +pub const SIOCSIFMTU: ::std::os::raw::c_uint = 35106; +pub const SIOCSIFNAME: ::std::os::raw::c_uint = 35107; +pub const SIOCSIFHWADDR: ::std::os::raw::c_uint = 35108; +pub const SIOCGIFENCAP: ::std::os::raw::c_uint = 35109; +pub const SIOCSIFENCAP: ::std::os::raw::c_uint = 
35110; +pub const SIOCGIFHWADDR: ::std::os::raw::c_uint = 35111; +pub const SIOCGIFSLAVE: ::std::os::raw::c_uint = 35113; +pub const SIOCSIFSLAVE: ::std::os::raw::c_uint = 35120; +pub const SIOCADDMULTI: ::std::os::raw::c_uint = 35121; +pub const SIOCDELMULTI: ::std::os::raw::c_uint = 35122; +pub const SIOCGIFINDEX: ::std::os::raw::c_uint = 35123; +pub const SIOGIFINDEX: ::std::os::raw::c_uint = 35123; +pub const SIOCSIFPFLAGS: ::std::os::raw::c_uint = 35124; +pub const SIOCGIFPFLAGS: ::std::os::raw::c_uint = 35125; +pub const SIOCDIFADDR: ::std::os::raw::c_uint = 35126; +pub const SIOCSIFHWBROADCAST: ::std::os::raw::c_uint = 35127; +pub const SIOCGIFCOUNT: ::std::os::raw::c_uint = 35128; +pub const SIOCGIFBR: ::std::os::raw::c_uint = 35136; +pub const SIOCSIFBR: ::std::os::raw::c_uint = 35137; +pub const SIOCGIFTXQLEN: ::std::os::raw::c_uint = 35138; +pub const SIOCSIFTXQLEN: ::std::os::raw::c_uint = 35139; +pub const SIOCETHTOOL: ::std::os::raw::c_uint = 35142; +pub const SIOCGMIIPHY: ::std::os::raw::c_uint = 35143; +pub const SIOCGMIIREG: ::std::os::raw::c_uint = 35144; +pub const SIOCSMIIREG: ::std::os::raw::c_uint = 35145; +pub const SIOCWANDEV: ::std::os::raw::c_uint = 35146; +pub const SIOCOUTQNSD: ::std::os::raw::c_uint = 35147; +pub const SIOCGSKNS: ::std::os::raw::c_uint = 35148; +pub const SIOCDARP: ::std::os::raw::c_uint = 35155; +pub const SIOCGARP: ::std::os::raw::c_uint = 35156; +pub const SIOCSARP: ::std::os::raw::c_uint = 35157; +pub const SIOCDRARP: ::std::os::raw::c_uint = 35168; +pub const SIOCGRARP: ::std::os::raw::c_uint = 35169; +pub const SIOCSRARP: ::std::os::raw::c_uint = 35170; +pub const SIOCGIFMAP: ::std::os::raw::c_uint = 35184; +pub const SIOCSIFMAP: ::std::os::raw::c_uint = 35185; +pub const SIOCADDDLCI: ::std::os::raw::c_uint = 35200; +pub const SIOCDELDLCI: ::std::os::raw::c_uint = 35201; +pub const SIOCGIFVLAN: ::std::os::raw::c_uint = 35202; +pub const SIOCSIFVLAN: ::std::os::raw::c_uint = 35203; +pub const SIOCBONDENSLAVE: 
::std::os::raw::c_uint = 35216; +pub const SIOCBONDRELEASE: ::std::os::raw::c_uint = 35217; +pub const SIOCBONDSETHWADDR: ::std::os::raw::c_uint = 35218; +pub const SIOCBONDSLAVEINFOQUERY: ::std::os::raw::c_uint = 35219; +pub const SIOCBONDINFOQUERY: ::std::os::raw::c_uint = 35220; +pub const SIOCBONDCHANGEACTIVE: ::std::os::raw::c_uint = 35221; +pub const SIOCBRADDBR: ::std::os::raw::c_uint = 35232; +pub const SIOCBRDELBR: ::std::os::raw::c_uint = 35233; +pub const SIOCBRADDIF: ::std::os::raw::c_uint = 35234; +pub const SIOCBRDELIF: ::std::os::raw::c_uint = 35235; +pub const SIOCSHWTSTAMP: ::std::os::raw::c_uint = 35248; +pub const SIOCGHWTSTAMP: ::std::os::raw::c_uint = 35249; +pub const SIOCDEVPRIVATE: ::std::os::raw::c_uint = 35312; +pub const SIOCPROTOPRIVATE: ::std::os::raw::c_uint = 35296; diff --git a/src/dragonball/src/dbs_utils/src/net/tap.rs b/src/dragonball/src/dbs_utils/src/net/tap.rs new file mode 100644 index 000000000..012cce494 --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/net/tap.rs @@ -0,0 +1,471 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::fs::File; +use std::io::{Error as IoError, Read, Result as IoResult, Write}; +use std::net::UdpSocket; +use std::os::raw::*; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; + +use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; +use vmm_sys_util::{ioctl_ioc_nr, ioctl_iow_nr}; + +use crate::net::net_gen; + +// As defined in the Linux UAPI: +// https://elixir.bootlin.com/linux/v4.17/source/include/uapi/linux/if.h#L33 +pub(crate) const IFACE_NAME_MAX_LEN: usize = 16; + +/// List of errors the tap implementation can throw. 
+#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Failed to create a socket. + #[error("cannot create socket. {0}")] + CreateSocket(#[source] IoError), + + /// Unable to create tap interface. + #[error("cannot create tap devic. {0}")] + CreateTap(IoError), + + /// Invalid interface name. + #[error("invalid network interface name")] + InvalidIfname, + + /// ioctl failed. + #[error("failure while issue Tap ioctl command. {0}")] + IoctlError(#[source] IoError), + + /// Couldn't open /dev/net/tun. + #[error("cannot open tap device. {0}")] + OpenTun(#[source] IoError), +} + +pub type Result = ::std::result::Result; + +const TUNTAP: ::std::os::raw::c_uint = 84; +ioctl_iow_nr!(TUNSETIFF, TUNTAP, 202, ::std::os::raw::c_int); +ioctl_iow_nr!(TUNSETOFFLOAD, TUNTAP, 208, ::std::os::raw::c_uint); +ioctl_iow_nr!(TUNSETVNETHDRSZ, TUNTAP, 216, ::std::os::raw::c_int); + +/// Handle for a network tap interface. +/// +/// For now, this simply wraps the file descriptor for the tap device so methods +/// can run ioctls on the interface. The tap interface fd will be closed when +/// Tap goes out of scope, and the kernel will clean up the interface automatically. +#[derive(Debug)] +pub struct Tap { + /// tap device file handle + pub tap_file: File, + pub(crate) if_name: [u8; IFACE_NAME_MAX_LEN], + pub(crate) if_flags: std::os::raw::c_short, +} + +impl PartialEq for Tap { + fn eq(&self, other: &Tap) -> bool { + self.if_name == other.if_name + } +} + +fn create_socket() -> Result { + // This is safe since we check the return value. + let sock = unsafe { libc::socket(libc::AF_INET, libc::SOCK_DGRAM, 0) }; + if sock < 0 { + return Err(Error::CreateSocket(IoError::last_os_error())); + } + + // This is safe; nothing else will use or hold onto the raw sock fd. + Ok(unsafe { UdpSocket::from_raw_fd(sock) }) +} + +// Returns a byte vector representing the contents of a null terminated C string which +// contains if_name. 
+fn build_terminated_if_name(if_name: &str) -> Result<[u8; IFACE_NAME_MAX_LEN]> { + // Convert the string slice to bytes, and shadow the variable, + // since we no longer need the &str version. + let if_name = if_name.as_bytes(); + + if if_name.len() >= IFACE_NAME_MAX_LEN { + return Err(Error::InvalidIfname); + } + + let mut terminated_if_name = [b'\0'; IFACE_NAME_MAX_LEN]; + terminated_if_name[..if_name.len()].copy_from_slice(if_name); + + Ok(terminated_if_name) +} + +impl Tap { + /// Create a TUN/TAP device given the interface name. + /// # Arguments + /// + /// * `if_name` - the name of the interface. + /// # Example + /// + /// ```no_run + /// use dbs_utils::net::Tap; + /// Tap::open_named("doc-test-tap", false).unwrap(); + /// ``` + pub fn open_named(if_name: &str, multi_vq: bool) -> Result { + let terminated_if_name = build_terminated_if_name(if_name)?; + + // This is pretty messy because of the unions used by ifreq. Since we + // don't call as_mut on the same union field more than once, this block + // is safe. + let mut ifreq: net_gen::ifreq = Default::default(); + unsafe { + let ifrn_name = ifreq.ifr_ifrn.ifrn_name.as_mut(); + ifrn_name.copy_from_slice(terminated_if_name.as_ref()); + let ifru_flags = ifreq.ifr_ifru.ifru_flags.as_mut(); + *ifru_flags = (net_gen::IFF_TAP + | net_gen::IFF_NO_PI + | net_gen::IFF_VNET_HDR + | if multi_vq { + net_gen::IFF_MULTI_QUEUE + } else { + 0 + }) as c_short; + } + + Tap::create_tap_with_ifreq(&mut ifreq) + } + + fn create_tap_with_ifreq(ifreq: &mut net_gen::ifreq) -> Result { + let fd = unsafe { + // Open calls are safe because we give a constant null-terminated + // string and verify the result. + libc::open( + b"/dev/net/tun\0".as_ptr() as *const c_char, + libc::O_RDWR | libc::O_NONBLOCK | libc::O_CLOEXEC, + ) + }; + if fd < 0 { + return Err(Error::OpenTun(IoError::last_os_error())); + } + + // We just checked that the fd is valid. 
+ let tuntap = unsafe { File::from_raw_fd(fd) }; + + // ioctl is safe since we call it with a valid tap fd and check the return + // value. + let ret = unsafe { ioctl_with_mut_ref(&tuntap, TUNSETIFF(), ifreq) }; + + if ret < 0 { + return Err(Error::CreateTap(IoError::last_os_error())); + } + + // Safe since only the name is accessed, and it's cloned out. + Ok(Tap { + tap_file: tuntap, + if_name: unsafe { *ifreq.ifr_ifrn.ifrn_name.as_ref() }, + if_flags: unsafe { *ifreq.ifr_ifru.ifru_flags.as_ref() }, + }) + } + + /// Change the origin tap into multiqueue taps. + pub fn into_mq_taps(self, vq_pairs: usize) -> Result> { + let mut taps = Vec::new(); + + if vq_pairs <= 1 { + taps.push(self); + return Ok(taps); + } + + // Add other socket into the origin tap interface + for _ in 0..vq_pairs - 1 { + let mut ifreq = self.get_ifreq(); + let tap = Tap::create_tap_with_ifreq(&mut ifreq)?; + + tap.enable()?; + + taps.push(tap); + } + + taps.insert(0, self); + Ok(taps) + } + + /// Set the offload flags for the tap interface. + pub fn set_offload(&self, flags: c_uint) -> Result<()> { + // ioctl is safe. Called with a valid tap fd, and we check the return. + let ret = unsafe { ioctl_with_val(&self.tap_file, TUNSETOFFLOAD(), c_ulong::from(flags)) }; + if ret < 0 { + return Err(Error::IoctlError(IoError::last_os_error())); + } + + Ok(()) + } + + /// Enable the tap interface. + pub fn enable(&self) -> Result<()> { + let sock = create_socket()?; + + let mut ifreq = self.get_ifreq(); + + // We only access one field of the ifru union, hence this is safe. + unsafe { + let ifru_flags = ifreq.ifr_ifru.ifru_flags.as_mut(); + *ifru_flags = + (net_gen::net_device_flags_IFF_UP | net_gen::net_device_flags_IFF_RUNNING) as i16; + } + + // ioctl is safe. Called with a valid sock fd, and we check the return. 
+ let ret = + unsafe { ioctl_with_ref(&sock, c_ulong::from(net_gen::sockios::SIOCSIFFLAGS), &ifreq) }; + if ret < 0 { + return Err(Error::IoctlError(IoError::last_os_error())); + } + + Ok(()) + } + + /// Set the size of the vnet hdr. + pub fn set_vnet_hdr_size(&self, size: c_int) -> Result<()> { + // ioctl is safe. Called with a valid tap fd, and we check the return. + let ret = unsafe { ioctl_with_ref(&self.tap_file, TUNSETVNETHDRSZ(), &size) }; + if ret < 0 { + return Err(Error::IoctlError(IoError::last_os_error())); + } + + Ok(()) + } + + fn get_ifreq(&self) -> net_gen::ifreq { + let mut ifreq: net_gen::ifreq = Default::default(); + + // This sets the name of the interface, which is the only entry + // in a single-field union. + unsafe { + let ifrn_name = ifreq.ifr_ifrn.ifrn_name.as_mut(); + ifrn_name.clone_from_slice(&self.if_name); + + let flags = ifreq.ifr_ifru.ifru_flags.as_mut(); + *flags = self.if_flags; + } + + ifreq + } + + /// Get the origin flags when interface was created. + pub fn if_flags(&self) -> u32 { + self.if_flags as u32 + } +} + +impl Read for Tap { + fn read(&mut self, buf: &mut [u8]) -> IoResult { + self.tap_file.read(buf) + } +} + +impl Write for Tap { + fn write(&mut self, buf: &[u8]) -> IoResult { + self.tap_file.write(buf) + } + + fn flush(&mut self) -> IoResult<()> { + Ok(()) + } +} + +impl AsRawFd for Tap { + fn as_raw_fd(&self) -> RawFd { + self.tap_file.as_raw_fd() + } +} + +mod tests { + #![allow(dead_code)] + + use std::mem; + use std::net::Ipv4Addr; + use std::str; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use super::*; + + const SUBNET_MASK: &str = "255.255.255.0"; + const TAP_IP_PREFIX: &str = "192.168.241."; + const FAKE_MAC: &str = "12:34:56:78:9a:bc"; + + // We skip the first 10 bytes because the IFF_VNET_HDR flag is set when the interface + // is created, and the legacy header is 10 bytes long without a certain flag which + // is not set in Tap::new(). 
+ const VETH_OFFSET: usize = 10; + static NEXT_IP: AtomicUsize = AtomicUsize::new(1); + + // Create a sockaddr_in from an IPv4 address, and expose it as + // an opaque sockaddr suitable for usage by socket ioctls. + fn create_sockaddr(ip_addr: Ipv4Addr) -> net_gen::sockaddr { + // IPv4 addresses big-endian (network order), but Ipv4Addr will give us + // a view of those bytes directly so we can avoid any endian trickiness. + let addr_in = net_gen::sockaddr_in { + sin_family: net_gen::AF_INET as u16, + sin_port: 0, + sin_addr: unsafe { mem::transmute(ip_addr.octets()) }, + __pad: [0; 8usize], + }; + + unsafe { mem::transmute(addr_in) } + } + impl Tap { + // We do not run unit tests in parallel so we should have no problem + // assigning the same IP. + + /// Create a new tap interface. + pub fn new() -> Result { + // The name of the tap should be {module_name}{index} so that + // we make sure it stays different when tests are run concurrently. + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + Self::open_named(&format!("dbs_tap{next_ip}"), false) + } + + /// Set the host-side IP address for the tap interface. + pub fn set_ip_addr(&self, ip_addr: Ipv4Addr) -> Result<()> { + let sock = create_socket()?; + let addr = create_sockaddr(ip_addr); + + let mut ifreq = self.get_ifreq(); + + // We only access one field of the ifru union, hence this is safe. + unsafe { + let ifru_addr = ifreq.ifr_ifru.ifru_addr.as_mut(); + *ifru_addr = addr; + } + + // ioctl is safe. Called with a valid sock fd, and we check the return. + let ret = unsafe { + ioctl_with_ref(&sock, c_ulong::from(net_gen::sockios::SIOCSIFADDR), &ifreq) + }; + if ret < 0 { + return Err(Error::IoctlError(IoError::last_os_error())); + } + + Ok(()) + } + + /// Set the netmask for the subnet that the tap interface will exist on. 
+ pub fn set_netmask(&self, netmask: Ipv4Addr) -> Result<()> { + let sock = create_socket()?; + let addr = create_sockaddr(netmask); + + let mut ifreq = self.get_ifreq(); + + // We only access one field of the ifru union, hence this is safe. + unsafe { + let ifru_addr = ifreq.ifr_ifru.ifru_addr.as_mut(); + *ifru_addr = addr; + } + + // ioctl is safe. Called with a valid sock fd, and we check the return. + let ret = unsafe { + ioctl_with_ref( + &sock, + c_ulong::from(net_gen::sockios::SIOCSIFNETMASK), + &ifreq, + ) + }; + if ret < 0 { + return Err(Error::IoctlError(IoError::last_os_error())); + } + + Ok(()) + } + } + + fn tap_name_to_string(tap: &Tap) -> String { + let null_pos = tap.if_name.iter().position(|x| *x == 0).unwrap(); + str::from_utf8(&tap.if_name[..null_pos]) + .unwrap() + .to_string() + } + + #[test] + fn test_tap_name() { + // Sanity check that the assumed max iface name length is correct. + assert_eq!( + IFACE_NAME_MAX_LEN, + net_gen::ifreq__bindgen_ty_1::default() + .bindgen_union_field + .len() + ); + + // 16 characters - too long. + let name = "a123456789abcdef"; + match Tap::open_named(name, false) { + Err(Error::InvalidIfname) => (), + _ => panic!("Expected Error::InvalidIfname"), + }; + + // 15 characters - OK. + let name = "a123456789abcde"; + let tap = Tap::open_named(name, false).unwrap(); + assert_eq!( + name, + std::str::from_utf8(&tap.if_name[0..(IFACE_NAME_MAX_LEN - 1)]).unwrap() + ); + } + + #[test] + fn test_tap_partial_eq() { + assert_ne!(Tap::new().unwrap(), Tap::new().unwrap()); + } + + #[test] + fn test_tap_configure() { + // `fetch_add` adds to the current value, returning the previous value. 
+ let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + + let tap = Tap::new().unwrap(); + let ip_addr: Ipv4Addr = format!("{TAP_IP_PREFIX}{next_ip}").parse().unwrap(); + let netmask: Ipv4Addr = SUBNET_MASK.parse().unwrap(); + + let ret = tap.set_ip_addr(ip_addr); + assert!(ret.is_ok()); + let ret = tap.set_netmask(netmask); + assert!(ret.is_ok()); + } + + #[test] + fn test_set_options() { + // This line will fail to provide an initialized FD if the test is not run as root. + let tap = Tap::new().unwrap(); + tap.set_vnet_hdr_size(16).unwrap(); + tap.set_offload(0).unwrap(); + + let faulty_tap = Tap { + tap_file: unsafe { File::from_raw_fd(i32::MAX) }, + if_name: [0x01; 16], + if_flags: 0, + }; + assert!(faulty_tap.set_vnet_hdr_size(16).is_err()); + assert!(faulty_tap.set_offload(0).is_err()); + } + + #[test] + fn test_tap_enable() { + let tap = Tap::new().unwrap(); + let ret = tap.enable(); + assert!(ret.is_ok()); + } + + #[test] + fn test_tap_get_ifreq() { + let tap = Tap::new().unwrap(); + let ret = tap.get_ifreq(); + assert_eq!( + "__BindgenUnionField", + format!("{:?}", ret.ifr_ifrn.ifrn_name) + ); + } + + #[test] + fn test_raw_fd() { + let tap = Tap::new().unwrap(); + assert_eq!(tap.as_raw_fd(), tap.tap_file.as_raw_fd()); + } +} diff --git a/src/dragonball/src/dbs_utils/src/rate_limiter.rs b/src/dragonball/src/dbs_utils/src/rate_limiter.rs new file mode 100644 index 000000000..e99e2336c --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/rate_limiter.rs @@ -0,0 +1,908 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![deny(missing_docs)] +//! # Rate Limiter +//! +//! Provides a rate limiter written in Rust useful for IO operations that need to +//! be throttled. +//! +//! ## Behavior +//! +//! The rate limiter starts off as 'unblocked' with two token buckets configured +//! with the values passed in the `RateLimiter::new()` constructor. +//! 
All subsequent accounting is done independently for each token bucket based +//! on the `TokenType` used. If any of the buckets runs out of budget, the limiter +//! goes in the 'blocked' state. At this point an internal timer is set up which +//! will later 'wake up' the user in order to retry sending data. The 'wake up' +//! notification will be dispatched as an event on the FD provided by the `AsRawFD` +//! trait implementation. +//! +//! The contract is that the user shall also call the `event_handler()` method on +//! receipt of such an event. +//! +//! The token buckets are replenished every time a `consume()` is called, before +//! actually trying to consume the requested amount of tokens. The amount of tokens +//! replenished is automatically calculated to respect the `complete_refill_time` +//! configuration parameter provided by the user. The token buckets will never +//! replenish above their respective `size`. +//! +//! Each token bucket can start off with a `one_time_burst` initial extra capacity +//! on top of their `size`. This initial extra credit does not replenish and +//! can be used for an initial burst of data. +//! +//! The granularity for 'wake up' events when the rate limiter is blocked is +//! currently hardcoded to `10 milliseconds`. +//! +//! ## Limitations +//! +//! This rate limiter implementation relies on the *Linux kernel's timerfd* so its +//! usage is limited to Linux systems. +//! +//! Another particularity of this implementation is that it is not self-driving. +//! It is meant to be used in an external event loop and thus implements the `AsRawFd` +//! trait and provides an *event-handler* as part of its API. This *event-handler* +//! needs to be called by the user on every event on the rate limiter's `AsRawFd` FD. 
+ +use std::os::unix::io::{AsRawFd, RawFd}; +use std::time::{Duration, Instant}; +use std::{fmt, io}; + +use log::error; +use timerfd::{ClockId, SetTimeFlags, TimerFd, TimerState}; + +#[derive(Debug)] +/// Describes the errors that may occur while handling rate limiter events. +pub enum Error { + /// The event handler was called spuriously. + SpuriousRateLimiterEvent(&'static str), +} + +// Interval at which the refill timer will run when limiter is at capacity. +const REFILL_TIMER_INTERVAL_MS: u64 = 10; +const TIMER_REFILL_STATE: TimerState = + TimerState::Oneshot(Duration::from_millis(REFILL_TIMER_INTERVAL_MS)); + +const NANOSEC_IN_ONE_MILLISEC: u64 = 1_000_000; + +// Euclid's two-thousand-year-old algorithm for finding the greatest common divisor. +fn gcd(x: u64, y: u64) -> u64 { + let mut x = x; + let mut y = y; + while y != 0 { + let t = y; + y = x % y; + x = t; + } + x +} + +/// Enum describing the outcomes of a `reduce()` call on a `TokenBucket`. +#[derive(Clone, Debug, PartialEq)] +pub enum BucketReduction { + /// No enough tokens + Failure, + /// Part of the available tokens have been consumed. + Success, + /// A number of tokens `inner` times larger than the bucket size have been consumed. + OverConsumption(f64), +} + +/// TokenBucket provides a lower level interface to rate limiting with a +/// configurable capacity, refill-rate and initial burst. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct TokenBucket { + // Bucket defining traits. + size: u64, + // Initial burst size. + initial_one_time_burst: u64, + // Complete refill time in milliseconds. + refill_time: u64, + + // Internal state descriptors. + + // Number of free initial tokens, that can be consumed at no cost. + one_time_burst: u64, + // Current token budget. + budget: u64, + // Last time this token bucket saw activity. + last_update: Instant, + + // Fields used for pre-processing optimizations. 
+ processed_capacity: u64, + processed_refill_time: u64, +} + +impl TokenBucket { + /// Creates a `TokenBucket` wrapped in an `Option`. + /// + /// TokenBucket created is of `size` total capacity and takes `complete_refill_time_ms` + /// milliseconds to go from zero tokens to total capacity. The `one_time_burst` is initial + /// extra credit on top of total capacity, that does not replenish and which can be used + /// for an initial burst of data. + /// + /// If the `size` or the `complete refill time` are zero, then `None` is returned. + pub fn new(size: u64, one_time_burst: u64, complete_refill_time_ms: u64) -> Self { + // If either token bucket capacity or refill time is 0, disable limiting. + debug_assert!(size != 0 && complete_refill_time_ms != 0); + + // Formula for computing current refill amount: + // refill_token_count = (delta_time * size) / (complete_refill_time_ms * 1_000_000) + // In order to avoid overflows, simplify the fractions by computing greatest common divisor. + + let complete_refill_time_ns = complete_refill_time_ms * NANOSEC_IN_ONE_MILLISEC; + // Get the greatest common factor between `size` and `complete_refill_time_ns`. + let common_factor = gcd(size, complete_refill_time_ns); + // The division will be exact since `common_factor` is a factor of `size`. + let processed_capacity: u64 = size / common_factor; + // The division will be exact since `common_factor` is a factor of `complete_refill_time_ns`. + let processed_refill_time: u64 = complete_refill_time_ns / common_factor; + + TokenBucket { + size, + one_time_burst, + initial_one_time_burst: one_time_burst, + refill_time: complete_refill_time_ms, + // Start off full. + budget: size, + // Last updated is now. + last_update: Instant::now(), + processed_capacity, + processed_refill_time, + } + } + + // Replenishes token bucket based on elapsed time. Should only be called internally by `Self`. + fn auto_replenish(&mut self) { + // Compute time passed since last refill/update. 
+ let time_delta = self.last_update.elapsed().as_nanos() as u64; + self.last_update = Instant::now(); + + // At each 'time_delta' nanoseconds the bucket should refill with: + // refill_amount = (time_delta * size) / (complete_refill_time_ms * 1_000_000) + // `processed_capacity` and `processed_refill_time` are the result of simplifying above + // fraction formula with their greatest-common-factor. + let tokens = (time_delta * self.processed_capacity) / self.processed_refill_time; + self.budget = std::cmp::min(self.budget + tokens, self.size); + } + + /// Attempts to consume `tokens` from the bucket and returns whether the action succeeded. + pub fn reduce(&mut self, mut tokens: u64) -> BucketReduction { + // First things first: consume the one-time-burst budget. + if self.one_time_burst > 0 { + // We still have burst budget for *all* tokens requests. + if self.one_time_burst >= tokens { + self.one_time_burst -= tokens; + self.last_update = Instant::now(); + // No need to continue to the refill process, we still have burst budget to consume from. + return BucketReduction::Success; + } else { + // We still have burst budget for *some* of the tokens requests. + // The tokens left unfulfilled will be consumed from current `self.budget`. + tokens -= self.one_time_burst; + self.one_time_burst = 0; + } + } + + if tokens > self.budget { + // Hit the bucket bottom, let's auto-replenish and try again. + self.auto_replenish(); + + // This operation requests a bandwidth higher than the bucket size + if tokens > self.size { + error!( + "Consumed {} tokens from bucket of size {}", + tokens, self.size + ); + // Empty the bucket and report an overconsumption of + // (remaining tokens / size) times larger than the bucket size + tokens -= self.budget; + self.budget = 0; + return BucketReduction::OverConsumption(tokens as f64 / self.size as f64); + } + + if tokens > self.budget { + // Still not enough tokens, consume() fails, return false. 
+ return BucketReduction::Failure; + } + } + + self.budget -= tokens; + BucketReduction::Success + } + + /// "Manually" adds tokens to bucket. + pub fn force_replenish(&mut self, tokens: u64) { + // This means we are still during the burst interval. + // Of course there is a very small chance that the last reduce() also used up burst + // budget which should now be replenished, but for performance and code-complexity + // reasons we're just gonna let that slide since it's practically inconsequential. + if self.one_time_burst > 0 { + self.one_time_burst += tokens; + return; + } + self.budget = std::cmp::min(self.budget + tokens, self.size); + } + + /// Returns the capacity of the token bucket. + pub fn capacity(&self) -> u64 { + self.size + } + + /// Returns the remaining one time burst budget. + pub fn one_time_burst(&self) -> u64 { + self.one_time_burst + } + + /// Returns the time in milliseconds required to to completely fill the bucket. + pub fn refill_time_ms(&self) -> u64 { + self.refill_time + } + + /// Returns the current budget (one time burst allowance notwithstanding). + pub fn budget(&self) -> u64 { + self.budget + } + + /// Returns the initially configured one time burst budget. + pub fn initial_one_time_burst(&self) -> u64 { + self.initial_one_time_burst + } +} + +/// Enum that describes the type of token used. +pub enum TokenType { + /// Token type used for bandwidth limiting. + Bytes, + /// Token type used for operations/second limiting. + Ops, +} + +/// Enum that describes the type of token bucket update. +#[derive(Clone, Debug)] +pub enum BucketUpdate { + /// No Update - same as before. + None, + /// Rate Limiting is disabled on this bucket. + Disabled, + /// Rate Limiting enabled with updated bucket. + Update(TokenBucket), +} + +/// Rate Limiter that works on both bandwidth and ops/s limiting. +/// +/// Bandwidth (bytes/s) and ops/s limiting can be used at the same time or individually. 
+/// +/// Implementation uses a single timer through TimerFd to refresh either or +/// both token buckets. +/// +/// Its internal buckets are 'passively' replenished as they're being used (as +/// part of `consume()` operations). +/// A timer is enabled and used to 'actively' replenish the token buckets when +/// limiting is in effect and `consume()` operations are disabled. +/// +/// RateLimiters will generate events on the FDs provided by their `AsRawFd` trait +/// implementation. These events are meant to be consumed by the user of this struct. +/// On each such event, the user must call the `event_handler()` method. +pub struct RateLimiter { + /// Bandwidth limit in bytes/s + bandwidth: Option, + /// Operate limit in ops/s + ops: Option, + /// Timer handle + timer_fd: TimerFd, + /// Internal flag that quickly determines timer state. + timer_active: bool, +} + +impl PartialEq for RateLimiter { + fn eq(&self, other: &RateLimiter) -> bool { + self.bandwidth == other.bandwidth && self.ops == other.ops + } +} + +impl fmt::Debug for RateLimiter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "RateLimiter {{ bandwidth: {:?}, ops: {:?} }}", + self.bandwidth, self.ops + ) + } +} + +impl RateLimiter { + /// This function creates a `TokenBucket` wrapped in an `Option` with a given total capacity, + /// one time burst, and complete refill time (in miliseconds). If the total capacity or the + /// complete refill time are zero, then `None` is returned. + pub fn make_bucket( + total_capacity: u64, + one_time_burst: u64, + complete_refill_time_ms: u64, + ) -> Option { + // If either token bucket capacity or refill time is 0, disable limiting. + if total_capacity != 0 && complete_refill_time_ms != 0 { + Some(TokenBucket::new( + total_capacity, + one_time_burst, + complete_refill_time_ms, + )) + } else { + None + } + } + /// Creates a new Rate Limiter that can limit on both bytes/s and ops/s. 
+ /// + /// # Arguments + /// + /// * `bytes_total_capacity` - the total capacity of the `TokenType::Bytes` token bucket. + /// * `bytes_one_time_burst` - initial extra credit on top of `bytes_total_capacity`, + /// that does not replenish and which can be used for an initial burst of data. + /// * `bytes_complete_refill_time_ms` - number of milliseconds for the `TokenType::Bytes` + /// token bucket to go from zero Bytes to `bytes_total_capacity` Bytes. + /// * `ops_total_capacity` - the total capacity of the `TokenType::Ops` token bucket. + /// * `ops_one_time_burst` - initial extra credit on top of `ops_total_capacity`, + /// that does not replenish and which can be used for an initial burst of data. + /// * `ops_complete_refill_time_ms` - number of milliseconds for the `TokenType::Ops` token + /// bucket to go from zero Ops to `ops_total_capacity` Ops. + /// + /// If either bytes/ops *size* or *refill_time* are **zero**, the limiter + /// is **disabled** for that respective token type. + /// + /// # Errors + /// + /// If the timerfd creation fails, an error is returned. + pub fn new( + bytes_total_capacity: u64, + bytes_one_time_burst: u64, + bytes_complete_refill_time_ms: u64, + ops_total_capacity: u64, + ops_one_time_burst: u64, + ops_complete_refill_time_ms: u64, + ) -> io::Result { + let bytes_token_bucket = Self::make_bucket( + bytes_total_capacity, + bytes_one_time_burst, + bytes_complete_refill_time_ms, + ); + + let ops_token_bucket = Self::make_bucket( + ops_total_capacity, + ops_one_time_burst, + ops_complete_refill_time_ms, + ); + + // We'll need a timer_fd, even if our current config effectively disables rate limiting, + // because `Self::update_buckets()` might re-enable it later, and we might be + // seccomp-blocked from creating the timer_fd at that time. 
+ let timer_fd = TimerFd::new_custom(ClockId::Monotonic, true, true)?; + + Ok(RateLimiter { + bandwidth: bytes_token_bucket, + ops: ops_token_bucket, + timer_fd, + timer_active: false, + }) + } + + // Arm the timer of the rate limiter with the provided `TimerState`. + fn activate_timer(&mut self, timer_state: TimerState) { + // Register the timer; don't care about its previous state + self.timer_fd.set_state(timer_state, SetTimeFlags::Default); + self.timer_active = true; + } + + /// Attempts to consume tokens and returns whether that is possible. + /// + /// If rate limiting is disabled on provided `token_type`, this function will always succeed. + pub fn consume(&mut self, tokens: u64, token_type: TokenType) -> bool { + // If the timer is active, we can't consume tokens from any bucket and the function fails. + if self.timer_active { + return false; + } + + // Identify the required token bucket. + let token_bucket = match token_type { + TokenType::Bytes => self.bandwidth.as_mut(), + TokenType::Ops => self.ops.as_mut(), + }; + // Try to consume from the token bucket. + if let Some(bucket) = token_bucket { + let refill_time = bucket.refill_time_ms(); + match bucket.reduce(tokens) { + // When we report budget is over, there will be no further calls here, + // register a timer to replenish the bucket and resume processing; + // make sure there is only one running timer for this limiter. + BucketReduction::Failure => { + if !self.timer_active { + self.activate_timer(TIMER_REFILL_STATE); + } + false + } + // The operation succeeded and further calls can be made. + BucketReduction::Success => true, + // The operation succeeded as the tokens have been consumed + // but the timer still needs to be armed. 
+ BucketReduction::OverConsumption(ratio) => { + // The operation "borrowed" a number of tokens `ratio` times + // greater than the size of the bucket, and since it takes + // `refill_time` milliseconds to fill an empty bucket, in + // order to enforce the bandwidth limit we need to prevent + // further calls to the rate limiter for + // `ratio * refill_time` milliseconds. + self.activate_timer(TimerState::Oneshot(Duration::from_millis( + (ratio * refill_time as f64) as u64, + ))); + true + } + } + } else { + // If bucket is not present rate limiting is disabled on token type, + // consume() will always succeed. + true + } + } + + /// Adds tokens of `token_type` to their respective bucket. + /// + /// Can be used to *manually* add tokens to a bucket. Useful for reverting a + /// `consume()` if needed. + pub fn manual_replenish(&mut self, tokens: u64, token_type: TokenType) { + // Identify the required token bucket. + let token_bucket = match token_type { + TokenType::Bytes => self.bandwidth.as_mut(), + TokenType::Ops => self.ops.as_mut(), + }; + // Add tokens to the token bucket. + if let Some(bucket) = token_bucket { + bucket.force_replenish(tokens); + } + } + + /// Returns whether this rate limiter is blocked. + /// + /// The limiter 'blocks' when a `consume()` operation fails because there was not enough + /// budget for it. + /// An event will be generated on the exported FD when the limiter 'unblocks'. + pub fn is_blocked(&self) -> bool { + self.timer_active + } + + /// This function needs to be called every time there is an event on the + /// FD provided by this object's `AsRawFd` trait implementation. + /// + /// # Errors + /// + /// If the rate limiter is disabled or is not blocked, an error is returned. 
+ pub fn event_handler(&mut self) -> Result<(), Error> { + match self.timer_fd.read() { + 0 => Err(Error::SpuriousRateLimiterEvent( + "Rate limiter event handler called without a present timer", + )), + _ => { + self.timer_active = false; + Ok(()) + } + } + } + + /// Updates the parameters of the token buckets associated with this RateLimiter. + // TODO: Please note that, right now, the buckets become full after being updated. + pub fn update_buckets(&mut self, bytes: BucketUpdate, ops: BucketUpdate) { + match bytes { + BucketUpdate::Disabled => self.bandwidth = None, + BucketUpdate::Update(tb) => self.bandwidth = Some(tb), + BucketUpdate::None => (), + }; + match ops { + BucketUpdate::Disabled => self.ops = None, + BucketUpdate::Update(tb) => self.ops = Some(tb), + BucketUpdate::None => (), + }; + } + /// Returns an immutable view of the inner bandwidth token bucket. + pub fn bandwidth(&self) -> Option<&TokenBucket> { + self.bandwidth.as_ref() + } + + /// Returns an immutable view of the inner ops token bucket. + pub fn ops(&self) -> Option<&TokenBucket> { + self.ops.as_ref() + } +} + +impl AsRawFd for RateLimiter { + /// Provides a FD which needs to be monitored for POLLIN events. + /// + /// This object's `event_handler()` method must be called on such events. + /// + /// Will return a negative value if rate limiting is disabled on both + /// token types. + fn as_raw_fd(&self) -> RawFd { + self.timer_fd.as_raw_fd() + } +} + +impl Default for RateLimiter { + /// Default RateLimiter is a no-op limiter with infinite budget. + fn default() -> Self { + // Safe to unwrap since this will not attempt to create timer_fd. + RateLimiter::new(0, 0, 0, 0, 0, 0).expect("Failed to build default RateLimiter") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::thread; + use std::time::Duration; + const TEST_REFILL_TIMER_INTERVAL_MS: u64 = 100; + impl TokenBucket { + // Resets the token bucket: budget set to max capacity and last-updated set to now. 
+ fn reset(&mut self) { + self.budget = self.size; + self.last_update = Instant::now(); + } + + fn get_last_update(&self) -> &Instant { + &self.last_update + } + + fn get_processed_capacity(&self) -> u64 { + self.processed_capacity + } + + fn get_processed_refill_time(&self) -> u64 { + self.processed_refill_time + } + + // After a restore, we cannot be certain that the last_update field has the same value. + pub fn partial_eq(&self, other: &TokenBucket) -> bool { + (other.capacity() == self.capacity()) + && (other.one_time_burst() == self.one_time_burst()) + && (other.refill_time_ms() == self.refill_time_ms()) + && (other.budget() == self.budget()) + } + } + + impl RateLimiter { + fn get_token_bucket(&self, token_type: TokenType) -> Option<&TokenBucket> { + match token_type { + TokenType::Bytes => self.bandwidth.as_ref(), + TokenType::Ops => self.ops.as_ref(), + } + } + } + + #[test] + fn test_token_bucket_create() { + let before = Instant::now(); + let tb = TokenBucket::new(1000, 0, 1000); + assert_eq!(tb.capacity(), 1000); + assert_eq!(tb.budget(), 1000); + assert_eq!(tb.initial_one_time_burst(), 0); + assert!(*tb.get_last_update() >= before); + let after = Instant::now(); + assert!(*tb.get_last_update() <= after); + assert_eq!(tb.get_processed_capacity(), 1); + assert_eq!(tb.get_processed_refill_time(), 1_000_000); + } + + #[test] + fn test_token_bucket_preprocess() { + let tb = TokenBucket::new(1000, 0, 1000); + assert_eq!(tb.get_processed_capacity(), 1); + assert_eq!(tb.get_processed_refill_time(), NANOSEC_IN_ONE_MILLISEC); + + let thousand = 1000; + let tb = TokenBucket::new(3 * 7 * 11 * 19 * thousand, 0, 7 * 11 * 13 * 17); + assert_eq!(tb.get_processed_capacity(), 3 * 19); + assert_eq!( + tb.get_processed_refill_time(), + 13 * 17 * (NANOSEC_IN_ONE_MILLISEC / thousand) + ); + } + + #[test] + fn test_token_bucket_reduce() { + // token bucket with capacity 1000 and refill time of 1000 milliseconds + // allowing rate of 1 token/ms. 
+ let capacity = 1000; + let refill_ms = 1000; + let mut tb = TokenBucket::new(capacity, 0, refill_ms as u64); + + assert_eq!(tb.reduce(123), BucketReduction::Success); + assert_eq!(tb.budget(), capacity - 123); + assert_eq!(tb.reduce(capacity), BucketReduction::Failure); + + // Since the CI machine might be slow, we should sleep less milliseconds here than desired 123 ms to avoid errors caused by CI machines. + thread::sleep(Duration::from_millis(80)); + assert_eq!(tb.reduce(1), BucketReduction::Success); + assert_eq!(tb.reduce(100), BucketReduction::Success); + assert_eq!(tb.reduce(capacity), BucketReduction::Failure); + + // token bucket with capacity 1000 and refill time of 1000 milliseconds + let mut tb = TokenBucket::new(1000, 1100, 1000); + // safely assuming the thread can run these 3 commands in less than 500ms + assert_eq!(tb.reduce(1000), BucketReduction::Success); + assert_eq!(tb.one_time_burst(), 100); + assert_eq!(tb.reduce(500), BucketReduction::Success); + assert_eq!(tb.one_time_burst(), 0); + assert_eq!(tb.reduce(500), BucketReduction::Success); + assert_eq!(tb.reduce(500), BucketReduction::Failure); + thread::sleep(Duration::from_millis(500)); + assert_eq!(tb.reduce(500), BucketReduction::Success); + thread::sleep(Duration::from_millis(1000)); + assert_eq!(tb.reduce(2500), BucketReduction::OverConsumption(1.5)); + + let before = Instant::now(); + tb.reset(); + assert_eq!(tb.capacity(), 1000); + assert_eq!(tb.budget(), 1000); + assert!(*tb.get_last_update() >= before); + let after = Instant::now(); + assert!(*tb.get_last_update() <= after); + } + + #[test] + fn test_rate_limiter_default() { + let mut l = RateLimiter::default(); + + // limiter should not be blocked + assert!(!l.is_blocked()); + // limiter should be disabled so consume(whatever) should work + assert!(l.consume(u64::max_value(), TokenType::Ops)); + assert!(l.consume(u64::max_value(), TokenType::Bytes)); + // calling the handler without there having been an event should error + 
assert!(l.event_handler().is_err()); + assert_eq!( + format!("{:?}", l.event_handler().err().unwrap()), + "SpuriousRateLimiterEvent(\ + \"Rate limiter event handler called without a present timer\")" + ); + } + + #[test] + fn test_rate_limiter_new() { + let l = RateLimiter::new(1000, 1001, 1002, 1003, 1004, 1005).unwrap(); + + let bw = l.bandwidth.unwrap(); + assert_eq!(bw.capacity(), 1000); + assert_eq!(bw.one_time_burst(), 1001); + assert_eq!(bw.initial_one_time_burst(), 1001); + assert_eq!(bw.refill_time_ms(), 1002); + assert_eq!(bw.budget(), 1000); + + let ops = l.ops.unwrap(); + assert_eq!(ops.capacity(), 1003); + assert_eq!(ops.one_time_burst(), 1004); + assert_eq!(ops.initial_one_time_burst(), 1004); + assert_eq!(ops.refill_time_ms(), 1005); + assert_eq!(ops.budget(), 1003); + } + + #[test] + fn test_rate_limiter_manual_replenish() { + // rate limiter with limit of 1000 bytes/s and 1000 ops/s + let mut l = RateLimiter::new(1000, 0, 1000, 1000, 0, 1000).unwrap(); + + // consume 123 bytes + assert!(l.consume(123, TokenType::Bytes)); + l.manual_replenish(23, TokenType::Bytes); + { + let bytes_tb = l.get_token_bucket(TokenType::Bytes).unwrap(); + assert_eq!(bytes_tb.budget(), 900); + } + // consume 123 ops + assert!(l.consume(123, TokenType::Ops)); + l.manual_replenish(23, TokenType::Ops); + { + let bytes_tb = l.get_token_bucket(TokenType::Ops).unwrap(); + assert_eq!(bytes_tb.budget(), 900); + } + } + + #[test] + fn test_rate_limiter_bandwidth() { + // rate limiter with limit of 1000 bytes/s + let mut l = RateLimiter::new(1000, 0, 1000, 0, 0, 0).unwrap(); + + // limiter should not be blocked + assert!(!l.is_blocked()); + // raw FD for this disabled should be valid + assert!(l.as_raw_fd() > 0); + + // ops/s limiter should be disabled so consume(whatever) should work + assert!(l.consume(u64::max_value(), TokenType::Ops)); + + // do full 1000 bytes + assert!(l.consume(1000, TokenType::Bytes)); + // try and fail on another 100 + assert!(!l.consume(100, 
TokenType::Bytes)); + // since consume failed, limiter should be blocked now + assert!(l.is_blocked()); + // wait half the timer period + thread::sleep(Duration::from_millis(TEST_REFILL_TIMER_INTERVAL_MS / 2)); + // limiter should still be blocked + assert!(l.is_blocked()); + // wait the other half of the timer period + thread::sleep(Duration::from_millis(TEST_REFILL_TIMER_INTERVAL_MS / 2)); + // the timer_fd should have an event on it by now + assert!(l.event_handler().is_ok()); + // limiter should now be unblocked + assert!(!l.is_blocked()); + // try and succeed on another 100 bytes this time + assert!(l.consume(100, TokenType::Bytes)); + } + + #[test] + fn test_rate_limiter_ops() { + // rate limiter with limit of 1000 ops/s + let mut l = RateLimiter::new(0, 0, 0, 1000, 0, 1000).unwrap(); + + // limiter should not be blocked + assert!(!l.is_blocked()); + // raw FD for this disabled should be valid + assert!(l.as_raw_fd() > 0); + + // bytes/s limiter should be disabled so consume(whatever) should work + assert!(l.consume(u64::max_value(), TokenType::Bytes)); + + // do full 1000 ops + assert!(l.consume(1000, TokenType::Ops)); + // try and fail on another 100 + assert!(!l.consume(100, TokenType::Ops)); + // since consume failed, limiter should be blocked now + assert!(l.is_blocked()); + // wait half the timer period + thread::sleep(Duration::from_millis(TEST_REFILL_TIMER_INTERVAL_MS / 2)); + // limiter should still be blocked + assert!(l.is_blocked()); + // wait the other half of the timer period + thread::sleep(Duration::from_millis(TEST_REFILL_TIMER_INTERVAL_MS / 2)); + // the timer_fd should have an event on it by now + assert!(l.event_handler().is_ok()); + // limiter should now be unblocked + assert!(!l.is_blocked()); + // try and succeed on another 100 ops this time + assert!(l.consume(100, TokenType::Ops)); + } + + #[test] + fn test_rate_limiter_full() { + // rate limiter with limit of 1000 bytes/s and 1000 ops/s + let mut l = RateLimiter::new(1000, 0, 1000, 
1000, 0, 1000).unwrap(); + + // limiter should not be blocked + assert!(!l.is_blocked()); + // raw FD for this disabled should be valid + assert!(l.as_raw_fd() > 0); + + // do full 1000 bytes + assert!(l.consume(1000, TokenType::Ops)); + // do full 1000 bytes + assert!(l.consume(1000, TokenType::Bytes)); + // try and fail on another 100 ops + assert!(!l.consume(100, TokenType::Ops)); + // try and fail on another 100 bytes + assert!(!l.consume(100, TokenType::Bytes)); + // since consume failed, limiter should be blocked now + assert!(l.is_blocked()); + // wait half the timer period + thread::sleep(Duration::from_millis(TEST_REFILL_TIMER_INTERVAL_MS / 2)); + // limiter should still be blocked + assert!(l.is_blocked()); + // wait the other half of the timer period + thread::sleep(Duration::from_millis(TEST_REFILL_TIMER_INTERVAL_MS / 2)); + // the timer_fd should have an event on it by now + assert!(l.event_handler().is_ok()); + // limiter should now be unblocked + assert!(!l.is_blocked()); + // try and succeed on another 100 ops this time + assert!(l.consume(100, TokenType::Ops)); + // try and succeed on another 100 bytes this time + assert!(l.consume(100, TokenType::Bytes)); + } + + #[test] + fn test_rate_limiter_overconsumption() { + // initialize the rate limiter + let mut l = RateLimiter::new(1000, 0, 1000, 1000, 0, 1000).unwrap(); + // try to consume 2.5x the bucket size + // we are "borrowing" 1.5x the bucket size in tokens since + // the bucket is full + assert!(l.consume(2500, TokenType::Bytes)); + + // check that even after a whole second passes, the rate limiter + // is still blocked + thread::sleep(Duration::from_millis(1000)); + assert!(l.event_handler().is_err()); + assert!(l.is_blocked()); + + // after 1.5x the replenish time has passed, the rate limiter + // is available again + thread::sleep(Duration::from_millis(500)); + assert!(l.event_handler().is_ok()); + assert!(!l.is_blocked()); + + // reset the rate limiter + let mut l = RateLimiter::new(1000, 
0, 1000, 1000, 0, 1000).unwrap(); + // try to consume 1.5x the bucket size + // we are "borrowing" 1.5x the bucket size in tokens since + // the bucket is full, should arm the timer to 0.5x replenish + // time, which is 500 ms + assert!(l.consume(1500, TokenType::Bytes)); + + // check that after more than the minimum refill time, + // the rate limiter is still blocked + thread::sleep(Duration::from_millis(200)); + assert!(l.event_handler().is_err()); + assert!(l.is_blocked()); + + // try to consume some tokens, which should fail as the timer + // is still active + assert!(!l.consume(100, TokenType::Bytes)); + assert!(l.event_handler().is_err()); + assert!(l.is_blocked()); + + // check that after the minimum refill time, the timer was not + // overwritten and the rate limiter is still blocked from the + // borrowing we performed earlier + thread::sleep(Duration::from_millis(90)); + assert!(l.event_handler().is_err()); + assert!(l.is_blocked()); + assert!(!l.consume(100, TokenType::Bytes)); + + // after waiting out the full duration, rate limiter should be + // availale again + thread::sleep(Duration::from_millis(210)); + assert!(l.event_handler().is_ok()); + assert!(!l.is_blocked()); + assert!(l.consume(100, TokenType::Bytes)); + } + + #[test] + fn test_update_buckets() { + let mut x = RateLimiter::new(1000, 2000, 1000, 10, 20, 1000).unwrap(); + + let initial_bw = x.bandwidth.clone(); + let initial_ops = x.ops.clone(); + + x.update_buckets(BucketUpdate::None, BucketUpdate::None); + assert_eq!(x.bandwidth, initial_bw); + assert_eq!(x.ops, initial_ops); + + let new_bw = RateLimiter::make_bucket(123, 0, 57).unwrap(); + let new_ops = RateLimiter::make_bucket(321, 12346, 89).unwrap(); + x.update_buckets( + BucketUpdate::Update(new_bw.clone()), + BucketUpdate::Update(new_ops.clone()), + ); + + // We have manually adjust the last_update field, because it changes when update_buckets() + // constructs new buckets (and thus gets a different value for last_update). 
We do this so + // it makes sense to test the following assertions. + x.bandwidth.as_mut().unwrap().last_update = new_bw.last_update; + x.ops.as_mut().unwrap().last_update = new_ops.last_update; + + assert_eq!(x.bandwidth, Some(new_bw)); + assert_eq!(x.ops, Some(new_ops)); + + x.update_buckets(BucketUpdate::Disabled, BucketUpdate::Disabled); + assert_eq!(x.bandwidth, None); + assert_eq!(x.ops, None); + } + + #[test] + fn test_rate_limiter_debug() { + let l = RateLimiter::new(1, 2, 3, 4, 5, 6).unwrap(); + assert_eq!( + format!("{l:?}"), + format!( + "RateLimiter {{ bandwidth: {:?}, ops: {:?} }}", + l.bandwidth(), + l.ops() + ), + ); + } +} diff --git a/src/dragonball/src/dbs_utils/src/time.rs b/src/dragonball/src/dbs_utils/src/time.rs new file mode 100644 index 000000000..899007f9d --- /dev/null +++ b/src/dragonball/src/dbs_utils/src/time.rs @@ -0,0 +1,258 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::fmt; + +/// Constant to convert seconds to nanoseconds. +pub const NANOS_PER_SECOND: u64 = 1_000_000_000; +/// Constant to convert milliseconds to nanoseconds. +pub const NANOS_PER_MILLISECOND: u64 = 1_000_000; + +/// Wrapper over `libc::clockid_t` to specify Linux Kernel clock source. +pub enum ClockType { + /// Equivalent to `libc::CLOCK_MONOTONIC`. + Monotonic, + /// Equivalent to `libc::CLOCK_REALTIME`. + Real, + /// Equivalent to `libc::CLOCK_PROCESS_CPUTIME_ID`. + ProcessCpu, + /// Equivalent to `libc::CLOCK_THREAD_CPUTIME_ID`. + ThreadCpu, +} + +impl From for libc::clockid_t { + fn from(clock_type: ClockType) -> Self { + match clock_type { + ClockType::Monotonic => libc::CLOCK_MONOTONIC, + ClockType::Real => libc::CLOCK_REALTIME, + ClockType::ProcessCpu => libc::CLOCK_PROCESS_CPUTIME_ID, + ClockType::ThreadCpu => libc::CLOCK_THREAD_CPUTIME_ID, + } + } +} + +/// Structure representing the date in local time with nanosecond precision. 
+pub struct LocalTime { + /// Seconds in current minute. + sec: i32, + /// Minutes in current hour. + min: i32, + /// Hours in current day, 24H format. + hour: i32, + /// Days in current month. + mday: i32, + /// Months in current year. + mon: i32, + /// Years passed since 1900 BC. + year: i32, + /// Nanoseconds in current second. + nsec: i64, +} + +impl LocalTime { + /// Returns the [LocalTime](struct.LocalTime.html) structure for the calling moment. + pub fn now() -> LocalTime { + let mut timespec = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + let mut tm: libc::tm = libc::tm { + tm_sec: 0, + tm_min: 0, + tm_hour: 0, + tm_mday: 0, + tm_mon: 0, + tm_year: 0, + tm_wday: 0, + tm_yday: 0, + tm_isdst: 0, + tm_gmtoff: 0, + tm_zone: std::ptr::null(), + }; + + // Safe because the parameters are valid. + unsafe { + libc::clock_gettime(libc::CLOCK_REALTIME, &mut timespec); + libc::localtime_r(×pec.tv_sec, &mut tm); + } + + LocalTime { + sec: tm.tm_sec, + min: tm.tm_min, + hour: tm.tm_hour, + mday: tm.tm_mday, + mon: tm.tm_mon, + year: tm.tm_year, + nsec: timespec.tv_nsec, + } + } +} + +impl fmt::Display for LocalTime { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "{}-{:02}-{:02}T{:02}:{:02}:{:02}.{:09}", + self.year + 1900, + self.mon + 1, + self.mday, + self.hour, + self.min, + self.sec, + self.nsec + ) + } +} + +/// Holds a micro-second resolution timestamp with both the real time and cpu time. +#[derive(Clone)] +pub struct TimestampUs { + /// Real time in microseconds. + pub time_us: u64, + /// Cpu time in microseconds. + pub cputime_us: u64, +} + +impl Default for TimestampUs { + fn default() -> TimestampUs { + TimestampUs { + time_us: get_time_us(ClockType::Monotonic), + cputime_us: get_time_us(ClockType::ProcessCpu), + } + } +} + +/// Get process CPU time in us. +pub fn now_cputime_us() -> u64 { + get_time_us(ClockType::ProcessCpu) +} + +/// Returns a timestamp in nanoseconds from a monotonic clock. 
+/// +/// Uses `_rdstc` on `x86_64` and [`get_time`](fn.get_time.html) on other architectures. +pub fn timestamp_cycles() -> u64 { + #[cfg(target_arch = "x86_64")] + // Safe because there's nothing that can go wrong with this call. + unsafe { + std::arch::x86_64::_rdtsc() + } + #[cfg(not(target_arch = "x86_64"))] + { + get_time_ns(ClockType::Monotonic) + } +} + +/// Returns a timestamp in nanoseconds based on the provided clock type. +/// +/// # Arguments +/// +/// * `clock_type` - Identifier of the Linux Kernel clock on which to act. +pub fn get_time_ns(clock_type: ClockType) -> u64 { + let mut time_struct = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + // Safe because the parameters are valid. + unsafe { libc::clock_gettime(clock_type.into(), &mut time_struct) }; + seconds_to_nanoseconds(time_struct.tv_sec).expect("Time conversion overflow") as u64 + + (time_struct.tv_nsec as u64) +} + +/// Returns a timestamp in microseconds based on the provided clock type. +/// +/// # Arguments +/// +/// * `clock_type` - Identifier of the Linux Kernel clock on which to act. +pub fn get_time_us(clock_type: ClockType) -> u64 { + get_time_ns(clock_type) / 1000 +} + +/// Returns a timestamp in milliseconds based on the provided clock type. +/// +/// # Arguments +/// +/// * `clock_type` - Identifier of the Linux Kernel clock on which to act. +pub fn get_time_ms(clock_type: ClockType) -> u64 { + get_time_ns(clock_type) / NANOS_PER_MILLISECOND +} + +/// Converts a timestamp in seconds to an equivalent one in nanoseconds. +/// Returns `None` if the conversion overflows. +/// +/// # Arguments +/// +/// * `value` - Timestamp in seconds. 
+pub fn seconds_to_nanoseconds(value: i64) -> Option { + value.checked_mul(NANOS_PER_SECOND as i64) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_time() { + for _ in 0..1000 { + assert!(get_time_ns(ClockType::Monotonic) <= get_time_ns(ClockType::Monotonic)); + } + + for _ in 0..1000 { + assert!(get_time_ns(ClockType::ProcessCpu) <= get_time_ns(ClockType::ProcessCpu)); + } + + for _ in 0..1000 { + assert!(get_time_ns(ClockType::ThreadCpu) <= get_time_ns(ClockType::ThreadCpu)); + } + + assert_ne!(get_time_ns(ClockType::Real), 0); + assert_ne!(get_time_us(ClockType::Real), 0); + assert!(get_time_ns(ClockType::Real) / 1000 <= get_time_us(ClockType::Real)); + assert!( + get_time_ns(ClockType::Real) / NANOS_PER_MILLISECOND <= get_time_ms(ClockType::Real) + ); + } + + #[test] + fn test_local_time_display() { + let local_time = LocalTime { + sec: 30, + min: 15, + hour: 10, + mday: 4, + mon: 6, + year: 119, + nsec: 123_456_789, + }; + assert_eq!( + String::from("2019-07-04T10:15:30.123456789"), + local_time.to_string() + ); + + let local_time = LocalTime { + sec: 5, + min: 5, + hour: 5, + mday: 23, + mon: 7, + year: 44, + nsec: 123, + }; + assert_eq!( + String::from("1944-08-23T05:05:05.000000123"), + local_time.to_string() + ); + + let local_time = LocalTime::now(); + assert!(local_time.mon >= 0 && local_time.mon <= 11); + } + + #[test] + fn test_seconds_to_nanoseconds() { + assert_eq!( + seconds_to_nanoseconds(100).unwrap() as u64, + 100 * NANOS_PER_SECOND + ); + + assert!(seconds_to_nanoseconds(9_223_372_037).is_none()); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/Cargo.toml b/src/dragonball/src/dbs_virtio_devices/Cargo.toml new file mode 100644 index 000000000..c26b5ffd2 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/Cargo.toml @@ -0,0 +1,52 @@ +[package] +name = "dbs-virtio-devices" +version = "0.3.1" +authors = ["Alibaba Dragonball Team"] +license = "Apache-2.0 AND BSD-3-Clause" +edition = "2018" +description = 
"Virtio device backend driver framework and device drivers" +homepage = "https://github.com/openanolis/dragonball-sandbox" +repository = "https://github.com/openanolis/dragonball-sandbox/tree/main/crates/dbs-virtio-devices" +keywords = ["dragonball", "secure-sandbox", "devices", "virtio"] +readme = "README.md" + +[dependencies] +byteorder = "1.4.3" +caps = "0.5.3" +dbs-device = { path = "../dbs_device" } +dbs-interrupt = { path = "../dbs_interrupt", features = ["kvm-legacy-irq", "kvm-msi-irq"] } +dbs-utils = { path = "../dbs_utils" } +epoll = ">=4.3.1, <4.3.2" +io-uring = "0.5.2" +fuse-backend-rs = { version = "0.10.0", optional = true } +kvm-bindings = "0.6.0" +kvm-ioctls = "0.12.0" +libc = "0.2.119" +log = "0.4.14" +nix = "0.24.3" +nydus-api = "0.3.0" +nydus-rafs = "0.3.1" +nydus-storage = "0.6.3" +rlimit = "0.7.0" +serde = "1.0.27" +serde_json = "1.0.9" +thiserror = "1" +threadpool = "1" +virtio-bindings = "0.1.0" +virtio-queue = "0.6.0" +vmm-sys-util = "0.11.0" +vm-memory = { version = "0.9.0", features = [ "backend-mmap" ] } +sendfd = "0.4.3" + +[dev-dependencies] +vm-memory = { version = "0.9.0", features = [ "backend-mmap", "backend-atomic" ] } + +[features] +virtio-mmio = [] +virtio-vsock = ["virtio-mmio"] +virtio-net = ["virtio-mmio"] +virtio-blk = ["virtio-mmio"] +virtio-fs = ["virtio-mmio", "fuse-backend-rs/virtiofs", "nydus-rafs/virtio-fs"] +virtio-fs-pro = ["virtio-fs", "nydus-storage/backend-registry", "nydus-storage/backend-oss"] +virtio-mem = ["virtio-mmio"] +virtio-balloon = ["virtio-mmio"] diff --git a/src/dragonball/src/dbs_virtio_devices/LICENSE b/src/dragonball/src/dbs_virtio_devices/LICENSE new file mode 120000 index 000000000..30cff7403 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/src/dragonball/src/dbs_virtio_devices/README.md b/src/dragonball/src/dbs_virtio_devices/README.md new file mode 100644 index 000000000..1cc9f320e --- /dev/null +++ 
b/src/dragonball/src/dbs_virtio_devices/README.md @@ -0,0 +1,11 @@ +# dbs-virtio-devices + +`dbs-virtio-devices` provides emulation for virtio devices. + +## Acknowledgement + +Part of the code is derived from the [Firecracker](https://github.com/firecracker-microvm/firecracker) project. + +## License + +This project is licensed under [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/src/dragonball/src/dbs_virtio_devices/THIRD-PARTY b/src/dragonball/src/dbs_virtio_devices/THIRD-PARTY new file mode 120000 index 000000000..301d0a498 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/THIRD-PARTY @@ -0,0 +1 @@ +../../THIRD-PARTY \ No newline at end of file diff --git a/src/dragonball/src/dbs_virtio_devices/src/balloon.rs b/src/dragonball/src/dbs_virtio_devices/src/balloon.rs new file mode 100644 index 000000000..8ddad0bf7 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/balloon.rs @@ -0,0 +1,1005 @@ +// Copyright (C) 2020 Alibaba Cloud Computing. All rights reserved. +// Copyright (c) 2020 Ant Financial +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#![allow(dead_code)] + +use std::any::Any; +use std::cmp; +use std::convert::TryFrom; +use std::io::{self, Write}; +use std::marker::PhantomData; +use std::mem::size_of; +use std::ops::Deref; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::atomic::AtomicBool; +use std::sync::{Arc, Mutex}; + +use dbs_device::resources::ResourceConstraint; +use dbs_interrupt::{InterruptNotifier, NoopNotifier}; +use dbs_utils::epoll_manager::{ + EpollManager, EventOps, EventSet, Events, MutEventSubscriber, SubscriberId, +}; +use log::{debug, error, info, trace}; +use virtio_bindings::bindings::virtio_blk::VIRTIO_F_VERSION_1; +use virtio_queue::{QueueOwnedT, QueueSync, QueueT}; +use vm_memory::{ + ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryRegion, + GuestRegionMmap, MemoryRegionAddress, +}; + +use crate::device::{VirtioDevice, VirtioDeviceConfig, VirtioDeviceInfo, VirtioQueueConfig}; +use crate::{ + ActivateResult, ConfigError, ConfigResult, DbsGuestAddressSpace, Error, Result, TYPE_BALLOON, +}; + +const BALLOON_DRIVER_NAME: &str = "virtio-balloon"; + +// Supported fields in the configuration space: +const CONFIG_SPACE_SIZE: usize = 16; + +const QUEUE_SIZE: u16 = 128; +const NUM_QUEUES: usize = 2; +const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES]; +const PMD_SHIFT: u64 = 21; +const PMD_SIZE: u64 = 1 << PMD_SHIFT; + +// New descriptors are pending on the virtio queue. +const INFLATE_QUEUE_AVAIL_EVENT: u32 = 0; +// New descriptors are pending on the virtio queue. +const DEFLATE_QUEUE_AVAIL_EVENT: u32 = 1; +// New descriptors are pending on the virtio queue. +const REPORTING_QUEUE_AVAIL_EVENT: u32 = 2; +// The device has been dropped. +const KILL_EVENT: u32 = 3; +// The device should be paused. +const PAUSE_EVENT: u32 = 4; +const BALLOON_EVENTS_COUNT: u32 = 5; + +// Page shift in the host. +const PAGE_SHIFT: u32 = 12; +// Huge Page shift in the host. +const HUGE_PAGE_SHIFT: u32 = 21; + +// Size of a PFN in the balloon interface. 
+const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; +// feature to deflate balloon on OOM +const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: usize = 2; +// feature to enable free page reporting +const VIRTIO_BALLOON_F_REPORTING: usize = 5; + +// The PAGE_REPORTING_CAPACITY of CLH is set to 32. +// This value is got from patch in https://patchwork.kernel.org/patch/11377073/. +// But dragonball reporting capacity is set to 128 in before. +// So I keep 128. +const PAGE_REPORTING_CAPACITY: u16 = 128; + +#[derive(Debug, thiserror::Error)] +pub enum BalloonError {} + +pub type BalloonResult = std::result::Result; + +// Got from include/uapi/linux/virtio_balloon.h +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, Default, PartialEq)] +pub struct VirtioBalloonConfig { + // Number of pages host wants Guest to give up. + pub(crate) num_pages: u32, + // Number of pages we've actually got in balloon. + pub(crate) actual: u32, +} + +// Safe because it only has data and has no implicit padding. +unsafe impl ByteValued for VirtioBalloonConfig {} + +pub struct BalloonEpollHandler< + AS: GuestAddressSpace, + Q: QueueT + Send = QueueSync, + R: GuestMemoryRegion = GuestRegionMmap, +> { + pub(crate) config: VirtioDeviceConfig, + pub(crate) inflate: VirtioQueueConfig, + pub(crate) deflate: VirtioQueueConfig, + pub(crate) reporting: Option>, + balloon_config: Arc>, +} + +impl + BalloonEpollHandler +{ + fn process_reporting_queue(&mut self) -> bool { + if let Some(queue) = &mut self.reporting { + if let Err(e) = queue.consume_event() { + error!("Failed to get reporting queue event: {:?}", e); + return false; + } + let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize]; + let mut used_count = 0; + let conf = &mut self.config; + let guard = conf.lock_guest_memory(); + let mem = guard.deref().memory(); + + let mut queue_guard = queue.queue_mut().lock(); + + let mut iter = match queue_guard.iter(mem) { + Err(e) => { + error!("virtio-balloon: failed to process reporting queue. 
{}", e); + return false; + } + Ok(iter) => iter, + }; + + for mut desc_chain in &mut iter { + let mut next_desc = desc_chain.next(); + let mut len = 0; + while let Some(avail_desc) = next_desc { + if avail_desc.len() as usize % size_of::() != 0 { + error!("the request size {} is not right", avail_desc.len()); + break; + } + let size = avail_desc.len(); + let addr = avail_desc.addr(); + len += size; + + if let Some(region) = mem.find_region(addr) { + let host_addr = match mem.get_host_address(addr) { + Ok(v) => v, + Err(e) => { + error!("virtio-balloon get host address failed! addr:{:x} size: {:x} error:{:?}", addr.0, size, e); + break; + } + }; + if region.file_offset().is_some() { + // when guest memory has file backend we use fallocate free memory + let file_offset = region.file_offset().unwrap(); + let file_fd = file_offset.file().as_raw_fd(); + let file_start = file_offset.start(); + let mode = libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE; + let start_addr = + region.get_host_address(MemoryRegionAddress(0)).unwrap(); + let offset = file_start as i64 + host_addr as i64 - start_addr as i64; + if let Err(e) = Self::do_fallocate(file_fd, offset, size as i64, mode) { + info!( + "virtio-balloon reporting failed fallocate guest address: {:x} offset: {:x} size {:x} fd {:?}", + addr.0, + offset, + size, + file_fd + ); + error!("fallocate get error {}", e); + } + } else { + // when guest memory have no file backend or comes from we use madvise free memory + let advise = libc::MADV_DONTNEED; + if let Err(e) = Self::do_madvise( + host_addr as *mut libc::c_void, + size as usize, + advise, + ) { + info!( + "guest address: {:?} host address: {:?} size {:?} advise {:?}", + addr, + host_addr, + 1 << PAGE_SHIFT, + advise + ); + error!("madvise get error {}", e); + } + } + } + next_desc = desc_chain.next(); + } + used_desc_heads[used_count] = (desc_chain.head_index(), len); + used_count += 1; + } + + drop(queue_guard); + + for &(desc_index, len) in 
&used_desc_heads[..used_count] { + queue.add_used(mem, desc_index, len); + } + if used_count > 0 { + match queue.notify() { + Ok(_v) => true, + Err(e) => { + error!( + "{}: Failed to signal device change event: {}", + BALLOON_DRIVER_NAME, e + ); + false + } + } + } else { + true + } + } else { + error!( + "{}: Invalid event: Free pages reporting was not configured", + BALLOON_DRIVER_NAME + ); + false + } + } + + fn process_queue(&mut self, idx: u32) -> bool { + let conf = &mut self.config; + + let queue = match idx { + INFLATE_QUEUE_AVAIL_EVENT => &mut self.inflate, + DEFLATE_QUEUE_AVAIL_EVENT => &mut self.deflate, + _ => { + error!("{}: unsupport idx {}", BALLOON_DRIVER_NAME, idx); + return false; + } + }; + + if let Err(e) = queue.consume_event() { + error!( + "{}: Failed to get idx {} queue event: {:?}", + BALLOON_DRIVER_NAME, idx, e + ); + return false; + } + + let mut advice = match idx { + INFLATE_QUEUE_AVAIL_EVENT => libc::MADV_DONTNEED, + DEFLATE_QUEUE_AVAIL_EVENT => libc::MADV_WILLNEED, + _ => { + error!( + "{}: balloon idx: {:?} is not right", + BALLOON_DRIVER_NAME, idx + ); + return false; + } + }; + + let mut used_desc_heads = [0; QUEUE_SIZE as usize]; + let mut used_count = 0; + let guard = conf.lock_guest_memory(); + let mem = guard.deref().memory(); + + let mut queue_guard = queue.queue_mut().lock(); + + let mut iter = match queue_guard.iter(mem) { + Err(e) => { + error!("virtio-balloon: failed to process queue. 
{}", e); + return false; + } + Ok(iter) => iter, + }; + + for mut desc_chain in &mut iter { + let avail_desc = match desc_chain.next() { + Some(avail_desc) => avail_desc, + None => { + error!( + "{}: Failed to parse balloon available descriptor chain", + BALLOON_DRIVER_NAME + ); + return false; + } + }; + + if avail_desc.is_write_only() { + error!( + "{}: The head contains the request type is not right", + BALLOON_DRIVER_NAME + ); + continue; + } + let avail_desc_len = avail_desc.len(); + if avail_desc_len as usize % size_of::() != 0 { + error!( + "{}: the request size {} is not right", + BALLOON_DRIVER_NAME, avail_desc_len + ); + continue; + } + + let mut offset = 0u64; + while offset < avail_desc_len as u64 { + // Get pfn + let pfn: u32 = match mem.read_obj(GuestAddress(avail_desc.addr().0 + offset)) { + Ok(ret) => ret, + Err(e) => { + error!( + "{}: Fail to read addr {}: {:?}", + BALLOON_DRIVER_NAME, + avail_desc.addr().0 + offset, + e + ); + break; + } + }; + offset += size_of::() as u64; + + // Get pfn_len + let pfn_len = match idx { + INFLATE_QUEUE_AVAIL_EVENT | DEFLATE_QUEUE_AVAIL_EVENT => 1 << PAGE_SHIFT, + _ => { + error!( + "{}: balloon idx: {:?} is not right", + BALLOON_DRIVER_NAME, idx + ); + return false; + } + }; + + trace!( + "{}: process_queue pfn {} len {}", + BALLOON_DRIVER_NAME, + pfn, + pfn_len + ); + + let guest_addr = (pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT; + + if let Some(region) = mem.find_region(GuestAddress(guest_addr)) { + let host_addr = mem.get_host_address(GuestAddress(guest_addr)).unwrap(); + if advice == libc::MADV_DONTNEED && region.file_offset().is_some() { + advice = libc::MADV_REMOVE; + } + if let Err(e) = Self::do_madvise( + host_addr as *mut libc::c_void, + pfn_len as libc::size_t, + advice, + ) { + info!( + "{}: guest address: {:?} host address: {:?} size {:?} advise {:?}", + BALLOON_DRIVER_NAME, guest_addr, host_addr, pfn_len, advice + ); + error!("{}: madvise get error {}", BALLOON_DRIVER_NAME, e); + } + } else { + error!( 
+ "{}: guest address 0x{:x} size {:?} advise {:?} is not available", + BALLOON_DRIVER_NAME, guest_addr, pfn_len, advice + ); + } + } + + used_desc_heads[used_count] = desc_chain.head_index(); + used_count += 1; + } + + drop(queue_guard); + + for &desc_index in &used_desc_heads[..used_count] { + queue.add_used(mem, desc_index, 0); + } + if used_count > 0 { + match queue.notify() { + Ok(_v) => true, + Err(e) => { + error!( + "{}: Failed to signal device queue event: {}", + BALLOON_DRIVER_NAME, e + ); + false + } + } + } else { + true + } + } + + fn do_madvise( + addr: *mut libc::c_void, + size: libc::size_t, + advise: libc::c_int, + ) -> std::result::Result<(), io::Error> { + let res = unsafe { libc::madvise(addr, size, advise) }; + if res != 0 { + return Err(io::Error::last_os_error()); + } + Ok(()) + } + + fn do_fallocate( + file_fd: RawFd, + offset: libc::off_t, + len: libc::off_t, + mode: libc::c_int, + ) -> std::result::Result<(), io::Error> { + let res = unsafe { libc::fallocate(file_fd, mode, offset, len) }; + if res != 0 { + return Err(io::Error::last_os_error()); + } + Ok(()) + } +} + +impl MutEventSubscriber + for BalloonEpollHandler +where + AS: 'static + GuestAddressSpace + Send + Sync, +{ + fn init(&mut self, ops: &mut EventOps) { + trace!( + target: BALLOON_DRIVER_NAME, + "{}: BalloonEpollHandler::init()", + BALLOON_DRIVER_NAME, + ); + let events = Events::with_data( + self.inflate.eventfd.as_ref(), + INFLATE_QUEUE_AVAIL_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register INFLATE QUEUE event, {:?}", + BALLOON_DRIVER_NAME, e + ); + } + + let events = Events::with_data( + self.deflate.eventfd.as_ref(), + DEFLATE_QUEUE_AVAIL_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register deflate queue event, {:?}", + BALLOON_DRIVER_NAME, e + ); + } + + if let Some(reporting) = &self.reporting { + let events = Events::with_data( + reporting.eventfd.as_ref(), + 
REPORTING_QUEUE_AVAIL_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register reporting queue event, {:?}", + BALLOON_DRIVER_NAME, e + ); + } + } + } + + fn process(&mut self, events: Events, _ops: &mut EventOps) { + let guard = self.config.lock_guest_memory(); + let _mem = guard.deref(); + let idx = events.data(); + + trace!( + target: BALLOON_DRIVER_NAME, + "{}: BalloonEpollHandler::process() idx {}", + BALLOON_DRIVER_NAME, + idx + ); + match idx { + INFLATE_QUEUE_AVAIL_EVENT | DEFLATE_QUEUE_AVAIL_EVENT => { + if !self.process_queue(idx) { + error!("{}: Failed to handle {} queue", BALLOON_DRIVER_NAME, idx); + } + } + REPORTING_QUEUE_AVAIL_EVENT => { + if !self.process_reporting_queue() { + error!("Failed to handle reporting queue"); + } + } + KILL_EVENT => { + debug!("kill_evt received"); + } + _ => { + error!("{}: unknown idx {}", BALLOON_DRIVER_NAME, idx); + } + } + } +} + +fn page_number_to_mib(number: u64) -> u64 { + number << PAGE_SHIFT >> 10 >> 10 +} + +fn mib_to_page_number(mib: u64) -> u64 { + mib << 10 << 10 >> PAGE_SHIFT +} + +/// Virtio device for exposing entropy to the guest OS through virtio. +pub struct Balloon { + pub(crate) device_info: VirtioDeviceInfo, + pub(crate) config: Arc>, + pub(crate) paused: Arc, + pub(crate) device_change_notifier: Arc, + pub(crate) subscriber_id: Option, + pub(crate) phantom: PhantomData, +} + +#[derive(Copy, Clone, Debug, Default, PartialEq)] +pub struct BalloonConfig { + pub f_deflate_on_oom: bool, + pub f_reporting: bool, +} + +impl Balloon { + // Create a new virtio-balloon. 
+ pub fn new(epoll_mgr: EpollManager, cfg: BalloonConfig) -> Result { + let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; + + let mut queue_sizes = QUEUE_SIZES.to_vec(); + + if cfg.f_deflate_on_oom { + avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; + } + if cfg.f_reporting { + avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; + queue_sizes.push(PAGE_REPORTING_CAPACITY); + } + + let config = VirtioBalloonConfig::default(); + + Ok(Balloon { + device_info: VirtioDeviceInfo::new( + BALLOON_DRIVER_NAME.to_string(), + avail_features, + Arc::new(queue_sizes), + config.as_slice().to_vec(), + epoll_mgr, + ), + config: Arc::new(Mutex::new(config)), + paused: Arc::new(AtomicBool::new(false)), + device_change_notifier: Arc::new(NoopNotifier::new()), + subscriber_id: None, + phantom: PhantomData, + }) + } + + pub fn set_size(&self, size_mb: u64) -> Result<()> { + let num_pages = mib_to_page_number(size_mb); + + let balloon_config = &mut self.config.lock().unwrap(); + balloon_config.num_pages = num_pages as u32; + if let Err(e) = self.device_change_notifier.notify() { + error!( + "{}: failed to signal device change event: {}", + BALLOON_DRIVER_NAME, e + ); + return Err(Error::IOError(e)); + } + + Ok(()) + } +} + +impl VirtioDevice for Balloon +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Sync + Send + 'static, +{ + fn device_type(&self) -> u32 { + TYPE_BALLOON + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.device_info.queue_sizes + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + trace!( + target: BALLOON_DRIVER_NAME, + "{}: VirtioDevice::set_acked_features({}, 0x{:x})", + BALLOON_DRIVER_NAME, + page, + value + ); + self.device_info.set_acked_features(page, value) + } + + fn read_config(&mut self, offset: u64, mut data: &mut [u8]) -> ConfigResult { + trace!( + target: BALLOON_DRIVER_NAME, + 
"{}: VirtioDevice::read_config(0x{:x}, {:?})", + BALLOON_DRIVER_NAME, + offset, + data + ); + let config = &self.config.lock().unwrap(); + let config_space = config.as_slice().to_vec(); + let config_len = config_space.len() as u64; + if offset >= config_len { + error!( + "{}: config space read request out of range, offset {}", + BALLOON_DRIVER_NAME, offset + ); + return Err(ConfigError::InvalidOffset(offset)); + } + if let Some(end) = offset.checked_add(data.len() as u64) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&config_space[offset as usize..cmp::min(end, config_len) as usize]) + .unwrap(); + } + Ok(()) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + let config = &mut self.config.lock().unwrap(); + let config_slice = config.as_mut_slice(); + let Ok(start) = usize::try_from(offset) else { + error!("Failed to write config space"); + return Err(ConfigError::InvalidOffset(offset)); + }; + let Some(dst) = start.checked_add(data.len()) + .and_then(|end| config_slice.get_mut(start..end)) else + { + error!("Failed to write config space"); + return Err(ConfigError::InvalidOffsetPlusDataLen(offset + data.len() as u64)); + }; + dst.copy_from_slice(data); + Ok(()) + } + + fn activate(&mut self, mut config: VirtioDeviceConfig) -> ActivateResult { + self.device_info.check_queue_sizes(&config.queues)?; + self.device_change_notifier = config.device_change_notifier.clone(); + + trace!( + "{}: activate acked_features 0x{:x}", + BALLOON_DRIVER_NAME, + self.device_info.acked_features + ); + + let inflate = config.queues.remove(0); + let deflate = config.queues.remove(0); + let mut reporting = None; + if (self.device_info.acked_features & (1u64 << VIRTIO_BALLOON_F_REPORTING)) != 0 { + reporting = Some(config.queues.remove(0)); + } + + let handler = Box::new(BalloonEpollHandler { + config, + inflate, + deflate, + reporting, + balloon_config: self.config.clone(), + }); + + self.subscriber_id = 
Some(self.device_info.register_event_handler(handler)); + + Ok(()) + } + + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, + ) { + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + if use_generic_irq { + requests.push(ResourceConstraint::GenericIrq { + size: (self.device_info.queue_sizes.len() + 1) as u32, + }); + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +#[cfg(test)] +pub(crate) mod tests { + use dbs_device::resources::DeviceResources; + use dbs_utils::epoll_manager::SubscriberOps; + use kvm_ioctls::Kvm; + use vm_memory::GuestMemoryMmap; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + use crate::tests::VirtQueue; + + fn create_balloon_epoll_handler() -> BalloonEpollHandler> { + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0x0), 0x10000)]).unwrap()); + let queues = vec![VirtioQueueConfig::create(128, 0).unwrap()]; + let resources = DeviceResources::new(); + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + + let config = VirtioDeviceConfig::new( + mem, + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + let inflate = VirtioQueueConfig::create(128, 0).unwrap(); + let deflate = VirtioQueueConfig::create(128, 0).unwrap(); + let reporting = Some(VirtioQueueConfig::create(128, 0).unwrap()); + let balloon_config = Arc::new(Mutex::new(VirtioBalloonConfig::default())); + + BalloonEpollHandler { + config, + inflate, + deflate, + reporting, + balloon_config, + } + } + + #[test] + fn test_balloon_page_number_to_mib() { + assert_eq!(page_number_to_mib(1024), 4); + assert_eq!(page_number_to_mib(1023), 3); + assert_eq!(page_number_to_mib(0), 0); + } + + #[test] + fn test_balloon_mib_to_page_number() { + assert_eq!(mib_to_page_number(4), 1024); + assert_eq!(mib_to_page_number(2), 512); + assert_eq!(mib_to_page_number(0), 0); + } + + #[test] + fn 
test_balloon_virtio_device_normal() { + let epoll_mgr = EpollManager::default(); + let config = BalloonConfig { + f_deflate_on_oom: true, + f_reporting: true, + }; + + let mut dev = Balloon::>::new(epoll_mgr, config).unwrap(); + + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::device_type(&dev), + TYPE_BALLOON + ); + + let queue_size = vec![128, 128, 128]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::queue_max_sizes( + &dev + ), + &queue_size[..] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 0), + dev.device_info.get_avail_features(0) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 1), + dev.device_info.get_avail_features(1) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + dev.device_info.get_avail_features(2) + ); + VirtioDevice::>, QueueSync, GuestRegionMmap>::set_acked_features( + &mut dev, 2, 0, + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + 0, + ); + let config: [u8; 8] = [0; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::write_config( + &mut dev, 0, &config, + ) + .unwrap(); + let mut data: [u8; 8] = [1; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut dev, 0, &mut data, + ) + .unwrap(); + assert_eq!(config, data); + } + + #[test] + fn test_balloon_virtio_device_active() { + let epoll_mgr = EpollManager::default(); + + // check queue sizes error + { + let config = BalloonConfig { + f_deflate_on_oom: true, + f_reporting: true, + }; + + let mut dev = Balloon::>::new(epoll_mgr.clone(), config).unwrap(); + let queues = vec![ + VirtioQueueConfig::::create(16, 0).unwrap(), + VirtioQueueConfig::::create(16, 0).unwrap(), + ]; + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = 
DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + assert!(dev.activate(config).is_err()); + } + // Success + { + let config = BalloonConfig { + f_deflate_on_oom: true, + f_reporting: true, + }; + + let mut dev = Balloon::>::new(epoll_mgr, config).unwrap(); + + let queues = vec![ + VirtioQueueConfig::::create(128, 0).unwrap(), + VirtioQueueConfig::::create(128, 0).unwrap(), + VirtioQueueConfig::::create(128, 0).unwrap(), + ]; + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + assert!(dev.activate(config).is_ok()); + } + } + + #[test] + fn test_balloon_set_size() { + let epoll_mgr = EpollManager::default(); + let config = BalloonConfig { + f_deflate_on_oom: true, + f_reporting: true, + }; + + let dev = Balloon::>::new(epoll_mgr, config).unwrap(); + let size = 1024; + assert!(dev.set_size(size).is_ok()); + } + + #[test] + fn test_balloon_epoll_handler_handle_event() { + let handler = create_balloon_epoll_handler(); + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_balloon_epoll_handler(); + + // test for INFLATE_QUEUE_AVAIL_EVENT + let events = Events::with_data(&event_fd, INFLATE_QUEUE_AVAIL_EVENT, event_set); + handler.process(events, &mut event_op); + + // test for DEFLATE_QUEUE_AVAIL_EVENT + let events = Events::with_data(&event_fd, DEFLATE_QUEUE_AVAIL_EVENT, event_set); + handler.process(events, &mut 
event_op); + + // test for REPORTING_QUEUE_AVAIL_EVENT + let events = Events::with_data(&event_fd, REPORTING_QUEUE_AVAIL_EVENT, event_set); + handler.process(events, &mut event_op); + + // test for KILL_EVENT + let events = Events::with_data(&event_fd, KILL_EVENT, event_set); + handler.process(events, &mut event_op); + + // test for unknown event + let events = Events::with_data(&event_fd, BALLOON_EVENTS_COUNT + 10, event_set); + handler.process(events, &mut event_op); + } + + #[test] + fn test_balloon_epoll_handler_process_report_queue() { + let mut handler = create_balloon_epoll_handler(); + let m = &handler.config.vm_as.clone(); + + // Failed to get reporting queue event + assert!(!handler.process_reporting_queue()); + + // No reporting queue + handler.reporting = None; + assert!(!handler.process_reporting_queue()); + + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let q = vq.create_queue(); + vq.avail.idx().store(1); + vq.avail.ring(0).store(0); + vq.dtable(0).set(0x2000, 0x1000, 0, 0); + let queue_config = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(queue_config.generate_event().is_ok()); + handler.reporting = Some(queue_config); + //Success + assert!(handler.process_reporting_queue()); + } + + #[test] + fn test_balloon_epoll_handler_process_queue() { + let mut handler = create_balloon_epoll_handler(); + let m = &handler.config.vm_as.clone(); + // invalid idx + { + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let q = vq.create_queue(); + vq.avail.idx().store(1); + vq.avail.ring(0).store(0); + vq.dtable(0).set(0x2000, 0x1000, 0, 0); + let queue_config = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(queue_config.generate_event().is_ok()); + handler.inflate = queue_config; + assert!(!handler.process_queue(10)); + } + // INFLATE_QUEUE_AVAIL_EVENT + { + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let 
q = vq.create_queue(); + vq.avail.idx().store(1); + vq.avail.ring(0).store(0); + vq.dtable(0).set(0x2000, 0x1000, 0, 0); + vq.dtable(0).set(0x2000, 0x1000, 0, 0); + let queue_config = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(queue_config.generate_event().is_ok()); + handler.inflate = queue_config; + assert!(handler.process_queue(INFLATE_QUEUE_AVAIL_EVENT)); + } + // DEFLATE_QUEUE_AVAIL_EVENT + { + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let q = vq.create_queue(); + vq.avail.idx().store(1); + vq.avail.ring(0).store(0); + vq.dtable(0).set(0x2000, 0x1000, 0, 0); + let queue_config = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(queue_config.generate_event().is_ok()); + handler.deflate = queue_config; + assert!(handler.process_queue(DEFLATE_QUEUE_AVAIL_EVENT)); + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/device.rs b/src/dragonball/src/dbs_virtio_devices/src/block/device.rs new file mode 100644 index 000000000..8caeef3b9 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/device.rs @@ -0,0 +1,1362 @@ +// Copyright 2019-2020 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::any::Any; +use std::collections::HashMap; +use std::io::{Seek, SeekFrom}; +use std::marker::PhantomData; +use std::sync::{mpsc, Arc}; +use std::thread; + +use dbs_device::resources::ResourceConstraint; +use dbs_utils::{ + epoll_manager::{EpollManager, SubscriberId}, + rate_limiter::{BucketUpdate, RateLimiter}, +}; +use log::{debug, error, info, warn}; +use virtio_bindings::bindings::virtio_blk::*; +use virtio_queue::QueueT; +use vm_memory::GuestMemoryRegion; +use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK}; + +use crate::{ + ActivateError, ActivateResult, ConfigResult, DbsGuestAddressSpace, Error, Result, VirtioDevice, + VirtioDeviceConfig, VirtioDeviceInfo, TYPE_BLOCK, +}; + +use super::{ + BlockEpollHandler, InnerBlockEpollHandler, KillEvent, Ufile, BLK_DRIVER_NAME, SECTOR_SHIFT, + SECTOR_SIZE, +}; + +/// Supported fields in the configuration space: +/// - 64-bit disk size +/// - 32-bit size max +/// - 32-bit seg max +/// - 16-bit num_queues at offset 34 +const CONFIG_SPACE_SIZE: usize = 64; + +/// Max segments in a data request. +const CONFIG_MAX_SEG: u32 = 16; + +fn build_device_id(disk_image: &dyn Ufile) -> Vec { + let mut default_disk_image_id = vec![0; VIRTIO_BLK_ID_BYTES as usize]; + match disk_image.get_device_id() { + Err(_) => warn!("Could not generate device id. We'll use a default."), + Ok(m) => { + // The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES. + // This will also zero out any leftover bytes. + let disk_id = m.as_bytes(); + let bytes_to_copy = std::cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize); + default_disk_image_id[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy]) + } + } + default_disk_image_id +} + +/// Virtio device for exposing block level read/write operations on a host file. 
+pub struct Block { + pub(crate) device_info: VirtioDeviceInfo, + disk_images: Vec>, + rate_limiters: Vec, + queue_sizes: Arc>, + subscriber_id: Option, + kill_evts: Vec, + evt_senders: Vec>, + epoll_threads: Vec>, + phantom: PhantomData, +} + +impl Block { + /// Create a new virtio block device that operates on the given file. + /// + /// The given file must be seekable and sizable. + pub fn new( + mut disk_images: Vec>, + is_disk_read_only: bool, + queue_sizes: Arc>, + epoll_mgr: EpollManager, + rate_limiters: Vec, + ) -> Result { + let num_queues = disk_images.len(); + + if num_queues == 0 { + return Err(Error::InvalidInput); + } + + let disk_image = &mut disk_images[0]; + + let disk_size = disk_image.seek(SeekFrom::End(0)).map_err(Error::IOError)?; + if disk_size % SECTOR_SIZE != 0 { + warn!( + "Disk size {} is not a multiple of sector size {}; \ + the remainder will not be visible to the guest.", + disk_size, SECTOR_SIZE + ); + } + let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; + avail_features |= 1u64 << VIRTIO_BLK_F_SIZE_MAX; + avail_features |= 1u64 << VIRTIO_BLK_F_SEG_MAX; + + if is_disk_read_only { + avail_features |= 1u64 << VIRTIO_BLK_F_RO; + }; + + if num_queues > 1 { + avail_features |= 1u64 << VIRTIO_BLK_F_MQ; + } + + let config_space = + Self::build_config_space(disk_size, disk_image.get_max_size(), num_queues as u16); + + Ok(Block { + device_info: VirtioDeviceInfo::new( + BLK_DRIVER_NAME.to_string(), + avail_features, + queue_sizes.clone(), + config_space, + epoll_mgr, + ), + disk_images, + rate_limiters, + queue_sizes, + subscriber_id: None, + phantom: PhantomData, + evt_senders: Vec::with_capacity(num_queues), + kill_evts: Vec::with_capacity(num_queues), + epoll_threads: Vec::with_capacity(num_queues), + }) + } + + fn build_config_space(disk_size: u64, max_size: u32, num_queues: u16) -> Vec { + // The disk size field of the configuration space, which uses the first two words. 
+ // If the image is not a multiple of the sector size, the tail bits are not exposed. + // The config space is little endian. + let mut config = Vec::with_capacity(CONFIG_SPACE_SIZE); + let num_sectors = disk_size >> SECTOR_SHIFT; + for i in 0..8 { + config.push((num_sectors >> (8 * i)) as u8); + } + + // The max_size field of the configuration space. + for i in 0..4 { + config.push((max_size >> (8 * i)) as u8); + } + + // The max_seg field of the configuration space. + let max_segs = CONFIG_MAX_SEG; + for i in 0..4 { + config.push((max_segs >> (8 * i)) as u8); + } + + for _i in 0..18 { + config.push(0_u8); + } + + for i in 0..2 { + config.push((num_queues >> (8 * i)) as u8); + } + + config + } + + pub fn set_patch_rate_limiters(&self, bytes: BucketUpdate, ops: BucketUpdate) -> Result<()> { + if self.evt_senders.is_empty() + || self.kill_evts.is_empty() + || self.evt_senders.len() != self.kill_evts.len() + { + error!("virtio-blk: failed to establish channel to send rate-limiter patch data"); + return Err(Error::InternalError); + } + + for sender in self.evt_senders.iter() { + if sender + .send(KillEvent::BucketUpdate(bytes.clone(), ops.clone())) + .is_err() + { + error!("virtio-blk: failed to send rate-limiter patch data"); + return Err(Error::InternalError); + } + } + + for kill_evt in self.kill_evts.iter() { + if let Err(e) = kill_evt.write(1) { + error!( + "virtio-blk: failed to write rate-limiter patch event {:?}", + e + ); + return Err(Error::InternalError); + } + } + + Ok(()) + } +} + +impl VirtioDevice for Block +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Sync + Send + 'static, +{ + fn device_type(&self) -> u32 { + TYPE_BLOCK + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.queue_sizes + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + self.device_info.set_acked_features(page, value) + } + + 
fn read_config(&mut self, offset: u64, data: &mut [u8]) -> ConfigResult { + self.device_info.read_config(offset, data) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + self.device_info.write_config(offset, data) + } + + fn activate(&mut self, mut config: VirtioDeviceConfig) -> ActivateResult { + self.device_info.check_queue_sizes(&config.queues[..])?; + + if self.disk_images.len() != config.queues.len() { + error!( + "The disk images number: {} is not equal to queues number: {}", + self.disk_images.len(), + config.queues.len() + ); + return Err(ActivateError::InternalError); + } + let mut kill_evts = Vec::with_capacity(self.queue_sizes.len()); + + let mut i = 0; + // first to reverse the queue's order, thus to make sure the following + // pop queue got the right queue order. + config.queues.reverse(); + while let Some(queue) = config.queues.pop() { + let disk_image = self.disk_images.pop().unwrap(); + let disk_image_id = build_device_id(disk_image.as_ref()); + + let data_desc_vec = + vec![Vec::with_capacity(CONFIG_MAX_SEG as usize); self.queue_sizes[0] as usize]; + let iovecs_vec = + vec![Vec::with_capacity(CONFIG_MAX_SEG as usize); self.queue_sizes[0] as usize]; + + let rate_limiter = self.rate_limiters.pop().unwrap_or_default(); + + let (evt_sender, evt_receiver) = mpsc::channel(); + self.evt_senders.push(evt_sender); + + let kill_evt = EventFd::new(EFD_NONBLOCK)?; + + let mut handler = Box::new(InnerBlockEpollHandler { + rate_limiter, + disk_image, + disk_image_id, + pending_req_map: HashMap::new(), + data_desc_vec, + iovecs_vec, + evt_receiver, + vm_as: config.vm_as.clone(), + queue, + kill_evt: kill_evt.try_clone().unwrap(), + }); + + kill_evts.push(kill_evt.try_clone().unwrap()); + self.kill_evts.push(kill_evt); + + thread::Builder::new() + .name(format!("{}_q{}", "blk_iothread", i)) + .spawn(move || { + if let Err(e) = handler.run() { + error!("Error running worker: {:?}", e); + } + }) + .map(|thread| 
self.epoll_threads.push(thread)) + .map_err(|e| { + error!("failed to clone the virtio-block epoll thread: {}", e); + ActivateError::InternalError + })?; + + i += 1; + } + let block_handler = Box::new(BlockEpollHandler { + kill_evts, + evt_senders: self.evt_senders.clone(), + config, + }); + + // subscribe this handler for io drain. + self.subscriber_id = Some(self.device_info.register_event_handler(block_handler)); + + Ok(()) + } + + fn reset(&mut self) -> ActivateResult { + Ok(()) + } + + fn remove(&mut self) { + // if the subsriber_id is invalid, it has not been activated yet. + if let Some(subscriber_id) = self.subscriber_id { + // Remove BlockEpollHandler from event manager, so it could be dropped and the resources + // could be freed, e.g. close disk_image, so vmm won't hold the backend file. + match self.device_info.remove_event_handler(subscriber_id) { + Ok(_) => debug!("virtio-blk: removed subscriber_id {:?}", subscriber_id), + Err(e) => { + warn!("virtio-blk: failed to remove event handler: {:?}", e); + } + } + } + + for sender in self.evt_senders.iter() { + if sender.send(KillEvent::Kill).is_err() { + error!("virtio-blk: failed to send kill event to epoller thread"); + } + } + + // notify the io threads handlers to terminate. 
+ for kill_evt in self.kill_evts.iter() { + if let Err(e) = kill_evt.write(1) { + error!("virtio-blk: failed to write kill event {:?}", e); + } + } + + while let Some(thread) = self.epoll_threads.pop() { + if let Err(e) = thread.join() { + error!("virtio-blk: failed to reap the io threads: {:?}", e); + } else { + info!("io thread got reaped."); + } + } + + self.subscriber_id = None; + } + + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, + ) { + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + if use_generic_irq { + requests.push(ResourceConstraint::GenericIrq { + size: (self.queue_sizes.len() + 1) as u32, + }); + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use std::io::{self, Read, Seek, SeekFrom, Write}; + use std::os::unix::io::RawFd; + + use dbs_device::resources::DeviceResources; + use dbs_interrupt::NoopNotifier; + use dbs_utils::rate_limiter::{TokenBucket, TokenType}; + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap, GuestRegionMmap}; + use vmm_sys_util::eventfd::EventFd; + + use crate::epoll_helper::*; + use crate::tests::{VirtQueue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; + use crate::{Error as VirtIoError, VirtioQueueConfig}; + + use super::*; + use crate::block::*; + + pub(super) struct DummyFile { + pub(super) device_id: Option, + pub(super) capacity: u64, + pub(super) have_complete_io: bool, + pub(super) max_size: u32, + pub(super) flush_error: bool, + } + + impl DummyFile { + pub(super) fn new() -> Self { + DummyFile { + device_id: None, + capacity: 0, + have_complete_io: false, + max_size: 0x100000, + flush_error: false, + } + } + } + + impl Read for DummyFile { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + Ok(buf.len()) + } + } + + impl Write for DummyFile { + fn write(&mut self, buf: &[u8]) -> io::Result { + Ok(buf.len()) + } + + fn 
flush(&mut self) -> io::Result<()> { + if self.flush_error { + Err(io::Error::new(io::ErrorKind::Other, "test flush error")) + } else { + Ok(()) + } + } + } + impl Seek for DummyFile { + fn seek(&mut self, _pos: SeekFrom) -> io::Result { + Ok(0) + } + } + + impl Ufile for DummyFile { + fn get_capacity(&self) -> u64 { + self.capacity + } + + fn get_max_size(&self) -> u32 { + self.max_size + } + + fn get_device_id(&self) -> io::Result { + match &self.device_id { + Some(id) => Ok(id.to_string()), + None => Err(io::Error::new(io::ErrorKind::Other, "dummy_error")), + } + } + + // std err + fn get_data_evt_fd(&self) -> RawFd { + 2 + } + + fn io_read_submit( + &mut self, + _offset: i64, + _iovecs: &mut Vec, + _aio_data: u16, + ) -> io::Result { + Ok(0) + } + + fn io_write_submit( + &mut self, + _offset: i64, + _iovecs: &mut Vec, + _aio_data: u16, + ) -> io::Result { + Ok(0) + } + + fn io_complete(&mut self) -> io::Result> { + let mut v = Vec::new(); + if self.have_complete_io { + v.push((0, 1)); + } + Ok(v) + } + } + + #[test] + fn test_block_build_device_id() { + let device_id = "dummy_device_id"; + let mut file = DummyFile::new(); + file.device_id = Some(device_id.to_string()); + let disk_image: Box = Box::new(file); + let disk_id = build_device_id(disk_image.as_ref()); + assert_eq!(disk_id.len() as u32, VIRTIO_BLK_ID_BYTES); + let disk_image: Box = Box::new(DummyFile::new()); + let disk_id2 = build_device_id(disk_image.as_ref()); + assert_eq!(disk_id2.len() as u32, VIRTIO_BLK_ID_BYTES); + assert_ne!(disk_id, disk_id2); + } + + #[test] + fn test_block_request_parse() { + let m = &GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let mut data_descs = Vec::with_capacity(CONFIG_MAX_SEG as usize); + + assert!(vq.end().0 < 0x1000); + + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // write only request type descriptor + 
vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_OUT, GuestAddress(0x1000)) + .unwrap(); + m.write_obj::(114, GuestAddress(0x1000 + 8)).unwrap(); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::UnexpectedWriteOnlyDescriptor) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // chain too short; no status_desc + vq.dtable(0).flags().store(0); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::DescriptorChainTooShort) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // chain too short; no data desc + vq.dtable(0).flags().store(VIRTQ_DESC_F_NEXT); + vq.dtable(1).set(0x2000, 0x1000, 0, 2); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::DescriptorChainTooShort) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // write only data for OUT + vq.dtable(1) + .flags() + .store(VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE); + vq.dtable(2).set(0x3000, 0, 0, 0); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::UnexpectedWriteOnlyDescriptor) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // read only data for OUT + m.write_obj::(VIRTIO_BLK_T_OUT, GuestAddress(0x1000)) + .unwrap(); + vq.dtable(1) + .flags() + .store(VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::UnexpectedWriteOnlyDescriptor) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // length too big data for OUT + m.write_obj::(VIRTIO_BLK_T_OUT, GuestAddress(0x1000)) + .unwrap(); + vq.dtable(1).flags().store(VIRTQ_DESC_F_NEXT); + vq.dtable(1).len().store(64); + assert!(matches!( + Request::parse(&mut 
q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::DescriptorLengthTooBig) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // read only data for IN + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + vq.dtable(1).flags().store(VIRTQ_DESC_F_NEXT); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::UnexpectedReadOnlyDescriptor) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // length too big data for IN + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + vq.dtable(1) + .flags() + .store(VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE); + vq.dtable(1).len().store(64); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::DescriptorLengthTooBig) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // data desc write only and request type is getDeviceId + m.write_obj::(VIRTIO_BLK_T_GET_ID, GuestAddress(0x1000)) + .unwrap(); + vq.dtable(1) + .flags() + .store(VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::UnexpectedReadOnlyDescriptor) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // status desc read only + vq.dtable(2).flags().store(0); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::UnexpectedReadOnlyDescriptor) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // status desc too small + vq.dtable(2).flags().store(VIRTQ_DESC_F_WRITE); + vq.dtable(2).len().store(0); + assert!(matches!( + Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32), + Err(Error::DescriptorLengthTooSmall) + )); + } + + { + let mut q = vq.create_queue(); + data_descs.clear(); + // should be OK now + 
vq.dtable(2).len().store(0x1000); + let r = Request::parse(&mut q.pop_descriptor_chain(m).unwrap(), &mut data_descs, 32) + .unwrap(); + + assert_eq!(r.request_type, RequestType::GetDeviceID); + assert_eq!(r.sector, 114); + assert_eq!(data_descs[0].data_addr, 0x2000); + assert_eq!(data_descs[0].data_len, 0x40); + assert_eq!(r.status_addr, GuestAddress(0x3000)); + } + } + + #[test] + fn test_block_request_execute() { + let m = &GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let mut data_descs = Vec::with_capacity(CONFIG_MAX_SEG as usize); + assert!(vq.end().0 < 0x1000); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + + let mut file = DummyFile::new(); + file.capacity = 4096; + let mut disk: Box = Box::new(file); + let disk_id = build_device_id(disk.as_ref()); + + { + // RequestType::In + let mut q = vq.create_queue(); + data_descs.clear(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + assert!(req.execute(&mut disk, m, &data_descs, &disk_id).is_ok()); + } + + { + // RequestType::Out + let mut q = vq.create_queue(); + data_descs.clear(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1).set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_OUT, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + assert!(req.execute(&mut disk, m, &data_descs, &disk_id).is_ok()); + } + + { + // RequestType::Flush + let mut q = vq.create_queue(); + data_descs.clear(); + 
vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1).set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_FLUSH, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + assert!(req.execute(&mut disk, m, &data_descs, &disk_id).is_ok()); + } + + { + // RequestType::GetDeviceID + let mut q = vq.create_queue(); + data_descs.clear(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_GET_ID, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + assert!(req.execute(&mut disk, m, &data_descs, &disk_id).is_ok()); + } + + { + // RequestType::unsupport + let mut q = vq.create_queue(); + data_descs.clear(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_GET_ID + 10, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + match req.execute(&mut disk, m, &data_descs, &disk_id) { + Err(ExecuteError::Unsupported(n)) => assert_eq!(n, VIRTIO_BLK_T_GET_ID + 10), + _ => panic!(), + } + } + } + + #[test] + fn test_block_request_update_status() { + let m = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap()); + let vq = VirtQueue::new(GuestAddress(0), &m, 16); + let mut data_descs = Vec::with_capacity(CONFIG_MAX_SEG as usize); + assert!(vq.end().0 < 0x1000); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let mut q = 
vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m.as_ref()).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + req.update_status(m.as_ref(), 0); + } + + #[test] + fn test_block_request_check_capacity() { + let m = &GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + let mut data_descs = Vec::with_capacity(CONFIG_MAX_SEG as usize); + assert!(vq.end().0 < 0x1000); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + + let mut disk: Box = Box::new(DummyFile::new()); + let disk_id = build_device_id(disk.as_ref()); + let mut q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + assert!(matches!( + req.execute(&mut disk, m, &data_descs, &disk_id), + Err(ExecuteError::BadRequest(VirtIoError::InvalidOffset)) + )); + + let mut file = DummyFile::new(); + file.capacity = 4096; + let mut disk: Box = Box::new(file); + let mut q = vq.create_queue(); + data_descs.clear(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + 
assert!(req.check_capacity(&mut disk, &data_descs).is_ok()); + } + + #[test] + fn test_block_virtio_device_normal() { + let device_id = "dummy_device_id"; + let epoll_mgr = EpollManager::default(); + + let mut file = DummyFile::new(); + println!("max size {}", file.max_size); + file.device_id = Some(device_id.to_string()); + let disk_image: Box = Box::new(file); + let mut dev = Block::>::new( + vec![disk_image], + true, + Arc::new(vec![128]), + epoll_mgr, + vec![], + ) + .unwrap(); + + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::device_type(&dev), + TYPE_BLOCK + ); + let queue_size = vec![128]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::queue_max_sizes( + &dev + ), + &queue_size[..] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 0), + dev.device_info.get_avail_features(0) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 1), + dev.device_info.get_avail_features(1) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + dev.device_info.get_avail_features(2) + ); + let mut config: [u8; 1] = [0]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut dev, + 0, + &mut config, + ) + .unwrap(); + let config: [u8; 16] = [0; 16]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::write_config( + &mut dev, 0, &config, + ) + .unwrap(); + } + + #[test] + fn test_block_virtio_device_active() { + let device_id = "dummy_device_id"; + let epoll_mgr = EpollManager::default(); + + { + // check_queue_sizes error + let mut file = DummyFile::new(); + file.device_id = Some(device_id.to_string()); + let disk_image: Box = Box::new(file); + let mut dev = Block::>>::new( + vec![disk_image], + true, + Arc::new(vec![128]), + epoll_mgr.clone(), + vec![], + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = Vec::new(); + + let kvm = Kvm::new().unwrap(); + let 
vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + assert!(matches!( + dev.activate(config), + Err(ActivateError::InvalidParam) + )); + } + + { + // test no disk_image + let mut file = DummyFile::new(); + file.device_id = Some(device_id.to_string()); + let disk_image: Box = Box::new(file); + let mut dev = Block::new( + vec![disk_image], + true, + Arc::new(vec![128]), + epoll_mgr.clone(), + vec![], + ) + .unwrap(); + dev.disk_images = vec![]; + + let mem = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![VirtioQueueConfig::::create(256, 0).unwrap()]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + assert!(matches!( + dev.activate(config), + Err(ActivateError::InternalError) + )); + } + + { + // Ok + let mut file = DummyFile::new(); + file.device_id = Some(device_id.to_string()); + let disk_image: Box = Box::new(file); + let mut dev = Block::new( + vec![disk_image], + true, + Arc::new(vec![128]), + epoll_mgr, + vec![], + ) + .unwrap(); + + let mem = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![VirtioQueueConfig::::create(256, 0).unwrap()]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + dev.activate(config).unwrap(); + } + } + + #[test] + fn test_block_set_patch_rate_limiters() { + let device_id = "dummy_device_id"; + let epoll_mgr = EpollManager::default(); + 
let mut file = DummyFile::new(); + file.device_id = Some(device_id.to_string()); + let disk_image: Box = Box::new(file); + let mut dev = Block::>::new( + vec![disk_image], + true, + Arc::new(vec![128]), + epoll_mgr, + vec![], + ) + .unwrap(); + + let (sender, _receiver) = mpsc::channel(); + dev.evt_senders = vec![sender]; + let event = EventFd::new(0).unwrap(); + dev.kill_evts = vec![event]; + + assert!(dev + .set_patch_rate_limiters(BucketUpdate::None, BucketUpdate::None) + .is_ok()); + } + + fn get_block_epoll_handler_with_file( + file: DummyFile, + ) -> InnerBlockEpollHandler, QueueSync> { + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0x0), 0x10000)]).unwrap()); + let queue = VirtioQueueConfig::create(256, 0).unwrap(); + let rate_limiter = RateLimiter::default(); + let disk_image: Box = Box::new(file); + let disk_image_id = build_device_id(disk_image.as_ref()); + + let data_desc_vec = vec![Vec::with_capacity(CONFIG_MAX_SEG as usize); 256]; + let iovecs_vec = vec![Vec::with_capacity(CONFIG_MAX_SEG as usize); 256]; + + let (_, evt_receiver) = mpsc::channel(); + + InnerBlockEpollHandler { + disk_image, + disk_image_id, + rate_limiter, + pending_req_map: HashMap::new(), + data_desc_vec, + iovecs_vec, + + kill_evt: EventFd::new(0).unwrap(), + evt_receiver, + + vm_as: mem, + queue, + } + } + + fn get_block_epoll_handler() -> InnerBlockEpollHandler, QueueSync> { + let mut file = DummyFile::new(); + file.capacity = 0x100000; + get_block_epoll_handler_with_file(file) + } + + #[test] + fn test_block_get_patch_rate_limiters() { + let mut handler = get_block_epoll_handler(); + let tokenbucket = TokenBucket::new(1, 1, 4); + + handler.get_patch_rate_limiters( + BucketUpdate::None, + BucketUpdate::Update(tokenbucket.clone()), + ); + assert_eq!(handler.rate_limiter.ops().unwrap(), &tokenbucket); + } + + #[test] + fn test_block_epoll_handler_handle_event() { + let mut handler = get_block_epoll_handler(); + let mut helper = EpollHelper::new().unwrap(); + + // 
test for QUEUE_AVAIL_EVENT + let events = epoll::Event::new(epoll::Events::EPOLLIN, QUEUE_AVAIL_EVENT as u64); + handler.handle_event(&mut helper, &events); + handler.queue.generate_event().unwrap(); + handler.handle_event(&mut helper, &events); + + // test for RATE_LIMITER_EVENT + let events = epoll::Event::new(epoll::Events::EPOLLIN, RATE_LIMITER_EVENT as u64); + handler.handle_event(&mut helper, &events); + + // test for END_IO_EVENT + let events = epoll::Event::new(epoll::Events::EPOLLIN, END_IO_EVENT as u64); + handler.handle_event(&mut helper, &events); + } + + #[test] + #[should_panic] + fn test_block_epoll_handler_handle_unknown_event() { + let mut handler = get_block_epoll_handler(); + let mut helper = EpollHelper::new().unwrap(); + + // test for unknown event + let events = epoll::Event::new(epoll::Events::EPOLLIN, KILL_EVENT as u64 + 10); + handler.handle_event(&mut helper, &events); + } + + #[test] + fn test_block_epoll_handler_process_queue() { + { + let mut file = DummyFile::new(); + file.capacity = 0x100000; + // set disk max_size to 0 will cause Request parse error + file.max_size = 0; + let mut handler = get_block_epoll_handler_with_file(file); + + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(handler.process_queue()); + } + + { + // will cause check_capacity error + let file = DummyFile::new(); + let mut handler = get_block_epoll_handler_with_file(file); + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 
16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(handler.process_queue()); + let err_info: u32 = handler.vm_as.read_obj(GuestAddress(0x3000)).unwrap(); + assert_eq!(err_info, VIRTIO_BLK_S_IOERR); + } + + { + // test io submit + let mut file = DummyFile::new(); + file.capacity = 0x100000; + let mut handler = get_block_epoll_handler_with_file(file); + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(!handler.process_queue()); + assert_eq!(handler.pending_req_map.len(), 1); + } + + { + // test for other execute type (not IN/OUT) + let mut file = DummyFile::new(); + file.capacity = 0x100000; + let mut handler = get_block_epoll_handler_with_file(file); + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + 
m.write_obj::(VIRTIO_BLK_T_FLUSH, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(handler.process_queue()); + let err_info: u32 = handler.vm_as.read_obj(GuestAddress(0x3000)).unwrap(); + assert_eq!(err_info, VIRTIO_BLK_S_OK); + } + + { + // test for other execute type (not IN/OUT) : error + let mut file = DummyFile::new(); + file.capacity = 0x100000; + file.flush_error = true; + let mut handler = get_block_epoll_handler_with_file(file); + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_FLUSH, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(handler.process_queue()); + let err_info: u32 = handler.vm_as.read_obj(GuestAddress(0x3000)).unwrap(); + assert_eq!(err_info, VIRTIO_BLK_S_IOERR); + } + + { + // test for other execute type (not IN/OUT) : non_supported + let mut file = DummyFile::new(); + file.capacity = 0x100000; + let mut handler = get_block_epoll_handler_with_file(file); + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_FLUSH + 10, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + 
Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(handler.process_queue()); + let err_info: u32 = handler.vm_as.read_obj(GuestAddress(0x3000)).unwrap(); + assert_eq!(err_info, VIRTIO_BLK_S_UNSUPP); + } + + { + // test for rate limiter + let mut file = DummyFile::new(); + file.capacity = 0x100000; + let mut handler = get_block_epoll_handler_with_file(file); + handler.rate_limiter = RateLimiter::new(0, 0, 0, 1, 0, 100).unwrap(); + handler.rate_limiter.consume(1, TokenType::Ops); + let m = &handler.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_FLUSH, GuestAddress(0x1000)) + .unwrap(); + + handler.queue = VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + ); + assert!(!handler.process_queue()); + // test if rate limited + assert!(handler.rate_limiter.is_blocked()); + } + } + + #[test] + fn test_block_epoll_handler_io_complete() { + let m = &GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + // no data + let mut handler = get_block_epoll_handler(); + let mut data_descs = Vec::with_capacity(CONFIG_MAX_SEG as usize); + assert!(handler.io_complete().is_ok()); + + // have data + let mut file = DummyFile::new(); + file.have_complete_io = true; + let disk_image = Box::new(file); + handler.disk_image = disk_image; + + // no data in pending_req_map + assert!(matches!(handler.io_complete(), Err(Error::InternalError))); + + // data in pending_req_map + let vq = VirtQueue::new(GuestAddress(0), m, 16); + assert!(vq.end().0 < 0x1000); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let mut q = vq.create_queue(); + 
vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x0, 1, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(VIRTIO_BLK_T_IN, GuestAddress(0x1000)) + .unwrap(); + let req = Request::parse( + &mut q.pop_descriptor_chain(m).unwrap(), + &mut data_descs, + 0x100000, + ) + .unwrap(); + handler.pending_req_map.insert(0, req); + handler.io_complete().unwrap(); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/handler.rs b/src/dragonball/src/dbs_virtio_devices/src/block/handler.rs new file mode 100644 index 000000000..08d4d6432 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/handler.rs @@ -0,0 +1,451 @@ +// Copyright 2019-2020 Alibnc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +use std::collections::HashMap; +use std::ops::Deref; +use std::os::unix::io::AsRawFd; +use std::sync::mpsc::{Receiver, Sender}; + +use dbs_utils::{ + epoll_manager::{EventOps, Events, MutEventSubscriber}, + rate_limiter::{BucketUpdate, RateLimiter, TokenType}, +}; +use log::{debug, error, info, warn}; +use virtio_bindings::bindings::virtio_blk::*; +use virtio_queue::{Queue, QueueOwnedT, QueueT}; +use vm_memory::{Bytes, GuestAddress, GuestMemory, GuestMemoryRegion, GuestRegionMmap}; +use vmm_sys_util::eventfd::EventFd; + +use crate::{ + epoll_helper::{EpollHelper, EpollHelperError, EpollHelperHandler}, + DbsGuestAddressSpace, Error, Result, VirtioDeviceConfig, VirtioQueueConfig, +}; + +use super::{ExecuteError, IoDataDesc, KillEvent, Request, RequestType, Ufile, SECTOR_SHIFT}; + +// New descriptors are pending on the virtio queue. +pub const QUEUE_AVAIL_EVENT: u32 = 0; +// Rate limiter budget is now available. 
+pub const RATE_LIMITER_EVENT: u32 = 1; +// Some AIO requests have been completed. Used to support Linux AIO/TDC AIO. +pub const END_IO_EVENT: u32 = 2; +// trigger the thread to deal with some specific event +pub const KILL_EVENT: u32 = 4; + +pub(crate) struct InnerBlockEpollHandler { + pub(crate) disk_image: Box, + pub(crate) disk_image_id: Vec, + pub(crate) rate_limiter: RateLimiter, + pub(crate) pending_req_map: HashMap, + pub(crate) data_desc_vec: Vec>, + pub(crate) iovecs_vec: Vec>, + pub(crate) kill_evt: EventFd, + pub(crate) evt_receiver: Receiver, + + pub(crate) vm_as: AS, + pub(crate) queue: VirtioQueueConfig, +} + +impl InnerBlockEpollHandler { + pub(crate) fn process_queue(&mut self) -> bool { + let as_mem = self.vm_as.memory(); + let mem = as_mem.deref(); + let mut queue = self.queue.queue_mut().lock(); + + let mut iter = match queue.iter(mem) { + Err(e) => { + error!("virtio-blk: failed to iterate queue. {}", e); + return false; + } + Ok(iter) => iter, + }; + + // Used to collect used descriptors. (index, size) + let mut used_desc_vec: Vec<(u16, u32)> = Vec::new(); + let mut rate_limited = false; + + 'next_desc: for mut desc_chain in &mut iter { + // Safe to index data_desc_vec with index, as index has been checked in iterator + let index = desc_chain.head_index(); + let data_descs = &mut self.data_desc_vec[index as usize]; + let iovecs = &mut self.iovecs_vec[index as usize]; + data_descs.clear(); + iovecs.clear(); + match Request::parse(&mut desc_chain, data_descs, self.disk_image.get_max_size()) { + Err(e) => { + // It's caused by invalid request from guest, simple... + debug!("Failed to parse available descriptor chain: {:?}", e); + used_desc_vec.push((index, 0)); + } + Ok(req) => { + if Self::trigger_rate_limit(&mut self.rate_limiter, &req, data_descs) { + // stop processing the queue + rate_limited = true; + break 'next_desc; + } + // We try processing READ/WRITE requests using AIO first, and fallback to + // synchronous processing if it fails. 
+ match Self::process_aio_request( + &req, + data_descs, + iovecs, + &mut self.disk_image, + mem.deref(), + ) { + Ok(submited) => { + if submited { + self.pending_req_map.insert(req.request_index, req.clone()); + continue 'next_desc; + } + // Else not Submited, fallback to synchronous processing + } + Err(_e) => { + req.update_status(mem.deref(), VIRTIO_BLK_S_IOERR); + used_desc_vec.push((index, 0)); + continue 'next_desc; + } + } + // Synchronously execute the request + // Take a new immutable data_descs reference, as previous mutable one may have + // been consumed. + let data_descs = &self.data_desc_vec[req.request_index as usize]; + match Self::process_request( + &req, + &data_descs[..], + &mut self.disk_image, + &self.disk_image_id, + mem.deref(), + ) { + Ok(num_bytes_to_mem) => { + used_desc_vec.push((index, num_bytes_to_mem)); + } + Err(_e) => { + //METRICS.block.execute_fails.inc(); + used_desc_vec.push((index, 0)); + } + } + } + } + } + if rate_limited { + // If rate limiting kicked in, queue had advanced one element that we aborted + // processing; go back one element so it can be processed next time. + // TODO: log rate limit message or METRIC + iter.go_to_previous_position(); + } + drop(queue); + if !used_desc_vec.is_empty() { + for entry in &used_desc_vec { + self.queue.add_used(mem, entry.0, entry.1); + } + true + } else { + false + } + } + + fn trigger_rate_limit( + rate_limiter: &mut RateLimiter, + req: &Request, + data_descs: &[IoDataDesc], + ) -> bool { + // If limiter.consume() fails it means there is no more TokenType::Ops budget + // and rate limiting is in effect. + if !rate_limiter.consume(1, TokenType::Ops) { + // stop processing the queue + return true; + } + // Exercise the rate limiter only if this request is of data transfer type. + if req.request_type == RequestType::In || req.request_type == RequestType::Out { + // If limiter.consume() fails it means there is no more TokenType::Bytes + // budget and rate limiting is in effect. 
+ + if !rate_limiter.consume(u64::from(req.data_len(data_descs)), TokenType::Bytes) { + // Revert the OPS consume(). + rate_limiter.manual_replenish(1, TokenType::Ops); + return true; + } + } + false + } + + fn process_request( + req: &Request, + data_descs: &[IoDataDesc], + disk_image: &mut Box, + disk_image_id: &[u8], + mem: &M, + ) -> std::result::Result { + match req.execute(disk_image, mem.deref(), data_descs, disk_image_id) { + Ok(l) => { + req.update_status(mem.deref(), VIRTIO_BLK_S_OK); + Ok(l) + } + Err(e) => { + let err_code = match &e { + ExecuteError::BadRequest(e) => { + // It's caused by invalid request from guest, simple... + debug!("Failed to execute GetDeviceID request: {:?}", e); + VIRTIO_BLK_S_IOERR + } + ExecuteError::Flush(e) => { + // only temporary errors are possible here + // TODO recovery + debug!("Failed to execute Flush request: {:?}", e); + VIRTIO_BLK_S_IOERR + } + ExecuteError::Read(e) | ExecuteError::Write(e) => { + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // The error recovery policy here is a little messy. + // We can't tell the error type from the returned error code + // and no easy way to recover. + // Hopefully AIO are used and read/write requests never ever + // reaches here when TDC live upgrading is enabled. + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + warn!("virtio-blk: Failed to execute Read/Write request: {:?}", e); + VIRTIO_BLK_S_IOERR + } + ExecuteError::Seek(e) => { + // It's caused by invalid request from guest, simple... + warn!( + "virtio-blk: Failed to execute out-of-boundary request: {:?}", + e + ); + VIRTIO_BLK_S_IOERR + } + ExecuteError::GetDeviceID(e) => { + // It's caused by invalid request from guest, simple... + warn!("virtio-blk: Failed to execute GetDeviceID request: {:?}", e); + VIRTIO_BLK_S_IOERR + } + ExecuteError::Unsupported(e) => { + // It's caused by invalid request from guest, simple... 
+ warn!("virtio-blk: Failed to execute request: {:?}", e); + VIRTIO_BLK_S_UNSUPP + } + }; + + req.update_status(mem.deref(), err_code); + Err(e) + } + } + } + + // TODO: We should hide the logic of this function inside the Ufile implementation, + // instead of appearing here. + fn process_aio_request( + req: &Request, + data_descs: &[IoDataDesc], + iovecs: &mut Vec, + disk_image: &mut Box, + mem: &M, + ) -> std::result::Result { + if req.request_type != RequestType::In && req.request_type != RequestType::Out { + return Ok(false); + } + + req.check_capacity(disk_image, data_descs).map_err(|e| { + // It's caused by invalid request from guest, simple... + debug!("Failed to get buffer address for request"); + e + })?; + + for io in data_descs { + let host_addr = mem + .get_host_address(GuestAddress(io.data_addr)) + .map_err(|e| { + // It's caused by invalid request from guest, simple... + warn!( + "virtio-blk: Failed to get buffer guest address {:?} for request {:?}", + io.data_addr, req + ); + ExecuteError::BadRequest(Error::GuestMemory(e)) + })?; + iovecs.push(IoDataDesc { + data_addr: host_addr as u64, + data_len: io.data_len, + }); + } + + let submiter: fn( + &mut (dyn Ufile + 'static), + i64, + &mut Vec, + u16, + ) -> std::io::Result = match req.request_type { + RequestType::In => Ufile::io_read_submit, + RequestType::Out => Ufile::io_write_submit, + _ => panic!( + "virtio-blk: unexpected request type {:?} in async I/O", + req.request_type + ), + }; + + match submiter( + disk_image.as_mut(), + (req.sector << SECTOR_SHIFT) as i64, + iovecs, + req.request_index, + ) { + Ok(_) => { + // The request has been queued waiting for process + Ok(true) + } + Err(e) => { + warn!("virtio-blk: submit request {:?} error. 
{}", req, e); + // Failure may be caused by: + // no enough resource to queue the AIO request + // TODO recover + + // Now fallback to synchronous processing + Ok(false) + } + } + } + + pub(crate) fn io_complete(&mut self) -> Result<()> { + let as_mem = self.vm_as.memory(); + let mem: &AS::M = as_mem.deref(); + let iovs = self.disk_image.io_complete()?; + + // No data to handle + if iovs.is_empty() { + return Ok(()); + } + + for (index, res2) in &iovs { + match self.pending_req_map.remove(index) { + Some(req) => { + // Just ignore the result of write_obj(). Though we have validated + // request.status_addr, but we have released and reacquired the + // guest memory object and the guest may have hot-removed the + // memory maliciously. + let _ = mem.write_obj(*res2 as u8, req.status_addr); + let data_descs = &self.data_desc_vec[req.request_index as usize]; + let len = match req.request_type { + RequestType::In => req.data_len(data_descs), + RequestType::Out => 0, + _ => panic!( + "virtio-blk: unexpected request type {:?} in async I/O completion", + req.request_type + ), + }; + self.queue.add_used(mem, req.request_index, len); + } + None => { + error!("virtio-blk: Cant't find request for AIO completion event."); + // We have run into inconsistent state, let the device manager to do recovery. 
+ return Err(Error::InternalError); + } + } + } + self.queue.notify() + } + + pub(crate) fn get_patch_rate_limiters(&mut self, bytes: BucketUpdate, ops: BucketUpdate) { + self.rate_limiter.update_buckets(bytes, ops); + info!( + "virtio-blk: Update rate limiter for block device {:?}", + String::from_utf8(self.disk_image_id.clone()) + ); + } + + pub(crate) fn run(&mut self) -> std::result::Result<(), EpollHelperError> { + let mut helper = EpollHelper::new()?; + helper.add_event(self.queue.eventfd.as_raw_fd(), QUEUE_AVAIL_EVENT)?; + helper.add_event_custom( + self.disk_image.get_data_evt_fd(), + END_IO_EVENT, + epoll::Events::EPOLLIN | epoll::Events::EPOLLET, + )?; + + helper.add_event(self.rate_limiter.as_raw_fd(), RATE_LIMITER_EVENT)?; + + helper.add_event(self.kill_evt.as_raw_fd(), KILL_EVENT)?; + + helper.run(self)?; + + Ok(()) + } +} + +impl EpollHelperHandler for InnerBlockEpollHandler { + fn handle_event(&mut self, _helper: &mut EpollHelper, event: &epoll::Event) -> bool { + let slot = event.data as u32; + match slot { + QUEUE_AVAIL_EVENT => { + if let Err(e) = self.queue.consume_event() { + error!("virtio-blk: failed to get queue event: {:?}", e); + return true; + } else if self.rate_limiter.is_blocked() { + // While limiter is blocked, don't process any more requests. + } else if self.process_queue() { + self.queue + .notify() + .expect("virtio-blk: failed to notify guest"); + } + } + END_IO_EVENT => { + // NOTE: Here we should drain io event fd, but different Ufile implementations + // may use different Events, and complete may depend on the count of reads from + // within io event. so leave it to IoEngine::complete to drain event fd. + // io_complete() only returns permanent errors. + self.io_complete() + .expect("virtio-blk: failed to complete IO requests"); + } + RATE_LIMITER_EVENT => { + // Upon rate limiter event, call the rate limiter handler + // and restart processing the queue. 
+ if self.rate_limiter.event_handler().is_ok() && self.process_queue() { + self.queue + .notify() + .expect("virtio-blk: failed to notify guest"); + } + } + KILL_EVENT => { + let _ = self.kill_evt.read(); + while let Ok(evt) = self.evt_receiver.try_recv() { + match evt { + KillEvent::Kill => { + info!("virtio-blk: KILL_EVENT received, stopping inner epoll handler loop"); + + return true; + } + KillEvent::BucketUpdate(bytes, ops) => { + info!( + "virtio-blk: patch the io limiter bucket: {:?}, {:?}", + &bytes, &ops + ); + self.get_patch_rate_limiters(bytes, ops); + } + } + } + } + _ => panic!("virtio_blk: unknown event slot {}", slot), + } + false + } +} + +#[allow(dead_code)] +pub(crate) struct BlockEpollHandler< + AS: DbsGuestAddressSpace, + Q: QueueT + Send = Queue, + R: GuestMemoryRegion = GuestRegionMmap, +> { + pub(crate) evt_senders: Vec>, + pub(crate) kill_evts: Vec, + pub(crate) config: VirtioDeviceConfig, +} + +impl MutEventSubscriber + for BlockEpollHandler +{ + // a dumb impl for BlockEpollHandler to registe event manager for io drain. + fn process(&mut self, _events: Events, _ops: &mut EventOps) {} + fn init(&mut self, _ops: &mut EventOps) {} +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/block/mod.rs new file mode 100644 index 000000000..a98d159b5 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/mod.rs @@ -0,0 +1,30 @@ +// Copyright 2019-2020 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +mod device; +pub use self::device::*; +mod handler; +pub(crate) use self::handler::*; +mod request; +pub(crate) use self::request::*; +mod ufile; +pub use self::ufile::*; + +use dbs_utils::rate_limiter::BucketUpdate; + +/// Block deriver name. +pub const BLK_DRIVER_NAME: &str = "virtio-blk"; + +pub(crate) const SECTOR_SHIFT: u8 = 9; +/// The size of sector +pub const SECTOR_SIZE: u64 = (0x01u64) << (SECTOR_SHIFT as u64); + +pub(crate) enum KillEvent { + Kill, + BucketUpdate(BucketUpdate, BucketUpdate), +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/request.rs b/src/dragonball/src/dbs_virtio_devices/src/block/request.rs new file mode 100644 index 000000000..6a85fcf81 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/request.rs @@ -0,0 +1,305 @@ +// Copyright 2019-2020 Alibaba Cloud. All rights reserved. +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::io::{self, Seek, SeekFrom, Write}; +use std::ops::Deref; +use std::result; + +use log::error; +use virtio_bindings::bindings::virtio_blk::*; +use virtio_queue::{Descriptor, DescriptorChain}; +use vm_memory::{ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError}; + +use crate::{ + block::{ufile::Ufile, SECTOR_SHIFT, SECTOR_SIZE}, + Error, Result, +}; + +/// Error executing request. +#[derive(Debug)] +pub(crate) enum ExecuteError { + BadRequest(Error), + Flush(io::Error), + Read(GuestMemoryError), + Seek(io::Error), + Write(GuestMemoryError), + GetDeviceID(GuestMemoryError), + Unsupported(u32), +} + +/// Type of request from driver to device. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum RequestType { + /// Read request. + In, + /// Write request. + Out, + /// Flush request. + Flush, + /// Get device ID request. + GetDeviceID, + /// Unsupported request. 
+ Unsupported(u32), +} + +impl From for RequestType { + fn from(value: u32) -> Self { + match value { + VIRTIO_BLK_T_IN => RequestType::In, + VIRTIO_BLK_T_OUT => RequestType::Out, + VIRTIO_BLK_T_FLUSH => RequestType::Flush, + VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, + t => RequestType::Unsupported(t), + } + } +} + +/// The request header represents the mandatory fields of each block device request. +/// +/// A request header contains the following fields: +/// * request_type: an u32 value mapping to a read, write or flush operation. +/// * reserved: 32 bits are reserved for future extensions of the Virtio Spec. +/// * sector: an u64 value representing the offset where a read/write is to occur. +/// +/// The header simplifies reading the request from memory as all request follow +/// the same memory layout. +#[derive(Copy, Clone, Default)] +#[repr(C)] +struct RequestHeader { + request_type: u32, + _reserved: u32, + sector: u64, +} + +// Safe because RequestHeader only contains plain data. +unsafe impl ByteValued for RequestHeader {} + +impl RequestHeader { + /// Reads the request header from GuestMemory starting at `addr`. + /// + /// Virtio 1.0 specifies that the data is transmitted by the driver in little-endian + /// format. Firecracker currently runs only on little endian platforms so we don't + /// need to do an explicit little endian read as all reads are little endian by default. + /// When running on a big endian platform, this code should not compile, and support + /// for explicit little endian reads is required. + #[cfg(target_endian = "little")] + fn read_from(memory: &M, addr: GuestAddress) -> Result { + memory.read_obj(addr).map_err(Error::GuestMemory) + } +} + +/// IO Data descriptor. +#[derive(Clone, Debug)] +#[repr(C)] +pub struct IoDataDesc { + pub data_addr: u64, + pub data_len: usize, +} + +/// The block request. +#[derive(Clone, Debug)] +pub struct Request { + /// The type of the request. 
+ pub(crate) request_type: RequestType, + /// The offset of the request. + pub(crate) sector: u64, + pub(crate) status_addr: GuestAddress, + pub(crate) request_index: u16, +} + +impl Request { + /// Parses a `desc_chain` and returns the associated `Request`. + pub(crate) fn parse( + desc_chain: &mut DescriptorChain, + data_descs: &mut Vec, + max_size: u32, + ) -> Result + where + M: Deref, + M::Target: GuestMemory, + { + let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; + // The head contains the request type which MUST be readable. + if desc.is_write_only() { + return Err(Error::UnexpectedWriteOnlyDescriptor); + } + + let request_header = RequestHeader::read_from(desc_chain.memory(), desc.addr())?; + let mut req = Request { + request_type: RequestType::from(request_header.request_type), + sector: request_header.sector, + status_addr: GuestAddress(0), + request_index: desc_chain.head_index(), + }; + let status_desc; + let mut desc = desc_chain + .next() + .ok_or(Error::DescriptorChainTooShort) + .map_err(|e| { + error!("virtio-blk: Request {:?} has only head descriptor", req); + e + })?; + if !desc.has_next() { + status_desc = desc; + // Only flush requests are allowed to skip the data descriptor. + if req.request_type != RequestType::Flush { + error!("virtio-blk: Request {:?} need a data descriptor", req); + return Err(Error::DescriptorChainTooShort); + } + } else { + while desc.has_next() { + req.check_request(desc, max_size)?; + data_descs.push(IoDataDesc { + data_addr: desc.addr().0, + data_len: desc.len() as usize, + }); + desc = desc_chain + .next() + .ok_or(Error::DescriptorChainTooShort) + .map_err(|e| { + error!("virtio-blk: descriptor chain corrupted"); + e + })?; + } + status_desc = desc; + } + + // The status MUST always be writable and the guest address must be accessible. 
+ if !status_desc.is_write_only() { + return Err(Error::UnexpectedReadOnlyDescriptor); + } + if status_desc.len() < 1 { + return Err(Error::DescriptorLengthTooSmall); + } + if !desc_chain.memory().address_in_range(status_desc.addr()) { + return Err(Error::InvalidGuestAddress(status_desc.addr())); + } + req.status_addr = status_desc.addr(); + + Ok(req) + } + + pub(crate) fn check_request(&self, desc: Descriptor, max_size: u32) -> Result<()> { + match self.request_type { + RequestType::Out => { + if desc.is_write_only() { + error!( + "virtio-blk: Request {:?} sees unexpected write-only descriptor", + self + ); + return Err(Error::UnexpectedWriteOnlyDescriptor); + } else if desc.len() > max_size { + error!( + "virtio-blk: Request {:?} size is greater than disk size ({} > {})", + self, + desc.len(), + max_size + ); + return Err(Error::DescriptorLengthTooBig); + } + } + RequestType::In => { + if !desc.is_write_only() { + error!( + "virtio-blk: Request {:?} sees unexpected read-only descriptor for read", + self + ); + return Err(Error::UnexpectedReadOnlyDescriptor); + } else if desc.len() > max_size { + error!( + "virtio-blk: Request {:?} size is greater than disk size ({} > {})", + self, + desc.len(), + max_size + ); + return Err(Error::DescriptorLengthTooBig); + } + } + RequestType::GetDeviceID if !desc.is_write_only() => { + error!( + "virtio-blk: Request {:?} sees unexpected read-only descriptor for GetDeviceID", + self + ); + return Err(Error::UnexpectedReadOnlyDescriptor); + } + _ => {} + } + Ok(()) + } + + pub(crate) fn execute( + &self, + disk: &mut Box, + mem: &M, + data_descs: &[IoDataDesc], + disk_id: &[u8], + ) -> result::Result { + self.check_capacity(disk, data_descs)?; + disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT)) + .map_err(ExecuteError::Seek)?; + let mut len = 0; + for io in data_descs { + match self.request_type { + RequestType::In => { + mem.read_from(GuestAddress(io.data_addr), disk, io.data_len) + .map_err(ExecuteError::Read)?; + len += 
io.data_len; + } + RequestType::Out => { + mem.write_to(GuestAddress(io.data_addr), disk, io.data_len) + .map_err(ExecuteError::Write)?; + } + RequestType::Flush => match disk.flush() { + Ok(_) => {} + Err(e) => return Err(ExecuteError::Flush(e)), + }, + RequestType::GetDeviceID => { + if io.data_len < disk_id.len() { + return Err(ExecuteError::BadRequest(Error::InvalidOffset)); + } + mem.write_slice(disk_id, GuestAddress(io.data_addr)) + .map_err(ExecuteError::GetDeviceID)?; + // TODO: dragonball returns 0 here, check which value to return? + return Ok(disk_id.len() as u32); + } + RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)), + }; + } + + Ok(len as u32) + } + + pub(crate) fn check_capacity( + &self, + disk: &mut Box, + data_descs: &[IoDataDesc], + ) -> result::Result<(), ExecuteError> { + for d in data_descs { + let mut top = (d.data_len as u64 + SECTOR_SIZE - 1) & !(SECTOR_SIZE - 1u64); + + top = top + .checked_add(self.sector << SECTOR_SHIFT) + .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?; + if top > disk.get_capacity() { + return Err(ExecuteError::BadRequest(Error::InvalidOffset)); + } + } + + Ok(()) + } + + pub(crate) fn update_status(&self, mem: &M, status: u32) { + // Safe to unwrap because we have validated request.status_addr in parse() + mem.write_obj(status as u8, self.status_addr).unwrap(); + } + + // Return total IO length of all segments. Assume the req has been checked and is valid. + pub(crate) fn data_len(&self, data_descs: &[IoDataDesc]) -> u32 { + let mut len = 0; + for d in data_descs { + len += d.data_len; + } + len as u32 + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/ufile/aio.rs b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/aio.rs new file mode 100644 index 000000000..418f29c6f --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/aio.rs @@ -0,0 +1,173 @@ +// Copyright 2022 Alibaba Cloud. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +use std::io; +use std::os::unix::io::{AsRawFd, RawFd}; + +use vmm_sys_util::aio::{IoContext, IoControlBlock, IoEvent, IOCB_FLAG_RESFD}; +use vmm_sys_util::aio::{IOCB_CMD_PREADV, IOCB_CMD_PWRITEV}; +use vmm_sys_util::eventfd::EventFd; + +use super::IoEngine; +use crate::block::IoDataDesc; + +/// Use AIO to perform asynchronous IO requests. +pub struct Aio { + fd: RawFd, + aio_evtfd: EventFd, + aio_context: IoContext, +} + +impl Aio { + /// Creates a new Aio instence. + /// + /// # Arguments + /// * `nr_events`: maximum number of concurrently processing IO operations. + pub fn new(fd: RawFd, nr_events: u32) -> io::Result { + let aio_context = IoContext::new(nr_events)?; + Ok(Self { + fd, + aio_evtfd: EventFd::new(0)?, + aio_context, + }) + } +} + +impl IoEngine for Aio { + fn event_fd(&self) -> &EventFd { + &self.aio_evtfd + } + + // NOTE: aio doesn't seem to support negative offsets. + fn readv( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u64, + ) -> io::Result { + let iocbs = [&mut IoControlBlock { + aio_fildes: self.fd as u32, + aio_lio_opcode: IOCB_CMD_PREADV as u16, + aio_resfd: self.aio_evtfd.as_raw_fd() as u32, + aio_flags: IOCB_FLAG_RESFD, + aio_buf: iovecs.as_mut_ptr() as u64, + aio_offset: offset, + aio_nbytes: iovecs.len() as u64, + aio_data: user_data, + ..Default::default() + }]; + + self.aio_context.submit(&iocbs[..]) + } + + fn writev( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u64, + ) -> io::Result { + let iocbs = [&mut IoControlBlock { + aio_fildes: self.fd as u32, + aio_lio_opcode: IOCB_CMD_PWRITEV as u16, + aio_resfd: self.aio_evtfd.as_raw_fd() as u32, + aio_flags: IOCB_FLAG_RESFD, + aio_buf: iovecs.as_mut_ptr() as u64, + aio_offset: offset, + aio_nbytes: iovecs.len() as u64, + aio_data: user_data, + ..Default::default() + }]; + + self.aio_context.submit(&iocbs[..]) + } + + // For currently supported LocalFile and TdcFile backend, it must not return temporary errors 
+ // and may only return permanent errors. So the virtio-blk driver layer will not try to + // recover and only pass errors up onto the device manager. When changing the error handling + // policy, please do help to update BlockEpollHandler::io_complete(). + fn complete(&mut self) -> io::Result> { + let count = self.aio_evtfd.read()?; + let mut v = Vec::with_capacity(count as usize); + if count > 0 { + let mut events = + vec![ + unsafe { std::mem::MaybeUninit::::zeroed().assume_init() }; + count as usize + ]; + while v.len() < count as usize { + let r = self.aio_context.get_events(1, &mut events[0..], None)?; + for event in events.iter().take(r) { + let index = event.data; + let res2 = event.res; + v.push((index, res2)); + } + } + } + Ok(v) + } +} + +#[cfg(test)] +mod tests { + use std::io::{Read, Seek, SeekFrom, Write}; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + + #[test] + fn aio_engine() { + let temp_file = TempFile::new().unwrap(); + let mut aio = Aio::new(temp_file.as_file().as_raw_fd(), 128).unwrap(); + let buf = vec![0xffu8; 0x1000]; + aio.writev( + 0, + &mut vec![IoDataDesc { + data_addr: buf.as_ptr() as u64, + data_len: 0x10, + }], + 0x123, + ) + .unwrap(); + let com_res = aio.complete().unwrap(); + for cr in com_res { + assert_eq!(cr.0, 0x123); + assert_eq!(cr.1, 0x10); + } + let mut rbuf = vec![0u8; 0x100]; + let rn = temp_file.as_file().read(&mut rbuf).unwrap(); + assert_eq!(rn, 0x10); + assert_eq!(&rbuf[..0x10], &vec![0xff; 0x10]); + + //temp_file.as_file().seek(SeekFrom::End(0x20)).unwrap(); + temp_file.as_file().seek(SeekFrom::Start(0x120)).unwrap(); + temp_file.as_file().write_all(&[0xeeu8; 0x20]).unwrap(); + + let rbuf = vec![0u8; 0x100]; + let ret = aio.readv( + -0x20, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 0x20, + }], + 0x456, + ); + assert_eq!(ret.unwrap_err().kind(), io::ErrorKind::InvalidInput); + aio.readv( + 0x120, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 
0x20, + }], + 0x456, + ) + .unwrap(); + let com_res = aio.complete().unwrap(); + for cr in com_res { + assert_eq!(cr.0, 0x456); + assert_eq!(cr.1, 0x20); + } + assert_eq!(&rbuf[..0x20], &vec![0xee; 0x20]); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/ufile/io_uring.rs b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/io_uring.rs new file mode 100644 index 000000000..d7eb30868 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/io_uring.rs @@ -0,0 +1,263 @@ +// Copyright 2022 Alibaba Cloud. All rights reserved. +// Copyright © 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::io; +use std::os::unix::io::{AsRawFd, RawFd}; + +use io_uring::{opcode, squeue, types, Probe}; +use log::info; +use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK}; + +use super::IoEngine; +use crate::block::IoDataDesc; + +/// Use io_uring to perform asynchronous IO requests. +pub struct IoUring { + fd: RawFd, + io_uring: io_uring::IoUring, + evtfd: EventFd, +} + +impl IoUring { + /// Creates a new IoUring instance. + /// + /// # Arguments + /// * `entries`: size of queue, and its value should be the power of two. + pub fn new(fd: RawFd, entries: u32) -> io::Result { + let io_uring = io_uring::IoUring::new(entries)?; + let evtfd = EventFd::new(EFD_NONBLOCK)?; + + // Register the io_uring eventfd that will notify when something in + // the completion queue is ready. + io_uring.submitter().register_eventfd(evtfd.as_raw_fd())?; + + Ok(Self { + fd, + evtfd, + io_uring, + }) + } + + /// Check if io_uring for block device can be used on the current system, as + /// it correctly supports the expected io_uring features. + pub fn is_supported() -> bool { + let error_msg = "io_uring not supported:"; + + // Check we can create an io_uring instance, which effectively verifies + // that io_uring_setup() syscall is supported. 
+ let io_uring = match io_uring::IoUring::new(1) { + Ok(io_uring) => io_uring, + Err(e) => { + info!("{} failed to create io_uring instance: {}", error_msg, e); + return false; + } + }; + + let submitter = io_uring.submitter(); + + let mut probe = Probe::new(); + + // Check we can register a probe to validate supported operations. + match submitter.register_probe(&mut probe) { + Ok(_) => {} + Err(e) => { + info!("{} failed to register a probe: {}", error_msg, e); + return false; + } + } + + // Check IORING_OP_READ is supported + if !probe.is_supported(opcode::Read::CODE) { + info!("{} IORING_OP_READ operation not supported", error_msg); + return false; + } + + // Check IORING_OP_WRITE is supported + if !probe.is_supported(opcode::Write::CODE) { + info!("{} IORING_OP_WRITE operation not supported", error_msg); + return false; + } + + true + } +} + +impl IoEngine for IoUring { + fn event_fd(&self) -> &EventFd { + &self.evtfd + } + + fn readv( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u64, + ) -> io::Result { + let (submit, mut sq, _cq) = self.io_uring.split(); + + // Safe because we know the file descriptor is valid and we + // relied on vm-memory to provide the buffer address. + let _ = unsafe { + sq.push( + &opcode::Readv::new( + types::Fd(self.fd), + iovecs.as_ptr() as *const libc::iovec, + iovecs.len() as u32, + ) + .offset(offset) + .build() + .flags(squeue::Flags::ASYNC) + .user_data(user_data), + ) + }; + + // Update the submission queue and submit new operations to the + // io_uring instance. + sq.sync(); + submit.submit() + } + + fn writev( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u64, + ) -> io::Result { + let (submit, mut sq, _cq) = self.io_uring.split(); + + // Safe because we know the file descriptor is valid and we + // relied on vm-memory to provide the buffer address. 
+ let _ = unsafe { + sq.push( + &opcode::Writev::new( + types::Fd(self.fd), + iovecs.as_ptr() as *const libc::iovec, + iovecs.len() as u32, + ) + .offset(offset) + .build() + .flags(squeue::Flags::ASYNC) + .user_data(user_data), + ) + }; + + // Update the submission queue and submit new operations to the + // io_uring instance. + sq.sync(); + submit.submit() + } + + fn complete(&mut self) -> io::Result> { + let _ = self.evtfd.read()?; + let mut completion_list = Vec::new(); + + let cq = self.io_uring.completion(); + for cq_entry in cq { + completion_list.push((cq_entry.user_data(), cq_entry.result() as i64)); + } + + Ok(completion_list) + } +} + +#[cfg(test)] +mod tests { + use std::io::{Read, Seek, SeekFrom, Write}; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::epoll_helper::*; + + struct TestHandler; + + impl EpollHelperHandler for TestHandler { + fn handle_event(&mut self, _helper: &mut EpollHelper, event: &epoll::Event) -> bool { + let slot = event.data as u32; + slot == 0xfeed + } + } + + #[test] + fn iouring_engine() { + if !IoUring::is_supported() { + return; + } + let temp_file = TempFile::new().unwrap(); + let mut uring = IoUring::new(temp_file.as_file().as_raw_fd(), 128).unwrap(); + + let mut helper = EpollHelper::new().unwrap(); + helper + .add_event(uring.event_fd().as_raw_fd(), 0xfeed) + .unwrap(); + + let mut handler = TestHandler; + + let buf = vec![0xffu8; 0x1000]; + uring + .writev( + 0, + &mut vec![IoDataDesc { + data_addr: buf.as_ptr() as u64, + data_len: 0x10, + }], + 0x123, + ) + .unwrap(); + + helper.run(&mut handler).unwrap(); + + let com_res = uring.complete().unwrap(); + for cr in com_res { + assert_eq!(cr.0, 0x123); + assert_eq!(cr.1, 0x10); + } + let mut rbuf = vec![0u8; 0x100]; + let rn = temp_file.as_file().read(&mut rbuf).unwrap(); + assert_eq!(rn, 0x10); + assert_eq!(&rbuf[..0x10], &vec![0xff; 0x10]); + + //temp_file.as_file().seek(SeekFrom::End(0x20)).unwrap(); + 
temp_file.as_file().seek(SeekFrom::Start(0x120)).unwrap(); + temp_file.as_file().write_all(&[0xeeu8; 0x20]).unwrap(); + + let rbuf = vec![0u8; 0x100]; + let ret = uring.readv( + -0x120, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 0x20, + }], + 0x456, + ); + assert_eq!(ret.unwrap(), 1); + helper.run(&mut handler).unwrap(); + let com_res = uring.complete().unwrap(); + for cr in com_res { + assert_eq!(cr.0, 0x456); + assert_eq!(cr.1, -22); + } + + uring + .readv( + 0x120, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 0x20, + }], + 0x456, + ) + .unwrap(); + + helper.run(&mut handler).unwrap(); + + let com_res = uring.complete().unwrap(); + for cr in com_res { + assert_eq!(cr.0, 0x456); + assert_eq!(cr.1, 0x20); + } + assert_eq!(&rbuf[..0x20], &vec![0xee; 0x20]); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/ufile/localfile.rs b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/localfile.rs new file mode 100644 index 000000000..b45d87e2a --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/localfile.rs @@ -0,0 +1,480 @@ +// Copyright 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::fs::File; +use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::mem::ManuallyDrop; +use std::os::linux::fs::MetadataExt; +use std::os::unix::io::{AsRawFd, RawFd}; + +use log::{info, warn}; +use virtio_bindings::bindings::virtio_blk::{VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK}; + +use super::{IoDataDesc, IoEngine, Ufile}; + +pub struct LocalFile { + pub(crate) file: ManuallyDrop, + no_drop: bool, + capacity: u64, + io_engine: E, +} + +impl LocalFile { + /// Creates a LocalFile instance. 
+ pub fn new(mut file: File, no_drop: bool, io_engine: E) -> io::Result { + let capacity = file.seek(SeekFrom::End(0))?; + + Ok(Self { + file: ManuallyDrop::new(file), + no_drop, + capacity, + io_engine, + }) + } +} + +// Implement our own Drop for LocalFile, as we don't want to close LocalFile.file if no_drop is +// enabled. +impl Drop for LocalFile { + fn drop(&mut self) { + if self.no_drop { + info!("LocalFile: no_drop is enabled, don't close file on drop"); + } else { + // Close the raw fd directly. + let fd = self.file.as_raw_fd(); + if let Err(e) = nix::unistd::close(fd) { + warn!("LocalFile: failed to close disk file: {:?}", e); + } + } + } +} + +impl Read for LocalFile { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.file.read(buf) + } +} + +impl Write for LocalFile { + fn write(&mut self, buf: &[u8]) -> io::Result { + self.file.write(buf) + } + + fn flush(&mut self) -> io::Result<()> { + self.file.flush() + } +} + +impl Seek for LocalFile { + fn seek(&mut self, pos: SeekFrom) -> io::Result { + self.file.seek(pos) + } +} + +impl Ufile for LocalFile { + fn get_capacity(&self) -> u64 { + self.capacity + } + + fn get_max_size(&self) -> u32 { + // Set max size to 1M to avoid interferes with rate limiter. + 0x100000 + } + + fn get_device_id(&self) -> io::Result { + let blk_metadata = self.file.metadata()?; + // This is how kvmtool does it. + Ok(format!( + "{}{}{}", + blk_metadata.st_dev(), + blk_metadata.st_rdev(), + blk_metadata.st_ino() + )) + } + + fn get_data_evt_fd(&self) -> RawFd { + self.io_engine.event_fd().as_raw_fd() + } + + fn io_read_submit( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u16, + ) -> io::Result { + self.io_engine.readv(offset, iovecs, user_data as u64) + } + + fn io_write_submit( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u16, + ) -> io::Result { + self.io_engine.writev(offset, iovecs, user_data as u64) + } + + fn io_complete(&mut self) -> io::Result> { + Ok(self + .io_engine + .complete()? 
+ .iter() + .map(|(user_data, res)| { + ( + *user_data as u16, + if *res >= 0 { + VIRTIO_BLK_S_OK + } else { + VIRTIO_BLK_S_IOERR + }, + ) + }) + .collect()) + } +} + +#[cfg(test)] +mod tests { + use std::ffi::OsStr; + use std::io::SeekFrom; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::block::aio::Aio; + use crate::block::io_uring::IoUring; + use crate::epoll_helper::*; + + const STOP_EVENT: u32 = 0xfeed; + + struct TestHandler; + + impl EpollHelperHandler for TestHandler { + fn handle_event(&mut self, _helper: &mut EpollHelper, event: &epoll::Event) -> bool { + let slot = event.data as u32; + slot == STOP_EVENT + } + } + + fn new_aio_engine() -> Aio { + let temp_file = TempFile::new().unwrap(); + let aio = Aio::new(temp_file.as_file().as_raw_fd(), 128).unwrap(); + aio + } + + fn new_iouring_engine() -> IoUring { + let temp_file = TempFile::new().unwrap(); + let iouring = IoUring::new(temp_file.as_file().as_raw_fd(), 128).unwrap(); + iouring + } + + #[test] + fn test_new() { + // Create with AIO. + let file = TempFile::new().unwrap().into_file(); + let file_with_aio = LocalFile::new(file, false, new_aio_engine()); + assert!(file_with_aio.is_ok()); + + // Create with IO_Uring. + let file = TempFile::new().unwrap().into_file(); + let file_with_iouring = LocalFile::new(file, false, new_iouring_engine()); + assert!(file_with_iouring.is_ok()); + } + + fn have_target_fd(fd: i32, filename: &OsStr) -> bool { + let mut path = std::path::PathBuf::from("/proc/self/fd"); + path.push(fd.to_string()); + if path.exists() { + let entry = path.read_link().unwrap(); + if entry + .file_name() + .unwrap() + .to_str() + .unwrap() + .contains(filename.to_str().unwrap()) + { + return true; + } + } + false + } + + #[test] + fn test_drop() { + // Droped case. 
+ let tempfile = TempFile::new().unwrap(); + let filename = tempfile.as_path().file_name().unwrap().to_owned(); + let file = tempfile.into_file(); + let fd_of_file = file.as_raw_fd(); + let file_with_aio = LocalFile::new(file, false, new_aio_engine()).unwrap(); + + assert!(have_target_fd(fd_of_file, &filename)); + drop(file_with_aio); + assert!(!have_target_fd(fd_of_file, &filename)); + + let tempfile = TempFile::new().unwrap(); + let filename = tempfile.as_path().file_name().unwrap().to_owned(); + let file = tempfile.into_file(); + let fd_of_file = file.as_raw_fd(); + let file_with_iouring = LocalFile::new(file, false, new_iouring_engine()).unwrap(); + + assert!(have_target_fd(fd_of_file, &filename)); + drop(file_with_iouring); + assert!(!have_target_fd(fd_of_file, &filename)); + + // No-drop case. + let tempfile = TempFile::new().unwrap(); + let filename = tempfile.as_path().file_name().unwrap().to_owned(); + let file = tempfile.into_file(); + let fd_of_file = file.as_raw_fd(); + let file_with_aio = LocalFile::new(file, true, new_aio_engine()).unwrap(); + + assert!(have_target_fd(fd_of_file, &filename)); + drop(file_with_aio); + assert!(have_target_fd(fd_of_file, &filename)); + + let tempfile = TempFile::new().unwrap(); + let filename = tempfile.as_path().file_name().unwrap().to_owned(); + let file = tempfile.into_file(); + let fd_of_file = file.as_raw_fd(); + let file_with_iouring = LocalFile::new(file, true, new_iouring_engine()).unwrap(); + + assert!(have_target_fd(fd_of_file, &filename)); + drop(file_with_iouring); + assert!(have_target_fd(fd_of_file, &filename)); + } + + #[test] + fn test_read_write_flush_seek() { + let original_content = b"hello world"; + let size_of_content = original_content.len(); + let file = TempFile::new().unwrap().into_file(); + let mut file_with_aio = LocalFile::new(file, false, new_aio_engine()).unwrap(); + let bytes_write = file_with_aio.write(original_content).unwrap(); + assert_eq!(bytes_write, size_of_content); + 
file_with_aio.flush().unwrap(); + file_with_aio.rewind().unwrap(); + let mut content = vec![0; 11]; + let bytes_read = file_with_aio.read(&mut content).unwrap(); + assert_eq!(bytes_read, size_of_content); + assert_eq!(content, original_content); + + let original_content = b"hello world"; + let file = TempFile::new().unwrap().into_file(); + let mut file_with_iouring = LocalFile::new(file, false, new_iouring_engine()).unwrap(); + let bytes_write = file_with_iouring.write(original_content).unwrap(); + assert_eq!(bytes_write, size_of_content); + file_with_iouring.flush().unwrap(); + let start: usize = 6; + file_with_iouring + .seek(SeekFrom::Start(start as u64)) + .unwrap(); + let mut content = vec![0; size_of_content - start]; + let bytes_read = file_with_iouring.read(&mut content).unwrap(); + assert_eq!(bytes_read, size_of_content - start); + assert_eq!(content, original_content[start..]); + } + + #[test] + fn test_get_capacity() { + let mut file = TempFile::new().unwrap().into_file(); + let original_content = b"hello world"; + let size_of_content = original_content.len(); + let bytes_write = file.write(original_content).unwrap(); + assert_eq!(bytes_write, size_of_content); + file.rewind().unwrap(); + let file_with_aio = LocalFile::new(file, false, new_aio_engine()).unwrap(); + assert_eq!(file_with_aio.get_capacity(), size_of_content as u64); + + let mut file = TempFile::new().unwrap().into_file(); + let original_content = b"hello world"; + let size_of_content = original_content.len(); + let bytes_write = file.write(original_content).unwrap(); + assert_eq!(bytes_write, size_of_content); + file.rewind().unwrap(); + let file_with_iouring = LocalFile::new(file, false, new_iouring_engine()).unwrap(); + assert_eq!(file_with_iouring.get_capacity(), size_of_content as u64); + } + + #[test] + fn test_get_max_capacity() { + let file = TempFile::new().unwrap().into_file(); + let file_with_aio = LocalFile::new(file, false, new_aio_engine()).unwrap(); + 
assert_eq!(file_with_aio.get_max_size(), 0x100000); + + let file = TempFile::new().unwrap().into_file(); + let file_with_iouring = LocalFile::new(file, false, new_iouring_engine()).unwrap(); + assert_eq!(file_with_iouring.get_max_size(), 0x100000); + } + + #[test] + fn test_get_device_id() { + let file = TempFile::new().unwrap().into_file(); + let file_with_aio = LocalFile::new(file, false, new_aio_engine()).unwrap(); + assert!(file_with_aio.get_device_id().is_ok()); + let metadata = file_with_aio.file.metadata().unwrap(); + assert_eq!( + file_with_aio.get_device_id().unwrap(), + format!( + "{}{}{}", + metadata.st_dev(), + metadata.st_rdev(), + metadata.st_ino() + ) + ); + + let file = TempFile::new().unwrap().into_file(); + let file_with_iouring = LocalFile::new(file, false, new_iouring_engine()).unwrap(); + assert!(file_with_iouring.get_device_id().is_ok()); + let metadata = file_with_iouring.file.metadata().unwrap(); + assert_eq!( + file_with_iouring.get_device_id().unwrap(), + format!( + "{}{}{}", + metadata.st_dev(), + metadata.st_rdev(), + metadata.st_ino() + ) + ); + } + + #[test] + fn test_get_data_evt_fd() { + let file = TempFile::new().unwrap(); + let aio = Aio::new(file.as_file().as_raw_fd(), 128).unwrap(); + let file_with_aio = LocalFile::new(file.into_file(), false, aio).unwrap(); + assert_eq!( + file_with_aio.get_data_evt_fd(), + file_with_aio.io_engine.event_fd().as_raw_fd() + ); + + let file = TempFile::new().unwrap(); + let iouring = IoUring::new(file.as_file().as_raw_fd(), 128).unwrap(); + let file_with_iouring = LocalFile::new(file.into_file(), false, iouring).unwrap(); + assert_eq!( + file_with_iouring.get_data_evt_fd(), + file_with_iouring.io_engine.event_fd().as_raw_fd() + ); + } + + #[test] + fn test_io_write_submit() { + // Test with Aio. 
+ let file = TempFile::new().unwrap(); + let aio = Aio::new(file.as_file().as_raw_fd(), 128).unwrap(); + let mut file_with_aio = LocalFile::new(file.into_file(), false, aio).unwrap(); + let buf = vec![0xffu8; 0xff]; + file_with_aio + .io_write_submit( + 8, + &mut vec![IoDataDesc { + data_addr: buf.as_ptr() as u64, + data_len: 0x8_usize, + }], + 0x12, + ) + .unwrap(); + let res = file_with_aio.io_complete().unwrap(); + + for element in res { + assert_eq!(element.0, 0x12); + assert_eq!(element.1, VIRTIO_BLK_S_OK); + } + + // Test with IoUring. + let file = TempFile::new().unwrap(); + let iouring = IoUring::new(file.as_file().as_raw_fd(), 128).unwrap(); + let mut helper = EpollHelper::new().unwrap(); + helper + .add_event(iouring.event_fd().as_raw_fd(), 0xfeed) + .unwrap(); + let mut file_with_iouring = LocalFile::new(file.into_file(), false, iouring).unwrap(); + let mut handler = TestHandler; + let buf = vec![0xffu8; 0xff]; + file_with_iouring + .io_write_submit( + 8, + &mut vec![IoDataDesc { + data_addr: buf.as_ptr() as u64, + data_len: 0x8_usize, + }], + 0x12, + ) + .unwrap(); + helper.run(&mut handler).unwrap(); + let res = file_with_iouring.io_complete().unwrap(); + + for element in res { + assert_eq!(element.0, 0x12); + assert_eq!(element.1, VIRTIO_BLK_S_OK); + } + } + + #[test] + fn test_io_read_submit() { + // Test with Aio. 
+ let file = TempFile::new().unwrap(); + file.as_file().seek(SeekFrom::Start(0x120)).unwrap(); + file.as_file().write_all(&[0xeeu8; 0x20]).unwrap(); + let aio = Aio::new(file.as_file().as_raw_fd(), 128).unwrap(); + let mut file_with_aio = LocalFile::new(file.into_file(), false, aio).unwrap(); + let rbuf = vec![0u8; 0x100]; + let ret = file_with_aio.io_read_submit( + -0x20, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 0x20, + }], + 0x456, + ); + assert_eq!(ret.unwrap_err().kind(), io::ErrorKind::InvalidInput); + + file_with_aio + .io_read_submit( + 0x120, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 0x20, + }], + 0x456, + ) + .unwrap(); + let com_res = file_with_aio.io_complete().unwrap(); + for element in com_res { + assert_eq!(element.0, 0x456); + assert_eq!(element.1, VIRTIO_BLK_S_OK); + } + assert_eq!(&rbuf[..0x20], &vec![0xee; 0x20]); + + // Test with IoUring. + let file = TempFile::new().unwrap(); + file.as_file().seek(SeekFrom::Start(0x120)).unwrap(); + file.as_file().write_all(&[0xeeu8; 0x20]).unwrap(); + let iouring = IoUring::new(file.as_file().as_raw_fd(), 128).unwrap(); + let mut helper = EpollHelper::new().unwrap(); + helper + .add_event(iouring.event_fd().as_raw_fd(), 0xfeed) + .unwrap(); + let mut file_with_iouring = LocalFile::new(file.into_file(), false, iouring).unwrap(); + let mut handler = TestHandler; + let rbuf = vec![0u8; 0x100]; + + file_with_iouring + .io_read_submit( + 0x120, + &mut vec![IoDataDesc { + data_addr: rbuf.as_ptr() as u64, + data_len: 0x20, + }], + 0x456, + ) + .unwrap(); + helper.run(&mut handler).unwrap(); + let com_res = file_with_iouring.io_complete().unwrap(); + for element in com_res { + assert_eq!(element.0, 0x456); + assert_eq!(element.1, VIRTIO_BLK_S_OK); + } + assert_eq!(&rbuf[..0x20], &vec![0xee; 0x20]); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/block/ufile/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/mod.rs new file mode 
100644 index 000000000..a9f9cf9fb --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/block/ufile/mod.rs @@ -0,0 +1,82 @@ +// Copyright 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +mod localfile; +pub use self::localfile::LocalFile; + +pub mod aio; +pub mod io_uring; + +use std::io::{self, Read, Seek, Write}; +use std::os::unix::io::RawFd; + +use vmm_sys_util::eventfd::EventFd; + +use super::request::IoDataDesc; + +/// Traits for the virtio-blk driver to access backend storage devices, such as localfile. +pub trait Ufile: Read + Write + Seek + Send { + /// Get disk capacity in bytes. + fn get_capacity(&self) -> u64; + + /// Get max size in a segment. + fn get_max_size(&self) -> u32; + + /// Generate a unique device id for the virtio-blk device. + fn get_device_id(&self) -> io::Result; + + /// Get the raw event fd for data plane. + fn get_data_evt_fd(&self) -> RawFd; + + /// Submit asynchronous Read IO requests. + fn io_read_submit( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u16, + ) -> io::Result; + + /// Submit asynchronous Write IO requests. + fn io_write_submit( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u16, + ) -> io::Result; + + /// Poll for completed asynchronous IO requests. + /// + /// For currently supported LocalFile backend, it must not return temporary errors + /// and may only return permanent errors. So the virtio-blk driver layer will not try to + /// recover and only pass errors up onto the device manager. When changing the error handling + /// policy, please do help to update BlockEpollHandler::io_complete(). + fn io_complete(&mut self) -> io::Result>; +} + +/// Traits for the backend IO engine, such as aio or io-uring. +pub trait IoEngine { + /// Returns the EventFd that will notify when something is ready. + fn event_fd(&self) -> &EventFd; + + /// Submit asynchronous Read requests. 
+ fn readv( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u64, + ) -> io::Result; + + /// Submit asynchronous Write requests. + fn writev( + &mut self, + offset: i64, + iovecs: &mut Vec, + user_data: u64, + ) -> io::Result; + + /// Poll for completed asynchronous IO requests. + /// + /// Return the vector of (user data, result code). + /// NOTE: complete need to drain the io event fd. + fn complete(&mut self) -> io::Result>; +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/device.rs b/src/dragonball/src/dbs_virtio_devices/src/device.rs new file mode 100644 index 000000000..8ba641df4 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/device.rs @@ -0,0 +1,884 @@ +// Copyright 2019-2022 Alibaba Cloud. All rights reserved. +// +// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Vritio Device Model. +//! +//! The Virtio specification defines a group of Virtio devices and transport layers. +//! The Virtio device model defines traits and structs for Virtio transport layers to +//! manage Virtio device backend drivers. 
+ +use std::any::Any; +use std::cmp; +use std::io::Write; +use std::ops::Deref; +use std::sync::Arc; + +use dbs_device::resources::{DeviceResources, ResourceConstraint}; +use dbs_interrupt::{InterruptNotifier, NoopNotifier}; +use dbs_utils::epoll_manager::{EpollManager, EpollSubscriber, SubscriberId}; +use kvm_ioctls::VmFd; +use log::{error, warn}; +use virtio_queue::{DescriptorChain, QueueOwnedT, QueueSync, QueueT}; +use vm_memory::{ + Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryRegion, GuestRegionMmap, + GuestUsize, +}; +use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK}; + +use crate::{ActivateError, ActivateResult, ConfigError, ConfigResult, Error, Result}; + +/// Virtio queue configuration information. +/// +/// The `VirtioQueueConfig` maintains configuration information for a Virtio queue. +/// It also provides methods to access the queue and associated interrupt/event notifiers. +pub struct VirtioQueueConfig { + /// Virtio queue object to access the associated queue. + pub queue: Q, + /// EventFd to receive queue notification from guest. + pub eventfd: Arc, + /// Notifier to inject interrupt to guest. + notifier: Arc, + /// Queue index into the queue array. + index: u16, +} + +impl VirtioQueueConfig { + /// Create a `VirtioQueueConfig` object. + pub fn new( + queue: Q, + eventfd: Arc, + notifier: Arc, + index: u16, + ) -> Self { + VirtioQueueConfig { + queue, + eventfd, + notifier, + index, + } + } + + /// Create a `VirtioQueueConfig` object with the specified queue size and index. + pub fn create(queue_size: u16, index: u16) -> Result { + let eventfd = EventFd::new(EFD_NONBLOCK).map_err(Error::IOError)?; + + let queue = Q::new(queue_size)?; + Ok(VirtioQueueConfig { + queue, + eventfd: Arc::new(eventfd), + notifier: Arc::new(NoopNotifier::new()), + index, + }) + } + + /// Get queue index. + #[inline] + pub fn index(&self) -> u16 { + self.index + } + + /// Get immutable reference to the associated Virtio queue. 
+ pub fn queue(&self) -> &Q { + &self.queue + } + + /// Get mutable reference to the associated Virtio queue. + pub fn queue_mut(&mut self) -> &mut Q { + &mut self.queue + } + + /// Get the maximum queue size. + #[inline] + pub fn max_size(&self) -> u16 { + self.queue.max_size() + } + + /// Get the next available descriptor. + pub fn get_next_descriptor(&mut self, mem: M) -> Result>> + where + M: Deref + Clone, + M::Target: GuestMemory + Sized, + { + let mut guard = self.queue.lock(); + let mut iter = guard.iter(mem)?; + Ok(iter.next()) + } + + /// Put a used descriptor into the used ring. + #[inline] + pub fn add_used(&mut self, mem: &M, desc_index: u16, len: u32) { + self.queue + .add_used(mem, desc_index, len) + .unwrap_or_else(|_| panic!("Failed to add used. index: {}", desc_index)) + } + + /// Consume a queue notification event. + #[inline] + pub fn consume_event(&self) -> Result { + self.eventfd.read().map_err(Error::IOError) + } + + /// Produce a queue notification event. + #[inline] + pub fn generate_event(&self) -> Result<()> { + self.eventfd.write(1).map_err(Error::IOError) + } + + /// Inject an interrupt to the guest for queue change events. + #[inline] + pub fn notify(&self) -> Result<()> { + self.notifier.notify().map_err(Error::IOError) + } + + /// Set interrupt notifier to inject interrupts to the guest. + #[inline] + pub fn set_interrupt_notifier(&mut self, notifier: Arc) { + self.notifier = notifier; + } + + /// Return the actual size of the queue, as the driver may not set up a + /// queue as big as the device allows. + #[inline] + pub fn actual_size(&self) -> u16 { + // TODO: rework once https://github.com/rust-vmm/vm-virtio/pull/153 get merged. 
+ //self.queue.size() + std::cmp::min(self.queue.size(), self.queue.max_size()) + } +} + +impl Clone for VirtioQueueConfig { + fn clone(&self) -> Self { + VirtioQueueConfig { + queue: self.queue.clone(), + eventfd: self.eventfd.clone(), + notifier: self.notifier.clone(), + index: self.index, + } + } +} + +/// Virtio device configuration information. +/// +/// This structure maintains all configuration information for a Virtio device. It will be passed +/// to VirtioDevice::activate() and the Virtio device will take ownership of the configuration +/// object. On VirtioDevice::reset(), the configuration object should be returned to the caller. +pub struct VirtioDeviceConfig< + AS: GuestAddressSpace, + Q: QueueT = QueueSync, + R: GuestMemoryRegion = GuestRegionMmap, +> { + /// `GustMemoryAddress` object to access the guest memory. + pub vm_as: AS, + /// `VmFd` object for the device to access the hypervisor, such as KVM/HyperV etc. + pub vm_fd: Arc, + /// Resources assigned to the Virtio device. + pub resources: DeviceResources, + /// Virtio queues for normal data stream. + pub queues: Vec>, + /// Virtio queue for device control requests. + pub ctrl_queue: Option>, + /// Interrupt notifier to inject Virtio device change interrupt to the guest. + pub device_change_notifier: Arc, + /// Shared memory region for Virtio-fs etc. + pub shm_regions: Option>, +} + +impl VirtioDeviceConfig +where + AS: GuestAddressSpace, + Q: QueueT, + R: GuestMemoryRegion, +{ + /// Creates a new `VirtioDeviceConfig` object. + pub fn new( + vm_as: AS, + vm_fd: Arc, + resources: DeviceResources, + queues: Vec>, + ctrl_queue: Option>, + device_change_notifier: Arc, + ) -> Self { + VirtioDeviceConfig { + vm_as, + vm_fd, + resources, + queues, + ctrl_queue, + device_change_notifier, + shm_regions: None, + } + } + + /// Inject a Virtio device change notification to the guest. 
+ pub fn notify_device_changes(&self) -> Result<()> { + self.device_change_notifier.notify().map_err(Error::IOError) + } + + /// Get interrupt eventfds for normal Vritio queues. + pub fn get_queue_interrupt_eventfds(&self) -> Vec<&EventFd> { + self.queues + .iter() + .map(|x| x.notifier.notifier().unwrap()) + .collect() + } + + /// Set shared memory region for Virtio-fs. + pub fn set_shm_regions(&mut self, shm_regions: VirtioSharedMemoryList) { + self.shm_regions = Some(shm_regions); + } + + /// Get host address and guest address of the shared memory region. + pub fn get_shm_region_addr(&self) -> Option<(u64, u64)> { + self.shm_regions + .as_ref() + .map(|shms| (shms.host_addr, shms.guest_addr.raw_value())) + } + + /// Gets a shared reference to the guest memory object. + pub fn lock_guest_memory(&self) -> AS::T { + self.vm_as.memory() + } +} + +/// Device memory shared between guest and the device backend driver, defined by the Virtio +/// specification for Virtio-fs devices. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct VirtioSharedMemory { + /// offset from the bar base + pub offset: u64, + /// len of this shared memory region + pub len: u64, +} + +/// A list of Shared Memory regions +#[derive(Debug)] +pub struct VirtioSharedMemoryList { + /// Host address + pub host_addr: u64, + /// Guest address + pub guest_addr: GuestAddress, + /// Length + pub len: GuestUsize, + /// kvm_userspace_memory_region flags + pub kvm_userspace_memory_region_flags: u32, + /// kvm_userspace_memory_region slot + pub kvm_userspace_memory_region_slot: u32, + /// List of shared regions. + pub region_list: Vec, + + /// List of mmap()ed regions managed through GuestRegionMmap instances. Using + /// GuestRegionMmap will perform the unmapping automatically when the instance + /// is dropped, which happens when the VirtioDevice gets dropped. + /// + /// GuestRegionMmap is used instead of MmapRegion. 
Because We need to insert + /// this region into vm_as,but vm_as uses GuestRegionMmap to manage regions. + /// If MmapRegion is used in here, the MmapRegion needs to be clone() to create + /// new GuestRegionMmap for vm_as. MmapRegion clone() will cause the problem of + /// duplicate unmap during automatic drop, so we should try to avoid the clone + /// of MmapRegion. This problem does not exist with GuestRegionMmap because + /// vm_as and VirtioSharedMemoryList can share GuestRegionMmap through Arc. + pub mmap_region: Arc, +} + +impl Clone for VirtioSharedMemoryList { + fn clone(&self) -> Self { + Self { + host_addr: self.host_addr, + guest_addr: self.guest_addr, + len: self.len, + kvm_userspace_memory_region_slot: self.kvm_userspace_memory_region_slot, + kvm_userspace_memory_region_flags: self.kvm_userspace_memory_region_flags, + region_list: self.region_list.clone(), + mmap_region: self.mmap_region.clone(), + } + } +} + +/// A callback for the VMM to insert memory region for virtio devices that +/// has device memory, such as DAX of virtiofs, pmem. +/// +/// insert_region function is used to solve the problem that the virtio device cannot +/// find the host address corresponding to the guest address when reading the +/// guest device memory. +/// +/// For example, the guest application executes the following code: +/// { +/// // "dax_fd" is virtio-fs file that support dax +/// // "no_dax_fd" is virtio-fs file that do not support dax +/// void *dax_ptr = (void*)mmap(NUMM, 4096, PORT, MAP_SHARED, dax_fd, 0); +/// write(no_dax_fd, dax_ptr, 4096); +/// } +/// dragonball will coredump. +/// +/// This is because the virtiofs device cannot resolve the dax_ptr address +/// when calling vm_as.get_slice(). There is no DAX region in vm_as. This +/// trait inserts the virtio device memory region, such as DAX region, into +/// vm_as. 
This trait should be implemented in VMM when creating virtio +/// devices with device memory, because the virtio device does not have +/// permission to change vm_as. +pub trait VirtioRegionHandler: Send { + /// Insert GuestRegionMmap to vm_as & address_space. + fn insert_region(&mut self, region: Arc) -> Result<()>; +} + +/// Trait for Virtio transport layer to manage virtio devices. +/// +/// The virtio transport driver takes the responsibility to manage lifecycle of virtio devices. +/// The device manager registers virtio devices to the transport driver, which will then manage +/// the device by: +/// - query device's resource requirement and allocate resources for it. +/// - handle guest register access by forwarding requests to the device. +/// - call activate()/reset() when the device is activated/reset by the guest. +/// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the +/// device. Once the guest driver has configured the device, `VirtioDevice::activate` will be called +/// and all the events, memory, and queues for device operation will be moved into the device. +/// Optionally, a virtio device can implement device reset in which it returns said resources and +/// resets its internal. +pub trait VirtioDevice: Send { + /// The virtio device type. + fn device_type(&self) -> u32; + + /// The maximum size of each queue that this device supports. + fn queue_max_sizes(&self) -> &[u16]; + + /// The maxinum size of control queue + fn ctrl_queue_max_sizes(&self) -> u16 { + 0 + } + + /// The set of feature bits shifted by `page * 32`. + fn get_avail_features(&self, page: u32) -> u32 { + let _ = page; + 0 + } + + /// Acknowledges that this set of features should be enabled. + fn set_acked_features(&mut self, page: u32, value: u32); + + /// Reads this device configuration space at `offset`. 
+ fn read_config(&mut self, offset: u64, data: &mut [u8]) -> ConfigResult; + + /// Writes to this device configuration space at `offset`. + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult; + + /// Activates this device for real usage. + fn activate(&mut self, config: VirtioDeviceConfig) -> ActivateResult; + + /// Deactivates this device. + fn reset(&mut self) -> ActivateResult { + Err(ActivateError::InternalError) + } + + /// Removes this devices. + fn remove(&mut self) {} + + /// every new device object has its resource requirements + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, + ); + + /// Assigns requested resources back to virtio device + fn set_resource( + &mut self, + _vm_fd: Arc, + _resource: DeviceResources, + ) -> Result>> { + Ok(None) + } + + /// Used to downcast to the specific type. + fn as_any(&self) -> &dyn Any; + fn as_any_mut(&mut self) -> &mut dyn Any; +} + +/// A helper struct to support basic operations for emulated VirtioDevice backend devices. +pub struct VirtioDeviceInfo { + /// Name of the virtio backend device. + pub driver_name: String, + /// Available features of the virtio backend device. + pub avail_features: u64, + /// Acknowledged features of the virtio backend device. + pub acked_features: u64, + /// Array of queue sizes. + pub queue_sizes: Arc>, + /// Space to store device specific configuration data. + pub config_space: Vec, + /// EventManager SubscriberOps to register/unregister epoll events. + pub epoll_manager: EpollManager, +} + +/// A helper struct to support basic operations for emulated VirtioDevice backend devices. +impl VirtioDeviceInfo { + /// Creates a VirtioDeviceInfo instance. 
+ pub fn new( + driver_name: String, + avail_features: u64, + queue_sizes: Arc>, + config_space: Vec, + epoll_manager: EpollManager, + ) -> Self { + VirtioDeviceInfo { + driver_name, + avail_features, + acked_features: 0u64, + queue_sizes, + config_space, + epoll_manager, + } + } + + /// Gets available features of virtio backend device. + #[inline] + pub fn avail_features(&self) -> u64 { + self.avail_features + } + + /// Gets available features of virtio backend device. + pub fn get_avail_features(&self, page: u32) -> u32 { + match page { + // Get the lower 32-bits of the features bitfield. + 0 => self.avail_features as u32, + // Get the upper 32-bits of the features bitfield. + 1 => (self.avail_features >> 32) as u32, + _ => { + warn!("{}: query features page: {}", self.driver_name, page); + 0u32 + } + } + } + + /// Gets acknowledged features of virtio backend device. + #[inline] + pub fn acked_features(&self) -> u64 { + self.acked_features + } + + /// Sets acknowledged features of virtio backend device. + pub fn set_acked_features(&mut self, page: u32, value: u32) { + let mut v = match page { + 0 => value as u64, + 1 => (value as u64) << 32, + _ => { + warn!("{}: ack unknown feature page: {}", self.driver_name, page); + 0u64 + } + }; + + // Check if the guest is ACK'ing a feature that we didn't claim to have. + let unrequested_features = v & !self.avail_features; + if unrequested_features != 0 { + warn!("{}: ackknowlege unknown feature: {:x}", self.driver_name, v); + // Don't count these features as acked. + v &= !unrequested_features; + } + self.acked_features |= v; + } + + /// Reads device specific configuration data of virtio backend device. + /// + /// The `offset` is based of 0x100 from the MMIO configuration address space. 
+ pub fn read_config(&self, offset: u64, mut data: &mut [u8]) -> ConfigResult { + let config_len = self.config_space.len() as u64; + if offset >= config_len { + error!( + "{}: config space read request out of range, offset {}", + self.driver_name, offset + ); + return Err(ConfigError::InvalidOffset(offset)); + } + if let Some(end) = offset.checked_add(data.len() as u64) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&self.config_space[offset as usize..cmp::min(end, config_len) as usize]) + .unwrap(); + } + Ok(()) + } + + /// Writes device specific configuration data of virtio backend device. + /// + /// The `offset` is based of 0x100 from the MMIO configuration address space. + pub fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + let data_len = data.len() as u64; + let config_len = self.config_space.len() as u64; + if offset >= config_len { + error!( + "{}: config space write request out of range, offset {}", + self.driver_name, offset + ); + return Err(ConfigError::InvalidOffset(offset)); + } + if offset.checked_add(data_len).is_none() { + error!( + "{}: config space write request out of range, offset {}, data length {}", + self.driver_name, offset, data_len + ); + return Err(ConfigError::PlusOverflow(offset, data_len)); + } + if offset + data_len > config_len { + error!( + "{}: config space write request out of range, offset {}, data length {}", + self.driver_name, offset, data_len + ); + return Err(ConfigError::InvalidOffsetPlusDataLen(offset + data_len)); + } + + let dst = &mut self.config_space[offset as usize..(offset + data_len) as usize]; + dst.copy_from_slice(data); + Ok(()) + } + + /// Validate size of queues and queue eventfds. 
+ pub fn check_queue_sizes(&self, queues: &[VirtioQueueConfig]) -> ActivateResult { + if queues.is_empty() || queues.len() != self.queue_sizes.len() { + error!( + "{}: invalid configuration: maximum {} queue(s), got {} queues", + self.driver_name, + self.queue_sizes.len(), + queues.len(), + ); + return Err(ActivateError::InvalidParam); + } + Ok(()) + } + + /// Register event handler for the device. + pub fn register_event_handler(&self, handler: EpollSubscriber) -> SubscriberId { + self.epoll_manager.add_subscriber(handler) + } + + /// Unregister event handler for the device. + pub fn remove_event_handler(&mut self, id: SubscriberId) -> Result { + self.epoll_manager.remove_subscriber(id).map_err(|e| { + Error::IOError(std::io::Error::new( + std::io::ErrorKind::Other, + format!("remove_event_handler failed: {e:?}"), + )) + }) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use dbs_interrupt::{ + InterruptManager, InterruptSourceType, InterruptStatusRegister32, LegacyNotifier, + }; + use dbs_utils::epoll_manager::{EventOps, Events, MutEventSubscriber}; + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::{GuestMemoryAtomic, GuestMemoryMmap, GuestMemoryRegion, MmapRegion}; + + use super::*; + use crate::{VIRTIO_INTR_CONFIG, VIRTIO_INTR_VRING}; + + pub fn create_virtio_device_config() -> VirtioDeviceConfig> { + let (vmfd, irq_manager) = crate::tests::create_vm_and_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::LegacyIrq, 0, 1) + .unwrap(); + let status = Arc::new(InterruptStatusRegister32::new()); + let device_change_notifier = Arc::new(LegacyNotifier::new( + group.clone(), + status.clone(), + VIRTIO_INTR_CONFIG, + )); + + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap()); + + let mut queues = Vec::new(); + for idx in 0..8 { + queues.push(VirtioQueueConfig::new( + QueueSync::new(512).unwrap(), + Arc::new(EventFd::new(0).unwrap()), + Arc::new(LegacyNotifier::new( + group.clone(), + 
status.clone(), + VIRTIO_INTR_VRING, + )), + idx, + )); + } + + VirtioDeviceConfig::new( + mem, + vmfd, + DeviceResources::new(), + queues, + None, + device_change_notifier, + ) + } + + #[test] + fn test_create_virtio_queue_config() { + let (_vmfd, irq_manager) = crate::tests::create_vm_and_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::LegacyIrq, 0, 1) + .unwrap(); + let status = Arc::new(InterruptStatusRegister32::new()); + let notifier = Arc::new(LegacyNotifier::new(group, status, VIRTIO_INTR_VRING)); + + let mut cfg = VirtioQueueConfig::::create(1024, 1).unwrap(); + cfg.set_interrupt_notifier(notifier); + + let mem = + Arc::new(GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap()); + let desc = cfg.get_next_descriptor(mem.memory()).unwrap(); + assert!(matches!(desc, None)); + + cfg.notify().unwrap(); + assert_eq!(cfg.index(), 1); + assert_eq!(cfg.max_size(), 1024); + assert_eq!(cfg.actual_size(), 1024); + cfg.generate_event().unwrap(); + assert_eq!(cfg.consume_event().unwrap(), 1); + } + + #[test] + fn test_clone_virtio_queue_config() { + let (_vmfd, irq_manager) = crate::tests::create_vm_and_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::LegacyIrq, 0, 1) + .unwrap(); + let status = Arc::new(InterruptStatusRegister32::new()); + let notifier = Arc::new(LegacyNotifier::new(group, status, VIRTIO_INTR_VRING)); + + let mut cfg = VirtioQueueConfig::::create(1024, 1).unwrap(); + cfg.set_interrupt_notifier(notifier); + let mut cfg = cfg.clone(); + + let mem = + Arc::new(GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap()); + let desc = cfg.get_next_descriptor(mem.memory()).unwrap(); + assert!(matches!(desc, None)); + + { + let mut guard = cfg.queue_mut().lock(); + let mut iter = guard.iter(mem.memory()).unwrap(); + assert!(matches!(iter.next(), None)); + } + + cfg.notify().unwrap(); + assert_eq!(cfg.index(), 1); + assert_eq!(cfg.max_size(), 1024); + 
assert_eq!(cfg.actual_size(), 1024); + assert_eq!(cfg.queue.max_size(), 1024); + cfg.generate_event().unwrap(); + assert_eq!(cfg.consume_event().unwrap(), 1); + } + + #[test] + fn test_create_virtio_device_config() { + let mut device_config = create_virtio_device_config(); + + device_config.notify_device_changes().unwrap(); + assert_eq!(device_config.get_queue_interrupt_eventfds().len(), 8); + + let shared_mem = + GuestRegionMmap::new(MmapRegion::new(4096).unwrap(), GuestAddress(0)).unwrap(); + + let list = VirtioSharedMemoryList { + host_addr: 0x1234, + guest_addr: GuestAddress(0x5678), + len: shared_mem.len(), + kvm_userspace_memory_region_flags: 0, + kvm_userspace_memory_region_slot: 1, + region_list: vec![VirtioSharedMemory { + offset: 0, + len: 4096, + }], + mmap_region: Arc::new(shared_mem), + }; + + device_config.set_shm_regions(list); + let (host_addr, guest_addr) = device_config.get_shm_region_addr().unwrap(); + assert_eq!(host_addr, 0x1234); + assert_eq!(guest_addr, 0x5678); + let list = device_config.shm_regions.unwrap(); + assert_eq!(list.kvm_userspace_memory_region_slot, 1); + assert_eq!(list.kvm_userspace_memory_region_flags, 0); + assert_eq!(list.region_list.len(), 1); + } + + struct DummyDevice { + queue_size: Arc>, + device_info: VirtioDeviceInfo, + } + + impl VirtioDevice, QueueSync, GuestRegionMmap> for DummyDevice { + fn device_type(&self) -> u32 { + 0xffff + } + fn queue_max_sizes(&self) -> &[u16] { + &self.queue_size + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + fn set_acked_features(&mut self, page: u32, value: u32) { + self.device_info.set_acked_features(page, value) + } + + fn read_config(&mut self, offset: u64, data: &mut [u8]) -> ConfigResult { + self.device_info.read_config(offset, data) + } + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + self.device_info.write_config(offset, data) + } + fn activate( + &mut self, + _config: VirtioDeviceConfig>, + ) -> 
ActivateResult { + Ok(()) + } + fn get_resource_requirements( + &self, + _requests: &mut Vec, + _use_generic_irq: bool, + ) { + } + fn as_any(&self) -> &dyn Any { + self + } + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + } + + struct DummyHandler; + impl MutEventSubscriber for DummyHandler { + fn process(&mut self, _events: Events, _ops: &mut EventOps) {} + fn init(&mut self, _ops: &mut EventOps) {} + } + + #[test] + fn test_virtio_device() { + let epoll_mgr = EpollManager::default(); + + let avail_features = 0x1234 << 32 | 0x4567; + let config_space = vec![1; 16]; + let queue_size = Arc::new(vec![256; 1]); + let device_info = VirtioDeviceInfo::new( + String::from("dummy-device"), + avail_features, + queue_size.clone(), + config_space, + epoll_mgr, + ); + + let mut device = DummyDevice { + queue_size, + device_info, + }; + assert_eq!(device.device_type(), 0xffff); + assert_eq!(device.queue_max_sizes(), &[256]); + assert_eq!(device.ctrl_queue_max_sizes(), 0); + + device.get_resource_requirements(&mut Vec::new(), true); + + // tests avail features + assert_eq!(device.get_avail_features(0), 0x4567); + assert_eq!( + device.get_avail_features(1), + (device.device_info.avail_features() >> 32) as u32 + ); + assert_eq!(device.get_avail_features(2), 0); + + // tests acked features + assert_eq!(device.device_info.acked_features(), 0); + device.set_acked_features(2, 0x0004 | 0x0002); + assert_eq!(device.device_info.acked_features(), 0); + device.set_acked_features(1, 0x0004 | 0x0002); + assert_eq!(device.device_info.acked_features(), 0x0004 << 32); + device.set_acked_features(0, 0x4567 | 0x0008); + assert_eq!(device.device_info.acked_features(), 0x4567 | 0x0004 << 32); + + // test config space invalid read + let mut data = vec![0u8; 16]; + assert_eq!( + device.read_config(16, data.as_mut_slice()).unwrap_err(), + ConfigError::InvalidOffset(16) + ); + assert_eq!(data, vec![0; 16]); + // test read config + device.read_config(4, &mut data[..14]).unwrap(); + 
assert_eq!(data, vec![1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]); + device.read_config(0, data.as_mut_slice()).unwrap(); + assert_eq!(data, vec![1; 16]); + + // test config space invalid write + let write_data = vec![0xffu8; 16]; + let mut read_data = vec![0x0; 16]; + assert_eq!( + device.write_config(4, &write_data[..13]).unwrap_err(), + ConfigError::InvalidOffsetPlusDataLen(17) + ); + assert_eq!( + device.write_config(16, &write_data[..4]).unwrap_err(), + ConfigError::InvalidOffset(16) + ); + device.read_config(0, read_data.as_mut_slice()).unwrap(); + assert_eq!(read_data, vec![0x1; 16]); + + // test config space write + device.write_config(4, &write_data[6..10]).unwrap(); + assert_eq!( + device.device_info.config_space, + vec![1, 1, 1, 1, 0xff, 0xff, 0xff, 0xff, 1, 1, 1, 1, 1, 1, 1, 1] + ); + + // test device info check_queue_sizes + let queue_size = Vec::new(); + assert!(matches!( + device + .device_info + .check_queue_sizes::(&queue_size), + Err(ActivateError::InvalidParam) + )); + + assert!(matches!(device.reset(), Err(ActivateError::InternalError))); + + // test event handler + let handler = DummyHandler; + let id = device.device_info.register_event_handler(Box::new(handler)); + device.device_info.remove_event_handler(id).unwrap(); + assert!(matches!( + device.device_info.remove_event_handler(id), + Err(Error::IOError(_)) + )); + + // test device activate + let region_size = 0x400; + let regions = vec![ + (GuestAddress(0x0), region_size), + (GuestAddress(0x1000), region_size), + ]; + let gmm = GuestMemoryMmap::from_ranges(®ions).unwrap(); + let gm = GuestMemoryAtomic::::new(gmm); + + let queues = vec![ + VirtioQueueConfig::create(2, 0).unwrap(), + VirtioQueueConfig::create(2, 0).unwrap(), + ]; + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let device_config = VirtioDeviceConfig::new( + gm, + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + 
device.activate(device_config).unwrap(); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/epoll_helper.rs b/src/dragonball/src/dbs_virtio_devices/src/epoll_helper.rs new file mode 100644 index 000000000..42732d31c --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/epoll_helper.rs @@ -0,0 +1,157 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2020 Intel Corporation +// +// Copyright © 2021 Ant Group Corporation + +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::fs::File; +use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; + +use log::error; + +pub struct EpollHelper { + epoll_file: File, +} + +#[derive(Debug)] +pub enum EpollHelperError { + CreateFd(std::io::Error), + Ctl(std::io::Error), + IoError(std::io::Error), + Wait(std::io::Error), +} + +pub trait EpollHelperHandler { + // Return true if execution of the loop should be stopped + fn handle_event(&mut self, helper: &mut EpollHelper, event: &epoll::Event) -> bool; +} + +impl EpollHelper { + pub fn new() -> std::result::Result { + // Create the epoll file descriptor + let epoll_fd = epoll::create(true).map_err(EpollHelperError::CreateFd)?; + // Use 'File' to enforce closing on 'epoll_fd' + let epoll_file = unsafe { File::from_raw_fd(epoll_fd) }; + + Ok(Self { epoll_file }) + } + + pub fn add_event(&mut self, fd: RawFd, id: u32) -> std::result::Result<(), EpollHelperError> { + self.add_event_custom(fd, id, epoll::Events::EPOLLIN) + } + + pub fn add_event_custom( + &mut self, + fd: RawFd, + id: u32, + evts: epoll::Events, + ) -> std::result::Result<(), EpollHelperError> { + epoll::ctl( + self.epoll_file.as_raw_fd(), + epoll::ControlOptions::EPOLL_CTL_ADD, + fd, + epoll::Event::new(evts, id.into()), + ) + 
.map_err(EpollHelperError::Ctl) + } + + pub fn del_event_custom( + &mut self, + fd: RawFd, + id: u32, + evts: epoll::Events, + ) -> std::result::Result<(), EpollHelperError> { + epoll::ctl( + self.epoll_file.as_raw_fd(), + epoll::ControlOptions::EPOLL_CTL_DEL, + fd, + epoll::Event::new(evts, id.into()), + ) + .map_err(EpollHelperError::Ctl) + } + + pub fn run( + &mut self, + handler: &mut dyn EpollHelperHandler, + ) -> std::result::Result<(), EpollHelperError> { + const EPOLL_EVENTS_LEN: usize = 100; + let mut events = vec![epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; + + loop { + let num_events = match epoll::wait(self.epoll_file.as_raw_fd(), -1, &mut events[..]) { + Ok(res) => res, + Err(e) => { + if e.kind() == std::io::ErrorKind::Interrupted { + // It's well defined from the epoll_wait() syscall + // documentation that the epoll loop can be interrupted + // before any of the requested events occurred or the + // timeout expired. In both those cases, epoll_wait() + // returns an error of type EINTR, but this should not + // be considered as a regular error. Instead it is more + // appropriate to retry, by calling into epoll_wait(). 
+ continue; + } + error!("io thread epoll wait failed: {:?}", e); + return Err(EpollHelperError::Wait(e)); + } + }; + + for event in events.iter().take(num_events) { + if handler.handle_event(self, event) { + return Ok(()); + } + } + } + } +} + +impl AsRawFd for EpollHelper { + fn as_raw_fd(&self) -> RawFd { + self.epoll_file.as_raw_fd() + } +} + +#[cfg(test)] +mod tests { + use std::os::unix::io::AsRawFd; + use vmm_sys_util::eventfd::EventFd; + + use super::EpollHelper; + + #[test] + fn test_new_epoller() { + let helper = EpollHelper::new(); + assert!(helper.is_ok()); + } + + #[test] + fn test_add_event() { + let helper = EpollHelper::new(); + assert!(helper.is_ok()); + + let eventfd = EventFd::new(0).unwrap(); + + let res = helper.unwrap().add_event(eventfd.as_raw_fd(), 0); + assert!(res.is_ok()) + } + + #[test] + fn test_delete_event() { + let helper = EpollHelper::new(); + assert!(helper.is_ok()); + + let eventfd = EventFd::new(0).unwrap(); + let mut helper = helper.unwrap(); + let res = helper.add_event(eventfd.as_raw_fd(), 0); + assert!(res.is_ok()); + + let res = helper.del_event_custom(eventfd.as_raw_fd(), 0, epoll::Events::EPOLLIN); + assert!(res.is_ok()); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/fs/device.rs b/src/dragonball/src/dbs_virtio_devices/src/fs/device.rs new file mode 100644 index 000000000..2f9c2c283 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/fs/device.rs @@ -0,0 +1,1797 @@ +// Copyright 2020 Alibaba Cloud. All rights reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::collections::HashMap; +use std::ffi::CString; +use std::fs::File; +use std::io::{BufRead, BufReader, Read}; +use std::marker::PhantomData; +use std::ops::Deref; +use std::os::unix::io::FromRawFd; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use std::sync::{mpsc, Arc}; +use std::time::Duration; + +use caps::{CapSet, Capability}; +use dbs_device::resources::{DeviceResources, ResourceConstraint}; +use dbs_utils::epoll_manager::{EpollManager, SubscriberId}; +use dbs_utils::rate_limiter::{BucketUpdate, RateLimiter}; +use fuse_backend_rs::api::{Vfs, VfsIndex, VfsOptions}; +use fuse_backend_rs::passthrough::{CachePolicy, Config as PassthroughConfig, PassthroughFs}; +use kvm_bindings::kvm_userspace_memory_region; +use kvm_ioctls::VmFd; +use log::{debug, error, info, trace, warn}; +use nix::sys::memfd; +use nydus_api::ConfigV2; +use nydus_rafs::blobfs::{BlobFs, Config as BlobfsConfig}; +use nydus_rafs::{fs::Rafs, RafsIoRead}; +use rlimit::Resource; +use virtio_bindings::bindings::virtio_blk::VIRTIO_F_VERSION_1; +use virtio_queue::QueueT; +use vm_memory::{ + FileOffset, GuestAddress, GuestAddressSpace, GuestRegionMmap, GuestUsize, MmapRegion, +}; +use vmm_sys_util::eventfd::EventFd; + +use crate::{ + ActivateError, ActivateResult, ConfigResult, Error, Result, VirtioDevice, VirtioDeviceConfig, + VirtioDeviceInfo, VirtioRegionHandler, VirtioSharedMemory, VirtioSharedMemoryList, + TYPE_VIRTIO_FS, +}; + +use super::{ + CacheHandler, Error as FsError, Result as FsResult, VirtioFsEpollHandler, VIRTIO_FS_NAME, +}; + +const CONFIG_SPACE_TAG_SIZE: usize = 36; +const CONFIG_SPACE_NUM_QUEUES_SIZE: usize = 4; +const CONFIG_SPACE_SIZE: usize = CONFIG_SPACE_TAG_SIZE + CONFIG_SPACE_NUM_QUEUES_SIZE; +const NUM_QUEUE_OFFSET: usize = 1; + +// Attr and entry timeout values +const CACHE_ALWAYS_TIMEOUT: u64 = 86_400; // 1 day +const CACHE_AUTO_TIMEOUT: u64 = 1; +const CACHE_NONE_TIMEOUT: u64 
= 0; + +// VirtioFs backend fs type +pub(crate) const PASSTHROUGHFS: &str = "passthroughfs"; +pub(crate) const BLOBFS: &str = "blobfs"; +pub(crate) const RAFS: &str = "rafs"; + +/// Info of backend filesystems of VirtioFs +#[allow(dead_code)] +pub struct BackendFsInfo { + pub(crate) index: VfsIndex, + pub(crate) fstype: String, + // (source, config), only suitable for Rafs + pub(crate) src_cfg: Option<(String, String)>, +} + +/// Virtio device for virtiofs +pub struct VirtioFs { + pub(crate) device_info: VirtioDeviceInfo, + pub(crate) cache_size: u64, + pub(crate) queue_sizes: Arc>, + pub(crate) thread_pool_size: u16, + pub(crate) cache_policy: CachePolicy, + pub(crate) writeback_cache: bool, + pub(crate) no_open: bool, + pub(crate) killpriv_v2: bool, + pub(crate) no_readdir: bool, + pub(crate) xattr: bool, + pub(crate) handler: Box, + pub(crate) fs: Arc, + pub(crate) backend_fs: HashMap, + pub(crate) subscriber_id: Option, + pub(crate) id: String, + pub(crate) rate_limiter: Option, + pub(crate) patch_rate_limiter_fd: EventFd, + pub(crate) sender: Option>, + phantom: PhantomData, +} + +impl VirtioFs +where + AS: GuestAddressSpace + 'static, +{ + pub fn set_patch_rate_limiters(&self, bytes: BucketUpdate, ops: BucketUpdate) -> Result<()> { + match &self.sender { + Some(sender) => { + sender.send((bytes, ops)).map_err(|e| { + error!( + "{}: failed to send rate-limiter patch data {:?}", + VIRTIO_FS_NAME, e + ); + Error::InternalError + })?; + self.patch_rate_limiter_fd.write(1).map_err(|e| { + error!( + "{}: failed to write rate-limiter patch event {:?}", + VIRTIO_FS_NAME, e + ); + Error::InternalError + })?; + Ok(()) + } + None => { + error!( + "{}: failed to establish channel to send rate-limiter patch data", + VIRTIO_FS_NAME + ); + Err(Error::InternalError) + } + } + } +} + +#[allow(clippy::too_many_arguments)] +impl VirtioFs { + /// Create a new virtiofs device. 
+ pub fn new( + tag: &str, + req_num_queues: usize, + queue_size: u16, + cache_size: u64, + cache_policy: &str, + thread_pool_size: u16, + writeback_cache: bool, + no_open: bool, + killpriv_v2: bool, + xattr: bool, + drop_sys_resource: bool, + no_readdir: bool, + handler: Box, + epoll_mgr: EpollManager, + rate_limiter: Option, + ) -> Result { + info!( + "{}: tag {} req_num_queues {} queue_size {} cache_size {} cache_policy {} thread_pool_size {} writeback_cache {} no_open {} killpriv_v2 {} xattr {} drop_sys_resource {} no_readdir {}", + VIRTIO_FS_NAME, tag, req_num_queues, queue_size, cache_size, cache_policy, thread_pool_size, writeback_cache, no_open, killpriv_v2, xattr, drop_sys_resource, no_readdir + ); + + let num_queues = NUM_QUEUE_OFFSET + req_num_queues; + + // Create virtio device config space. + // First by adding the tag. + let mut config_space = tag.to_string().into_bytes(); + config_space.resize(CONFIG_SPACE_SIZE, 0); + + // And then by copying the number of queues. + let mut num_queues_slice: [u8; 4] = (req_num_queues as u32).to_be_bytes(); + num_queues_slice.reverse(); + config_space[CONFIG_SPACE_TAG_SIZE..CONFIG_SPACE_SIZE].copy_from_slice(&num_queues_slice); + + let cache = match CachePolicy::from_str(cache_policy) { + Ok(c) => c, + Err(e) => { + error!( + "{}: Parse cache_policy \"{}\" failed: {:?}", + VIRTIO_FS_NAME, cache_policy, e + ); + return Err(Error::InvalidInput); + } + }; + + // Set rlimit first, in case we dropped CAP_SYS_RESOURCE later and hit EPERM. + if let Err(e) = set_default_rlimit_nofile() { + warn!("{}: failed to set rlimit: {:?}", VIRTIO_FS_NAME, e); + } + + if drop_sys_resource && writeback_cache { + error!( + "{}: writeback_cache is not compatible with drop_sys_resource", + VIRTIO_FS_NAME + ); + return Err(Error::InvalidInput); + } + + // Drop CAP_SYS_RESOURCE when creating VirtioFs device, not in activate(), as it's vcpu + // thread that calls activate(), but we do I/O in vmm epoll thread, so drop cap here. 
+ if drop_sys_resource { + info!( + "{}: Dropping CAP_SYS_RESOURCE, tid {:?}", + VIRTIO_FS_NAME, + nix::unistd::gettid() + ); + if let Err(e) = caps::drop(None, CapSet::Effective, Capability::CAP_SYS_RESOURCE) { + warn!( + "{}: failed to drop CAP_SYS_RESOURCE: {:?}", + VIRTIO_FS_NAME, e + ); + } + } + + let vfs_opts = VfsOptions { + no_writeback: !writeback_cache, + no_open, + killpriv_v2, + no_readdir, + ..VfsOptions::default() + }; + + Ok(VirtioFs { + device_info: VirtioDeviceInfo::new( + VIRTIO_FS_NAME.to_string(), + 1u64 << VIRTIO_F_VERSION_1, + Arc::new(vec![queue_size; num_queues]), + config_space, + epoll_mgr, + ), + cache_size, + queue_sizes: Arc::new(vec![queue_size; num_queues]), + thread_pool_size, + cache_policy: cache, + writeback_cache, + no_open, + no_readdir, + killpriv_v2, + xattr, + handler, + fs: Arc::new(Vfs::new(vfs_opts)), + backend_fs: HashMap::new(), + subscriber_id: None, + id: tag.to_string(), + rate_limiter, + patch_rate_limiter_fd: EventFd::new(0).unwrap(), + sender: None, + phantom: PhantomData, + }) + } + + fn is_dax_on(&self) -> bool { + self.cache_size > 0 + } + + fn get_timeout(&self) -> Duration { + match self.cache_policy { + CachePolicy::Always => Duration::from_secs(CACHE_ALWAYS_TIMEOUT), + CachePolicy::Never => Duration::from_secs(CACHE_NONE_TIMEOUT), + CachePolicy::Auto => Duration::from_secs(CACHE_AUTO_TIMEOUT), + } + } + + fn parse_blobfs_cfg( + &self, + source: &str, + config: Option, + dax_threshold_size_kb: Option, + ) -> FsResult<(String, String, Option)> { + let (blob_cache_dir, blob_ondemand_cfg) = match config.as_ref() { + Some(cfg) => { + let conf = ConfigV2::from_str(cfg).map_err(|e| { + error!("failed to load rafs config {} error: {:?}", &cfg, e); + FsError::InvalidData + })?; + + // v6 doesn't support digest validation yet. 
+ if conf.rafs.ok_or(FsError::InvalidData)?.validate { + error!("config.digest_validate needs to be false"); + return Err(FsError::InvalidData); + } + + let work_dir = conf + .cache + .ok_or(FsError::InvalidData)? + .file_cache + .ok_or(FsError::InvalidData)? + .work_dir; + + let blob_ondemand_cfg = format!( + r#" + {{ + "rafs_conf": {}, + "bootstrap_path": "{}", + "blob_cache_dir": "{}" + }}"#, + cfg, source, &work_dir + ); + + (work_dir, blob_ondemand_cfg) + } + None => return Err(FsError::BackendFs("no rafs config file".to_string())), + }; + + let dax_file_size = match dax_threshold_size_kb { + Some(size) => Some(kb_to_bytes(size)?), + None => None, + }; + + Ok((blob_cache_dir, blob_ondemand_cfg, dax_file_size)) + } + + pub fn manipulate_backend_fs( + &mut self, + source: Option, + fstype: Option, + mountpoint: &str, + config: Option, + ops: &str, + prefetch_list_path: Option, + dax_threshold_size_kb: Option, + ) -> FsResult<()> { + debug!( + "source {:?}, fstype {:?}, mountpoint {:?}, config {:?}, ops {:?}, prefetch_list_path {:?}, dax_threshold_size_kb 0x{:x?}", + source, fstype, mountpoint, config, ops, prefetch_list_path, dax_threshold_size_kb + ); + match ops { + "mount" => { + if source.is_none() { + error!("{}: source is required for mount.", VIRTIO_FS_NAME); + return Err(FsError::InvalidData); + } + // safe because is not None + let source = source.unwrap(); + match fstype.as_deref() { + Some("Blobfs") | Some(BLOBFS) => { + self.mount_blobfs(source, mountpoint, config, dax_threshold_size_kb) + } + Some("PassthroughFs") | Some(PASSTHROUGHFS) => { + self.mount_passthroughfs(source, mountpoint, dax_threshold_size_kb) + } + Some("Rafs") | Some(RAFS) => { + self.mount_rafs(source, mountpoint, config, prefetch_list_path) + } + _ => { + error!("http_server: type is not invalid."); + Err(FsError::InvalidData) + } + } + } + "umount" => { + self.fs.umount(mountpoint).map_err(|e| { + error!("umount {:?}", e); + FsError::InvalidData + })?; + 
self.backend_fs.remove(mountpoint); + Ok(()) + } + "update" => { + info!("switch backend"); + self.update_rafs(source, mountpoint, config) + } + _ => { + error!("invalid ops, mount failed."); + Err(FsError::InvalidData) + } + } + } + + fn mount_blobfs( + &mut self, + source: String, + mountpoint: &str, + config: Option, + dax_threshold_size_kb: Option, + ) -> FsResult<()> { + debug!("http_server blobfs"); + let timeout = self.get_timeout(); + let (blob_cache_dir, blob_ondemand_cfg, dax_file_size) = + self.parse_blobfs_cfg(&source, config, dax_threshold_size_kb)?; + + let fs_cfg = BlobfsConfig { + ps_config: PassthroughConfig { + root_dir: blob_cache_dir, + do_import: true, + writeback: self.writeback_cache, + no_open: self.no_open, + xattr: self.xattr, + cache_policy: self.cache_policy.clone(), + entry_timeout: timeout, + attr_timeout: timeout, + dax_file_size, + ..Default::default() + }, + blob_ondemand_cfg, + }; + let blob_fs = BlobFs::new(fs_cfg).map_err(FsError::IOError)?; + blob_fs.import().map_err(FsError::IOError)?; + debug!("blobfs mounted"); + + let fs = Box::new(blob_fs); + match self.fs.mount(fs, mountpoint) { + Ok(idx) => { + self.backend_fs.insert( + mountpoint.to_string(), + BackendFsInfo { + index: idx, + fstype: BLOBFS.to_string(), + src_cfg: None, + }, + ); + Ok(()) + } + Err(e) => { + error!("blobfs mount {:?}", e); + Err(FsError::InvalidData) + } + } + } + + fn mount_passthroughfs( + &mut self, + source: String, + mountpoint: &str, + dax_threshold_size_kb: Option, + ) -> FsResult<()> { + debug!("http_server passthrough"); + let timeout = self.get_timeout(); + + let dax_threshold_size = match dax_threshold_size_kb { + Some(size) => Some(kb_to_bytes(size)?), + None => None, + }; + + let fs_cfg = PassthroughConfig { + root_dir: source, + do_import: false, + writeback: self.writeback_cache, + no_open: self.no_open, + no_readdir: self.no_readdir, + killpriv_v2: self.killpriv_v2, + xattr: self.xattr, + cache_policy: self.cache_policy.clone(), + 
entry_timeout: timeout, + attr_timeout: timeout, + dax_file_size: dax_threshold_size, + ..Default::default() + }; + + let passthrough_fs = PassthroughFs::<()>::new(fs_cfg).map_err(FsError::IOError)?; + passthrough_fs.import().map_err(FsError::IOError)?; + debug!("passthroughfs mounted"); + + let fs = Box::new(passthrough_fs); + match self.fs.mount(fs, mountpoint) { + Ok(idx) => { + self.backend_fs.insert( + mountpoint.to_string(), + BackendFsInfo { + index: idx, + fstype: PASSTHROUGHFS.to_string(), + src_cfg: None, + }, + ); + Ok(()) + } + Err(e) => { + error!("passthroughfs mount {:?}", e); + Err(FsError::InvalidData) + } + } + } + + fn mount_rafs( + &mut self, + source: String, + mountpoint: &str, + config: Option, + prefetch_list_path: Option, + ) -> FsResult<()> { + debug!("http_server rafs"); + let file = Path::new(&source); + let (mut rafs, rafs_cfg) = match config.as_ref() { + Some(cfg) => { + let rafs_conf: Arc = Arc::new( + serde_json::from_str(cfg).map_err(|e| FsError::BackendFs(e.to_string()))?, + ); + + ( + Rafs::new(&rafs_conf, mountpoint, file) + .map_err(|e| FsError::BackendFs(format!("Rafs::new() failed: {e:?}")))?, + cfg.clone(), + ) + } + None => return Err(FsError::BackendFs("no rafs config file".to_string())), + }; + let prefetch_files = parse_prefetch_files(prefetch_list_path.clone()); + debug!( + "{}: Import rafs with prefetch_files {:?}", + VIRTIO_FS_NAME, prefetch_files + ); + rafs.0 + .import(rafs.1, prefetch_files) + .map_err(|e| FsError::BackendFs(format!("Import rafs failed: {e:?}")))?; + info!( + "{}: Rafs imported with prefetch_list_path {:?}", + VIRTIO_FS_NAME, prefetch_list_path + ); + let fs = Box::new(rafs.0); + match self.fs.mount(fs, mountpoint) { + Ok(idx) => { + self.backend_fs.insert( + mountpoint.to_string(), + BackendFsInfo { + index: idx, + fstype: RAFS.to_string(), + src_cfg: Some((source, rafs_cfg)), + }, + ); + Ok(()) + } + Err(e) => { + error!("Rafs mount failed: {:?}", e); + Err(FsError::InvalidData) + } + } + } + + fn 
update_rafs( + &mut self, + source: Option, + mountpoint: &str, + config: Option, + ) -> FsResult<()> { + if config.is_none() { + return Err(FsError::BackendFs("no rafs config file".to_string())); + } + if source.is_none() { + return Err(FsError::BackendFs(format!( + "rafs mounted at {mountpoint} doesn't have source configured" + ))); + } + // safe because config is not None. + let config = config.unwrap(); + let source = source.unwrap(); + let rafs_conf: Arc = + Arc::new(serde_json::from_str(&config).map_err(|e| FsError::BackendFs(e.to_string()))?); + // Update rafs config, update BackendFsInfo as well. + let new_info = match self.backend_fs.get(mountpoint) { + Some(orig_info) => BackendFsInfo { + index: orig_info.index, + fstype: orig_info.fstype.clone(), + src_cfg: Some((source.to_string(), config)), + }, + None => { + return Err(FsError::BackendFs(format!( + "rafs mount point {mountpoint} is not mounted" + ))); + } + }; + let rootfs = match self.fs.get_rootfs(mountpoint) { + Ok(fs) => match fs { + Some(f) => f, + None => { + return Err(FsError::BackendFs(format!( + "rafs get_rootfs() failed: mountpoint {mountpoint} not mounted" + ))); + } + }, + Err(e) => { + return Err(FsError::BackendFs(format!( + "rafs get_rootfs() failed: {e:?}" + ))); + } + }; + let any_fs = rootfs.deref().as_any(); + if let Some(fs_swap) = any_fs.downcast_ref::() { + let mut file = ::from_file(&source) + .map_err(|e| FsError::BackendFs(format!("RafsIoRead failed: {e:?}")))?; + + fs_swap + .update(&mut file, &rafs_conf) + .map_err(|e| FsError::BackendFs(format!("Update rafs failed: {e:?}")))?; + self.backend_fs.insert(mountpoint.to_string(), new_info); + Ok(()) + } else { + Err(FsError::BackendFs("no rafs is found".to_string())) + } + } + + fn register_mmap_region( + &mut self, + vm_fd: Arc, + guest_addr: u64, + len: u64, + slot_res: &[u32], + ) -> Result> { + // Create file backend for virtiofs's mmap region to let goku and + // vhost-user slave can remap memory by memfd. 
However, this is not a + // complete solution, because when dax is actually on, they need to be + // notified of the change in the dax memory mapping relationship. + let file_offset = { + let fd = memfd::memfd_create( + // safe to unwrap, no nul byte in file name + &CString::new("virtio_fs_mem").unwrap(), + memfd::MemFdCreateFlag::empty(), + ) + .map_err(|e| Error::VirtioFs(FsError::MemFdCreate(e)))?; + let file: File = unsafe { File::from_raw_fd(fd) }; + file.set_len(len) + .map_err(|e| Error::VirtioFs(FsError::SetFileSize(e)))?; + Some(FileOffset::new(file, 0)) + }; + + // unmap will be handled on MmapRegion'd Drop. + let mmap_region = MmapRegion::build( + file_offset, + len as usize, + libc::PROT_NONE, + libc::MAP_ANONYMOUS | libc::MAP_NORESERVE | libc::MAP_PRIVATE, + ) + .map_err(Error::NewMmapRegion)?; + + let host_addr: u64 = mmap_region.as_ptr() as u64; + let kvm_mem_region = kvm_userspace_memory_region { + slot: slot_res[0], + flags: 0, + guest_phys_addr: guest_addr, + memory_size: len, + userspace_addr: host_addr, + }; + debug!( + "{}: mmio shared memory kvm_region: {:?}", + self.id, kvm_mem_region, + ); + + // Safe because the user mem region is just created, and kvm slot is allocated + // by resource allocator. + unsafe { + vm_fd + .set_user_memory_region(kvm_mem_region) + .map_err(Error::SetUserMemoryRegion)? 
+ }; + + let region = Arc::new( + GuestRegionMmap::new(mmap_region, GuestAddress(guest_addr)) + .map_err(Error::InsertMmap)?, + ); + self.handler.insert_region(region.clone())?; + + Ok(region) + } +} + +fn parse_prefetch_files(prefetch_list_path: Option) -> Option> { + let prefetch_files: Option> = match prefetch_list_path { + Some(p) => { + match File::open(p.as_str()) { + Ok(f) => { + let r = BufReader::new(f); + // All prefetch files should be absolute path + let v: Vec = r + .lines() + .filter(|l| { + let lref = l.as_ref(); + lref.is_ok() && lref.unwrap().starts_with('/') + }) + .map(|l| PathBuf::from(l.unwrap().as_str())) + .collect(); + if v.is_empty() { + None + } else { + Some(v) + } + } + Err(e) => { + // We could contineu without prefetch files, just print warning and return + warn!( + "{}: Open prefetch_file_path {} failed: {:?}", + VIRTIO_FS_NAME, + p.as_str(), + e + ); + None + } + } + } + None => None, + }; + prefetch_files +} + +fn kb_to_bytes(kb: u64) -> FsResult { + if (kb & 0xffc0_0000_0000_0000) != 0 { + error!( + "dax_threshold_size_kb * 1024 overflow. dax_threshold_size_kb is 0x{:x}.", + kb + ); + return Err(FsError::InvalidData); + } + + let bytes = kb << 10; + Ok(bytes) +} + +fn set_default_rlimit_nofile() -> Result<()> { + // Our default RLIMIT_NOFILE target. + let mut max_fds: u64 = 300_000; + // leave at least this many fds free + let reserved_fds: u64 = 16_384; + + // Reduce max_fds below the system-wide maximum, if necessary. + // This ensures there are fds available for other processes so we + // don't cause resource exhaustion. 
+ let mut file_max = String::new(); + let mut f = File::open("/proc/sys/fs/file-max").map_err(|e| { + error!( + "{}: failed to read /proc/sys/fs/file-max {:?}", + VIRTIO_FS_NAME, e + ); + Error::IOError(e) + })?; + f.read_to_string(&mut file_max)?; + let file_max = file_max.trim().parse::().map_err(|e| { + error!("{}: read fs.file-max sysctl wrong {:?}", VIRTIO_FS_NAME, e); + Error::InvalidInput + })?; + if file_max < 2 * reserved_fds { + error!( + "{}: The fs.file-max sysctl ({}) is too low to allow a reasonable number of open files ({}).", + VIRTIO_FS_NAME, file_max, 2 * reserved_fds + ); + return Err(Error::InvalidInput); + } + + max_fds = std::cmp::min(file_max - reserved_fds, max_fds); + let rlimit_nofile = Resource::NOFILE + .get() + .map(|(curr, _)| if curr >= max_fds { 0 } else { max_fds }) + .map_err(|e| { + error!("{}: failed to get rlimit {:?}", VIRTIO_FS_NAME, e); + Error::IOError(e) + })?; + + if rlimit_nofile == 0 { + info!( + "{}: original rlimit nofile is greater than max_fds({}), keep rlimit nofile setting", + VIRTIO_FS_NAME, max_fds + ); + Ok(()) + } else { + info!( + "{}: set rlimit {} (max_fds {})", + VIRTIO_FS_NAME, rlimit_nofile, max_fds + ); + + Resource::NOFILE + .set(rlimit_nofile, rlimit_nofile) + .map_err(|e| { + error!("{}: failed to set rlimit {:?}", VIRTIO_FS_NAME, e); + Error::IOError(e) + }) + } +} + +impl VirtioDevice for VirtioFs +where + AS: 'static + GuestAddressSpace + Clone + Send + Sync, + AS::T: Send, + AS::M: Sync + Send, + Q: QueueT + Send + 'static, +{ + fn device_type(&self) -> u32 { + TYPE_VIRTIO_FS + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.queue_sizes + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioDevice::set_acked_features({}, 0x{:x})", + self.id, + page, + value + ); + self.device_info.set_acked_features(page, value) + } + + fn 
read_config(&mut self, offset: u64, data: &mut [u8]) -> ConfigResult { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioDevice::read_config(0x{:x}, {:?})", + self.id, + offset, + data + ); + self.device_info.read_config(offset, data) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioDevice::write_config(0x{:x}, {:?})", + self.id, + offset, + data + ); + self.device_info.write_config(offset, data) + } + + fn activate(&mut self, config: VirtioDeviceConfig) -> ActivateResult { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioDevice::activate()", + self.id + ); + + self.device_info.check_queue_sizes(&config.queues)?; + + let (sender, receiver) = mpsc::channel(); + self.sender = Some(sender); + let rate_limiter = self.rate_limiter.take().unwrap_or_default(); + let patch_rate_limiter_fd = self.patch_rate_limiter_fd.try_clone().map_err(|e| { + error!( + "{}: failed to clone patch rate limiter eventfd {:?}", + VIRTIO_FS_NAME, e + ); + ActivateError::InternalError + })?; + + let cache_handler = if let Some((addr, _guest_addr)) = config.get_shm_region_addr() { + let handler = CacheHandler { + cache_size: self.cache_size, + mmap_cache_addr: addr, + id: self.id.clone(), + }; + + Some(handler) + } else { + None + }; + + let handler = VirtioFsEpollHandler::new( + config, + self.fs.clone(), + cache_handler, + self.thread_pool_size, + self.id.clone(), + rate_limiter, + patch_rate_limiter_fd, + Some(receiver), + ); + + self.subscriber_id = Some(self.device_info.register_event_handler(Box::new(handler))); + + Ok(()) + } + + // Please keep in synchronization with vhost/fs.rs + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, + ) { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioDevice::get_resource_requirements()", + self.id + ); + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + if use_generic_irq { + // Allocate one irq for device configuration change 
events, and one irq for each queue. + requests.push(ResourceConstraint::GenericIrq { + size: (self.queue_sizes.len() + 1) as u32, + }); + } + + // Check if we have dax enabled or not, just return if no dax window requested. + if !self.is_dax_on() { + info!("{}: DAX window is disabled.", self.id); + return; + } + + // Request for DAX window. The memory needs to be 2MiB aligned in order to support + // hugepages, and needs to be above 4G to avoid confliction with lapic/ioapic devices. + requests.push(ResourceConstraint::MmioAddress { + range: Some((0x1_0000_0000, std::u64::MAX)), + align: 0x0020_0000, + size: self.cache_size, + }); + + // Request for new kvm memory slot for DAX window. + requests.push(ResourceConstraint::KvmMemSlot { + slot: None, + size: 1, + }); + } + + // Please keep in synchronization with vhost/fs.rs + fn set_resource( + &mut self, + vm_fd: Arc, + resource: DeviceResources, + ) -> Result>> { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioDevice::set_resource()", + self.id + ); + + let mmio_res = resource.get_mmio_address_ranges(); + let slot_res = resource.get_kvm_mem_slots(); + + // Do nothing if there's no dax window requested. + if mmio_res.is_empty() { + return Ok(None); + } + + // Make sure we have the correct resource as requested, and currently we only support one + // shm region for DAX window (version table and journal are not supported yet). 
+ if mmio_res.len() != slot_res.len() || mmio_res.len() != 1 { + error!( + "{}: wrong number of mmio or kvm slot resource ({}, {})", + self.id, + mmio_res.len(), + slot_res.len() + ); + return Err(Error::InvalidResource); + } + + let guest_addr = mmio_res[0].0; + let cache_len = mmio_res[0].1; + + let mmap_region = self.register_mmap_region(vm_fd, guest_addr, cache_len, &slot_res)?; + + Ok(Some(VirtioSharedMemoryList { + host_addr: mmap_region.deref().deref().as_ptr() as u64, + guest_addr: GuestAddress(guest_addr), + len: cache_len as GuestUsize, + kvm_userspace_memory_region_flags: 0, + kvm_userspace_memory_region_slot: slot_res[0], + region_list: vec![VirtioSharedMemory { + offset: 0, + len: cache_len, + }], + mmap_region, + })) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +#[cfg(test)] +pub mod tests { + #[cfg(feature = "test-resources")] + use std::env::temp_dir; + use std::io::Write; + use std::path::PathBuf; + use std::sync::Arc; + + use dbs_device::resources::DeviceResources; + use dbs_interrupt::NoopNotifier; + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::GuestMemoryRegion; + use vm_memory::{GuestAddress, GuestMemoryMmap, GuestRegionMmap}; + use vmm_sys_util::tempfile::TempFile; + use Error as VirtIoError; + + use super::*; + use crate::device::VirtioRegionHandler; + use crate::{ActivateError, VirtioQueueConfig, TYPE_VIRTIO_FS}; + + pub(crate) const TAG: &str = "test"; + pub(crate) const NUM_QUEUES: usize = 1; + pub(crate) const QUEUE_SIZE: u16 = 1024; + pub(crate) const CACHE_SIZE: u64 = 0; + pub(crate) const THREAD_NUM: u16 = 10; + pub(crate) const CACHE_POLICY: &str = "auto"; + pub(crate) const WB_CACHE: bool = true; + pub(crate) const NO_OPEN: bool = true; + pub(crate) const NO_READDIR: bool = false; + pub(crate) const KILLPRIV_V2: bool = false; + pub(crate) const XATTR: bool = false; + pub(crate) const DROP_SYS_RSC: bool = false; + pub(crate) const 
FS_EVENTS_COUNT: u32 = 4; + + pub struct DummyVirtioRegionHandler {} + + impl VirtioRegionHandler for DummyVirtioRegionHandler { + fn insert_region( + &mut self, + _region: Arc, + ) -> std::result::Result<(), VirtIoError> { + Ok(()) + } + } + + pub fn new_dummy_handler_helper() -> Box { + Box::new(DummyVirtioRegionHandler {}) + } + + #[cfg(feature = "test-resources")] + fn create_fs_device_default() -> VirtioFs> { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + + fs + } + + pub(crate) fn create_fs_epoll_handler( + id: String, + ) -> VirtioFsEpollHandler, QueueSync, GuestRegionMmap> { + let vfs = Arc::new(Vfs::new(VfsOptions::default())); + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0x0), 0x10000)]).unwrap()); + let queues = vec![ + VirtioQueueConfig::create(256, 0).unwrap(), + VirtioQueueConfig::create(256, 0).unwrap(), + ]; + let rate_limiter = RateLimiter::default(); + + // Call for kvm too frequently would cause error in some host kernel. 
+ std::thread::sleep(std::time::Duration::from_millis(5)); + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::new( + mem, + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + VirtioFsEpollHandler::new( + config, + vfs, + None, + 2, + id, + rate_limiter, + EventFd::new(0).unwrap(), + None, + ) + } + + #[test] + fn test_virtio_fs_device_create_error() { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + + // invalid cache policy + let res: Result>> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + "dummy_policy", + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager.clone(), + Some(rate_limiter), + ); + assert!(res.is_err()); + + // drop_sys_resource with write_back_cache + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let res: Result>> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + true, + NO_OPEN, + KILLPRIV_V2, + XATTR, + true, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ); + assert!(res.is_err()); + } + + #[test] + fn test_virtio_fs_device_normal() { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let mut fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + + assert!(!fs.is_dax_on()); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::device_type(&fs), + TYPE_VIRTIO_FS + ); + let queue_size = vec![QUEUE_SIZE; NUM_QUEUE_OFFSET + NUM_QUEUES]; 
+ assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::queue_max_sizes( + &fs + ), + &queue_size[..] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&fs, 0), + fs.device_info.get_avail_features(0) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&fs, 1), + fs.device_info.get_avail_features(1) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&fs, 2), + fs.device_info.get_avail_features(2) + ); + VirtioDevice::>, QueueSync, GuestRegionMmap>::set_acked_features( + &mut fs, 2, 0, + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&fs, 2), + 0); + let mut config: [u8; 1] = [0]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut fs, + 0, + &mut config, + ) + .unwrap(); + let config: [u8; 16] = [0; 16]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::write_config( + &mut fs, 0, &config, + ) + .unwrap(); + } + + #[test] + fn test_virtio_fs_device_active() { + let epoll_manager = EpollManager::default(); + { + // config queue size is not 2 + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let mut fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager.clone(), + Some(rate_limiter), + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues: Vec> = Vec::new(); + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + assert!(matches!( + fs.activate(config), + Err(ActivateError::InvalidParam) + )); + } + + { + // Ok + let rate_limiter = 
RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let mut fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![ + VirtioQueueConfig::::create(1024, 0).unwrap(), + VirtioQueueConfig::::create(2, 0).unwrap(), + ]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + let result = fs.activate(config); + assert!(result.is_ok()); + } + } + + // this test case need specific resources and is recommended to run + // via dbuvm docker image + #[test] + #[cfg(feature = "test-resources")] + fn test_fs_manipulate_backend_fs() { + let source = "/test_resources/nydus-rs/bootstrap/image_v2.boot"; + let source_path = PathBuf::from(source); + let bootstrapfile = source_path.to_str().unwrap().to_string(); + if !source_path.exists() { + panic!("Test resource file not found: {}", bootstrapfile); + } + // mount + { + // invalid fs type + { + let mut fs = create_fs_device_default(); + let res = fs.manipulate_backend_fs( + None, + Some(String::from("dummyFs")), + "/mountpoint", + None, + "mount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + } + // passthroughFs + { + let mut fs = create_fs_device_default(); + + // no mount source + let res = fs.manipulate_backend_fs( + None, + Some(String::from("PassthroughFs")), + "/mountpoint", + None, + "mount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // invalid mount source + let res = fs.manipulate_backend_fs( + 
Some(String::from("dummy_source_path")), + Some(String::from("PassthroughFs")), + "/mountpoint", + None, + "mount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // success + let mount_dir = temp_dir(); + let mount_path = mount_dir.into_os_string().into_string().unwrap(); + fs.manipulate_backend_fs( + Some(mount_path), + Some(String::from("PassthroughFs")), + "/mountpoint", + None, + "mount", + None, + None, + ) + .unwrap(); + } + // Rafs + { + let mut fs = create_fs_device_default(); + + // no mount source + let res = fs.manipulate_backend_fs( + None, + Some(String::from("Rafs")), + "/mountpoint", + None, + "mount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // invalid mount source + let res = fs.manipulate_backend_fs( + Some(String::from("dummy_source_path")), + Some(String::from("Rafs")), + "/mountpoint", + None, + "mount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // invalid rafs cfg format + let dummy_rafs_cfg = r#" + { + "device": { + "backend": { + "type": "oss", + "config": { + "endpoint": "test" + } + } + } + }"#; + let res = fs.manipulate_backend_fs( + Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/mountpoint", + Some(String::from(dummy_rafs_cfg)), + "mount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // success + let rafs_cfg = r#" + { + "device": { + "backend": { + "type": "oss", + "config": { + "endpoint": "test", + "access_key_id": "test", + "access_key_secret": "test", + "bucket_name": "antsys-nydus", + "object_prefix":"nydus_v2/", + "scheme": "http" + } + } + }, + "mode": "direct", + "digest_validate": false, + "enable_xattr": true, + "fs_prefetch": { + "enable": true, + "threads_count": 10, + "merging_size": 131072, + "bandwidth_rate": 10485760 + } + }"#; + fs.manipulate_backend_fs( + Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/mountpoint", + Some(String::from(rafs_cfg)), + 
"mount", + None, + None, + ) + .unwrap(); + } + } + // umount + { + let mut fs = create_fs_device_default(); + + // invalid mountpoint + let res = fs.manipulate_backend_fs( + None, + None, + "/dummy_mountpoint", + None, + "umount", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // success + let mut fs = create_fs_device_default(); + let dummy_dir = temp_dir(); + let dummy_path = dummy_dir.into_os_string().into_string().unwrap(); + fs.manipulate_backend_fs( + Some(dummy_path), + Some(String::from("PassthroughFs")), + "/mountpoint", + None, + "mount", + None, + None, + ) + .unwrap(); + fs.manipulate_backend_fs(None, None, "/mountpoint", None, "umount", None, None) + .unwrap(); + } + + // update + { + let mut fs = create_fs_device_default(); + let rafs_cfg = r#" + { + "device": { + "backend": { + "type": "oss", + "config": { + "endpoint": "test", + "access_key_id": "test", + "access_key_secret": "test", + "bucket_name": "antsys-nydus", + "object_prefix":"nydus_v2/", + "scheme": "http" + } + } + }, + "mode": "direct", + "digest_validate": false, + "enable_xattr": true, + "fs_prefetch": { + "enable": true, + "threads_count": 10, + "merging_size": 131072, + "bandwidth_rate": 10485760 + } + }"#; + // no config + let res = fs.manipulate_backend_fs( + Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/mountpoint", + None, + "update", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // no source configured + let res = fs.manipulate_backend_fs( + Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/mountpoint", + Some(String::from(rafs_cfg)), + "update", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // invalid mountpoint + fs.manipulate_backend_fs( + Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/mountpoint", + Some(String::from(rafs_cfg)), + "mount", + None, + None, + ) + .unwrap(); + + let res = fs.manipulate_backend_fs( + 
Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/dummy_mountpoint", + Some(String::from(rafs_cfg)), + "update", + None, + None, + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + + // success + fs.manipulate_backend_fs( + Some(bootstrapfile.clone()), + Some(String::from("Rafs")), + "/mountpoint", + Some(String::from(rafs_cfg)), + "mount", + None, + None, + ) + .unwrap(); + + let res = fs.manipulate_backend_fs( + Some(bootstrapfile), + Some(String::from("Rafs")), + "/mountpoint", + Some(String::from(rafs_cfg)), + "update", + None, + None, + ); + assert!(res.is_ok()); + } + + // invalid operation + { + let mut fs = create_fs_device_default(); + let res = fs.manipulate_backend_fs( + None, + None, + "/mountpoint", + None, + "dummy_ops", + None, + Some(1024 * 1024 * 1024), + ); + assert!(matches!(res, Err(FsError::BackendFs(_)))); + } + } + + #[test] + fn test_parse_prefetch_files() { + // Non-empty prefetch list + let tmp_file = TempFile::new().unwrap(); + writeln!(tmp_file.as_file(), "/hello.txt").unwrap(); + writeln!(tmp_file.as_file()).unwrap(); + writeln!(tmp_file.as_file(), " ").unwrap(); + writeln!(tmp_file.as_file(), "\t").unwrap(); + writeln!(tmp_file.as_file(), "/").unwrap(); + writeln!(tmp_file.as_file(), "\n").unwrap(); + writeln!(tmp_file.as_file(), "test").unwrap(); + + let files = parse_prefetch_files(Some(tmp_file.as_path().to_str().unwrap().to_string())); + assert_eq!( + files, + Some(vec![PathBuf::from("/hello.txt"), PathBuf::from("/")]) + ); + + // Empty prefetch list + let tmp_file = TempFile::new().unwrap(); + let files = parse_prefetch_files(Some(tmp_file.as_path().to_str().unwrap().to_string())); + assert_eq!(files, None); + + // None prefetch list + let files = parse_prefetch_files(None); + assert_eq!(files, None); + + // Not exist prefetch list + let files = parse_prefetch_files(Some("no_such_file".to_string())); + assert_eq!(files, None); + } + + #[test] + #[allow(clippy::unusual_byte_groupings)] + fn test_kb_to_bytes() 
{ + let kb = 0x1000; + assert_eq!(kb_to_bytes(kb).unwrap(), 0x400_000); + + let kb = 0x100_0000; + assert_eq!(kb_to_bytes(kb).unwrap(), 0x400_00_0000); + + let kb = 0x20_0000_0000_0000; + assert_eq!(kb_to_bytes(kb).unwrap(), 0x8000_0000_0000_0000); + + let kb = 0x100_0000_0000_0000; + assert!(kb_to_bytes(kb).is_err()); + + let kb = 0x1000_0000_0000_0000; + assert!(kb_to_bytes(kb).is_err()); + + let kb = 0x1100_0000_0000_0000; + assert!(kb_to_bytes(kb).is_err()); + } + + #[test] + fn test_get_timeout() { + fn create_fs_device_with_cache_policy(policy: &str) -> VirtioFs> { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + policy, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + fs + } + let fs = create_fs_device_with_cache_policy("auto"); + assert_eq!(fs.get_timeout(), Duration::from_secs(CACHE_AUTO_TIMEOUT)); + let fs = create_fs_device_with_cache_policy("always"); + assert_eq!(fs.get_timeout(), Duration::from_secs(CACHE_ALWAYS_TIMEOUT)); + let fs = create_fs_device_with_cache_policy("never"); + assert_eq!(fs.get_timeout(), Duration::from_secs(CACHE_NONE_TIMEOUT)); + } + + #[test] + fn test_register_mmap_region() { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let mut fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let mut resources = DeviceResources::new(); + let entry = 
dbs_device::resources::Resource::MmioAddressRange { + base: 0x1000, + size: 0x1000, + }; + resources.append(entry); + let entry = dbs_device::resources::Resource::KvmMemSlot(0); + resources.append(entry); + + let mmio_res = resources.get_mmio_address_ranges(); + let slot_res = resources.get_kvm_mem_slots(); + let start = mmio_res[0].0; + let len = mmio_res[0].1; + let res = fs.register_mmap_region(vm_fd, start, len, &slot_res); + assert!(res.is_ok()); + assert_eq!(res.unwrap().start_addr(), GuestAddress(0x1000)); + } + + #[test] + fn test_get_resource_requirements() { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let dax_on = 0x4000; + let fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + dax_on, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + let mut requirements = vec![ + ResourceConstraint::new_mmio(0x1), + ResourceConstraint::new_mmio(0x2), + ]; + VirtioDevice::, QueueSync, GuestRegionMmap>::get_resource_requirements( + &fs, + &mut requirements, + true, + ); + + assert_eq!(requirements[2], ResourceConstraint::LegacyIrq { irq: None }); + assert_eq!(requirements[3], ResourceConstraint::GenericIrq { size: 3 }); + assert_eq!( + requirements[5], + ResourceConstraint::KvmMemSlot { + slot: None, + size: 1 + } + ); + } + + #[test] + fn test_set_resource() { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let mut fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + let kvm = Kvm::new().unwrap(); + let vm_fd = 
Arc::new(kvm.create_vm().unwrap()); + let mut resources = DeviceResources::new(); + let entry = dbs_device::resources::Resource::MmioAddressRange { + base: 0x1000, + size: 0x1000, + }; + resources.append(entry); + let entry = dbs_device::resources::Resource::KvmMemSlot(0); + resources.append(entry); + + let res = VirtioDevice::, QueueSync, GuestRegionMmap>::set_resource( + &mut fs, vm_fd, resources, + ); + assert!(res.is_ok()); + let content = res.unwrap().unwrap(); + assert_eq!(content.kvm_userspace_memory_region_slot, 0); + assert_eq!(content.region_list[0].offset, 0); + assert_eq!(content.region_list[0].len, 0x1000); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/fs/handler.rs b/src/dragonball/src/dbs_virtio_devices/src/fs/handler.rs new file mode 100644 index 000000000..b976c89a1 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/fs/handler.rs @@ -0,0 +1,781 @@ +// Copyright 2020 Alibaba Cloud. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::io::Error as IOError; +use std::ops::Deref; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::{mpsc, Arc, Mutex}; + +use dbs_utils::epoll_manager::{EventOps, EventSet, Events, MutEventSubscriber}; +use dbs_utils::rate_limiter::{BucketUpdate, RateLimiter, TokenType}; +use fuse_backend_rs::abi::virtio_fs::RemovemappingOne; +use fuse_backend_rs::api::server::Server; +use fuse_backend_rs::api::Vfs; +use fuse_backend_rs::transport::{FsCacheReqHandler, Reader, VirtioFsWriter, Writer}; +use log::{debug, error, info, trace}; +use threadpool::ThreadPool; +use virtio_queue::{QueueOwnedT, QueueT}; +use vm_memory::{GuestAddressSpace, GuestMemoryRegion}; +use vmm_sys_util::eventfd::EventFd; + +use crate::{Error, Result, VirtioDeviceConfig}; + +use super::{Error as FsError, VIRTIO_FS_NAME}; + +// New descriptors are pending on the virtio queue. 
+const QUEUE_AVAIL_EVENT: u32 = 0; + +// two rate limiter events +const RATE_LIMITER_EVENT_COUNT: u32 = 2; + +/// CacheHandler handles DAX window mmap/unmap operations +#[derive(Clone)] +pub struct CacheHandler { + /// the size of memory region allocated for virtiofs + pub(crate) cache_size: u64, + + /// the address of mmap region corresponding to the memory region + pub(crate) mmap_cache_addr: u64, + + /// the device ID + pub(crate) id: String, +} + +impl CacheHandler { + /// Make sure request is within cache range + fn is_req_valid(&self, offset: u64, len: u64) -> bool { + // TODO: do we need to validate alignment here? + match offset.checked_add(len) { + Some(n) => n <= self.cache_size, + None => false, + } + } +} + +impl FsCacheReqHandler for CacheHandler { + // Do not close fd in here. The fd is automatically closed in the setupmapping + // of passthrough_fs when destructing. + fn map( + &mut self, + foffset: u64, + moffset: u64, + len: u64, + flags: u64, + fd: RawFd, + ) -> std::result::Result<(), IOError> { + let addr = self.mmap_cache_addr + moffset; + trace!( + target: VIRTIO_FS_NAME, + "{}: CacheHandler::map(): fd={}, foffset=0x{:x}, moffset=0x{:x}(host addr: 0x{:x}), len=0x{:x}, flags=0x{:x}", + self.id, + fd, + foffset, + moffset, + addr, + len, + flags + ); + + if !self.is_req_valid(moffset, len) { + error!( + "{}: CacheHandler::map(): Wrong offset or length, offset=0x{:x} len=0x{:x} cache_size=0x{:x}", + self.id, moffset, len, self.cache_size + ); + return Err(IOError::from_raw_os_error(libc::EINVAL)); + } + + // TODO: + // In terms of security, DAX does not easily handle all kinds of write + // scenarios, especially append write. Therefore, to prevent guest users + // from using the DAX to write files maliciously, we do not support guest + // write permission configuration. If DAX needs to support write, we can + // add write permissions by Control path. 
+ let ret = unsafe { + libc::mmap( + addr as *mut libc::c_void, + len as usize, + libc::PROT_READ, + libc::MAP_SHARED | libc::MAP_FIXED, + fd, + foffset as libc::off_t, + ) + }; + if ret == libc::MAP_FAILED { + let e = IOError::last_os_error(); + error!("{}: CacheHandler::map() failed: {}", VIRTIO_FS_NAME, e); + return Err(e); + } + + Ok(()) + } + + fn unmap(&mut self, requests: Vec) -> std::result::Result<(), IOError> { + trace!(target: VIRTIO_FS_NAME, "{}: CacheHandler::unmap()", self.id,); + + for req in requests { + let mut offset = req.moffset; + let mut len = req.len; + + // Ignore if the length is 0. + if len == 0 { + continue; + } + + debug!( + "{}: do unmap(): offset=0x{:x} len=0x{:x} cache_size=0x{:x}", + self.id, offset, len, self.cache_size + ); + + // Need to handle a special case where the slave ask for the unmapping + // of the entire mapping. + if len == 0xffff_ffff_ffff_ffff { + len = self.cache_size; + offset = 0; + } + + if !self.is_req_valid(offset, len) { + error!( + "{}: CacheHandler::unmap(): Wrong offset or length, offset=0x{:x} len=0x{:x} cache_size=0x{:x}", + self.id, offset, len, self.cache_size + ); + return Err(IOError::from_raw_os_error(libc::EINVAL)); + } + + let addr = self.mmap_cache_addr + offset; + // Use mmap + PROT_NONE can reserve host userspace address while unmap memory. + // In this way, guest will not be able to access the memory, and dragonball + // also can reserve the HVA. 
+ let ret = unsafe { + libc::mmap( + addr as *mut libc::c_void, + len as usize, + libc::PROT_NONE, + libc::MAP_ANONYMOUS | libc::MAP_PRIVATE | libc::MAP_FIXED, + -1, + 0_i64, + ) + }; + if ret == libc::MAP_FAILED { + let e = IOError::last_os_error(); + error!("{}: CacheHandler::unmap() failed, {}", self.id, e); + return Err(e); + } + } + + Ok(()) + } +} + +pub(crate) struct VirtioFsEpollHandler< + AS: 'static + GuestAddressSpace, + Q: QueueT, + R: GuestMemoryRegion, +> { + pub(crate) config: Arc>>, + server: Arc>>, + cache_handler: Option, + thread_pool: Option, + id: String, + rate_limiter: RateLimiter, + patch_rate_limiter_fd: EventFd, + receiver: Option>, +} + +impl VirtioFsEpollHandler +where + AS: GuestAddressSpace + Clone + Send, + AS::T: Send, + AS::M: Sync + Send, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Send + Sync + 'static, +{ + #[allow(clippy::too_many_arguments)] + pub(crate) fn new( + config: VirtioDeviceConfig, + fs: Arc, + cache_handler: Option, + thread_pool_size: u16, + id: String, + rate_limiter: RateLimiter, + patch_rate_limiter_fd: EventFd, + receiver: Option>, + ) -> Self { + let thread_pool = if thread_pool_size > 0 { + Some(ThreadPool::with_name( + "virtiofs-thread".to_string(), + thread_pool_size as usize, + )) + } else { + None + }; + Self { + config: Arc::new(Mutex::new(config)), + server: Arc::new(Server::new(fs)), + cache_handler, + thread_pool, + id, + rate_limiter, + patch_rate_limiter_fd, + receiver, + } + } + + fn process_queue(&mut self, queue_index: usize) -> Result<()> { + let mut config_guard = self.config.lock().unwrap(); + let mem = config_guard.lock_guest_memory(); + let vm_as = config_guard.vm_as.clone(); + let queue = &mut config_guard.queues[queue_index]; + let (tx, rx) = mpsc::channel::<(u16, u32)>(); + let mut used_count = 0; + let mut rate_limited = false; + // TODO: use multiqueue to process new entries. 
+ + let mut queue_guard = queue.queue_mut().lock(); + let mut iter = queue_guard + .iter(mem.clone()) + .map_err(Error::VirtioQueueError)?; + + for desc_chain in &mut iter { + // Prepare a set of objects that can be moved to the worker thread. + if !self.rate_limiter.consume(1, TokenType::Ops) { + rate_limited = true; + break; + } + + let head_index = desc_chain.head_index(); + let server = self.server.clone(); + let vm_as = vm_as.clone(); + let config = self.config.clone(); + let pooled = self.is_multi_thread(); + let tx = tx.clone(); + used_count += 1; + let mut cache_handler = self.cache_handler.clone(); + + let work_func = move || { + let guard = vm_as.memory(); + let mem = guard.deref(); + let reader = Reader::from_descriptor_chain(mem, desc_chain.clone()) + .map_err(FsError::InvalidDescriptorChain) + .unwrap(); + let writer = Writer::VirtioFs( + VirtioFsWriter::new(mem, desc_chain) + .map_err(FsError::InvalidDescriptorChain) + .unwrap(), + ); + let total = server + .handle_message( + reader, + writer, + cache_handler + .as_mut() + .map(|x| x as &mut dyn FsCacheReqHandler), + None, + ) + .map_err(FsError::ProcessQueue) + .unwrap(); + + if pooled { + let queue = &mut config.lock().unwrap().queues[queue_index]; + queue.add_used(mem, head_index, total as u32); + if let Err(e) = queue.notify() { + error!("failed to signal used queue: {:?}", e); + } + } else { + tx.send((head_index, total as u32)) + .expect("virtiofs: failed to send fuse result"); + } + }; + + if let Some(pool) = &self.thread_pool { + trace!("{}: poping new fuse req to thread pool.", VIRTIO_FS_NAME,); + pool.execute(work_func); + } else { + work_func(); + } + } + if rate_limited { + iter.go_to_previous_position(); + } + + let notify = !self.is_multi_thread() && used_count > 0; + // unlock QueueT + drop(queue_guard); + while !self.is_multi_thread() && used_count > 0 { + used_count -= 1; + let (idx, ret) = rx + .recv() + .expect("virtiofs: failed to recv result from thread pool"); + 
queue.add_used(mem.deref(), idx, ret); + } + + if notify { + if let Err(e) = queue.notify() { + error!("failed to signal used queue: {:?}", e); + } + } + + Ok(()) + } + + pub fn get_patch_rate_limiters(&mut self, bytes: BucketUpdate, ops: BucketUpdate) { + info!("{}: Update rate limiter for fs device", VIRTIO_FS_NAME); + match &bytes { + BucketUpdate::Update(tb) => { + info!( + "{}: update bandwidth, \"size\": {}, \"one_time_burst\": {}, \"refill_time\": {}", + VIRTIO_FS_NAME, + tb.capacity(), + tb.one_time_burst(), + tb.refill_time_ms() + ); + } + BucketUpdate::None => { + info!("{}: no update for bandwidth", VIRTIO_FS_NAME); + } + _ => { + info!("{}: bandwidth limiting is disabled", VIRTIO_FS_NAME); + } + } + match &ops { + BucketUpdate::Update(tb) => { + info!( + "{}: update ops, \"size\": {}, \"one_time_burst\": {}, \"refill_time\": {}", + VIRTIO_FS_NAME, + tb.capacity(), + tb.one_time_burst(), + tb.refill_time_ms() + ); + } + BucketUpdate::None => { + info!("{}: no update for ops", VIRTIO_FS_NAME); + } + _ => { + info!("{}: ops limiting is disabled", VIRTIO_FS_NAME); + } + } + self.rate_limiter.update_buckets(bytes, ops); + } + + // True if thread pool is enabled. + fn is_multi_thread(&self) -> bool { + self.thread_pool.is_some() + } +} + +impl MutEventSubscriber for VirtioFsEpollHandler +where + AS: GuestAddressSpace + Send + Sync + 'static + Clone, + AS::T: Send, + AS::M: Sync + Send, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Send + Sync + 'static, +{ + fn process(&mut self, events: Events, _ops: &mut EventOps) { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioFsHandler::process({})", + self.id, + events.data() + ); + + let slot = events.data(); + let config = &self.config.clone(); + let guard = config.lock().unwrap(); + let queues = &guard.queues; + + let queues_len = queues.len() as u32; + // Rate limiter budget is now available. 
+ let rate_limiter_event = QUEUE_AVAIL_EVENT + queues_len; + // patch request of rate limiter has arrived + let patch_rate_limiter_event = rate_limiter_event + 1; + + match slot { + s if s >= RATE_LIMITER_EVENT_COUNT + QUEUE_AVAIL_EVENT + queues_len => { + error!("{}: unknown epoll event slot {}", VIRTIO_FS_NAME, slot); + } + + s if s == rate_limiter_event => match self.rate_limiter.event_handler() { + Ok(()) => { + drop(guard); + for idx in QUEUE_AVAIL_EVENT as usize..(QUEUE_AVAIL_EVENT + queues_len) as usize + { + if let Err(e) = self.process_queue(idx) { + error!("{}: error in queue {}, {:?}", VIRTIO_FS_NAME, idx, e); + } + } + } + Err(e) => { + error!( + "{}: the rate limiter is disabled or is not blocked, {:?}", + VIRTIO_FS_NAME, e + ); + } + }, + + s if s == patch_rate_limiter_event => { + if let Err(e) = self.patch_rate_limiter_fd.read() { + error!("{}: failed to get patch event, {:?}", VIRTIO_FS_NAME, e); + } + if let Some(receiver) = &self.receiver { + if let Ok((bytes, ops)) = receiver.try_recv() { + self.get_patch_rate_limiters(bytes, ops); + } + } + } + + // QUEUE_AVAIL_EVENT + _ => { + let idx = (slot - QUEUE_AVAIL_EVENT) as usize; + if let Err(e) = queues[idx].consume_event() { + error!("{}: failed to read queue event, {:?}", VIRTIO_FS_NAME, e); + return; + } + drop(guard); + + if let Err(e) = self.process_queue(idx) { + error!( + "{}: process_queue failed due to error {:?}", + VIRTIO_FS_NAME, e + ); + } + } + } + } + + fn init(&mut self, ops: &mut EventOps) { + trace!( + target: VIRTIO_FS_NAME, + "{}: VirtioFsHandler::init()", + self.id + ); + + let queues = &self.config.lock().unwrap().queues; + + for (idx, queue) in queues.iter().enumerate() { + let events = Events::with_data( + queue.eventfd.as_ref(), + QUEUE_AVAIL_EVENT + idx as u32, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register epoll event for event queue {}, {:?}", + VIRTIO_FS_NAME, idx, e + ); + } + } + + let rate_limiter_fd = 
self.rate_limiter.as_raw_fd(); + if rate_limiter_fd != -1 { + if let Err(e) = ops.add(Events::with_data_raw( + rate_limiter_fd, + QUEUE_AVAIL_EVENT + queues.len() as u32, + EventSet::IN, + )) { + error!( + "{}: failed to register rate limiter event, {:?}", + VIRTIO_FS_NAME, e + ); + } + } + + if let Err(e) = ops.add(Events::with_data( + &self.patch_rate_limiter_fd, + 1 + QUEUE_AVAIL_EVENT + queues.len() as u32, + EventSet::IN, + )) { + error!( + "{}: failed to register rate limiter patch event {:?}", + VIRTIO_FS_NAME, e + ); + } + } +} + +#[cfg(test)] +pub mod tests { + use std::io::Seek; + use std::io::Write; + use std::sync::Arc; + + use dbs_interrupt::NoopNotifier; + use dbs_utils::epoll_manager::EpollManager; + use dbs_utils::epoll_manager::SubscriberOps; + use dbs_utils::rate_limiter::TokenBucket; + use vm_memory::{GuestAddress, GuestMemoryMmap}; + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::fs::device::tests::*; + use crate::fs::*; + use crate::tests::{VirtQueue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; + use crate::VirtioQueueConfig; + + #[test] + fn test_is_req_valid() { + let handler = CacheHandler { + cache_size: 0x1000, + mmap_cache_addr: 0x1000, + id: "test".to_string(), + }; + + // Normal case. + assert!(handler.is_req_valid(0x0, 0x500)); + + // Invalid case. + assert!(!handler.is_req_valid(0x500, 0x1000)); + } + + #[test] + fn test_map() { + let mmap_addr = 0x10000; + let moffset = 0x5000; + let mut handler = CacheHandler { + cache_size: 0x10000, + mmap_cache_addr: mmap_addr, + id: "test".to_string(), + }; + + // Normal case. 
+ let original_content = b"hello world"; + let mut file = TempFile::new().unwrap().into_file(); + file.set_len(0x1000).unwrap(); + file.write_all(original_content).unwrap(); + file.rewind().unwrap(); + let fd = file.as_raw_fd(); + handler.map(0x0, moffset, 0x5000, 0, fd).unwrap(); + let mapped_addr = (mmap_addr + moffset) as *const [u8; 11]; + unsafe { + let content = mapped_addr.read(); + assert_eq!(&content, original_content); + } + + // Invalid argument case. + assert!(matches!( + handler + .map(0x0, 0x5000, 0xc000, 0, fd) + .err() + .unwrap() + .kind(), + std::io::ErrorKind::InvalidInput + )); + + // Bad file descriptor case. + let fd = TempFile::new().unwrap().as_file().as_raw_fd(); + assert!(format!( + "{:?}", + handler.map(0x0, 0x5000, 0x5000, 0, fd).err().unwrap() + ) + .contains("Bad file descriptor")); + } + + #[test] + fn test_unmap() { + let mmap_addr = 0x10000; + let moffset = 0x5000; + let mut handler = CacheHandler { + cache_size: 0x10000, + mmap_cache_addr: mmap_addr, + id: "test".to_string(), + }; + + // Normal case after map. + let original_content = b"hello world"; + let mut file = TempFile::new().unwrap().into_file(); + file.set_len(0x1000).unwrap(); + file.write_all(original_content).unwrap(); + file.rewind().unwrap(); + let fd = file.as_raw_fd(); + handler.map(0x0, moffset, 0x5000, 0, fd).unwrap(); + let mapped_addr = (mmap_addr + moffset) as *const [u8; 11]; + unsafe { + let content = mapped_addr.read(); + assert_eq!(&content, original_content); + } + let requests = vec![ + RemovemappingOne { + moffset: 0x5000, + len: 0x1000, + }, + RemovemappingOne { + moffset: 0x6000, + len: 0x2500, + }, + ]; + assert!(handler.unmap(requests).is_ok()); + + // Normal case. 
+ let mut handler = CacheHandler { + cache_size: 0x10000, + mmap_cache_addr: mmap_addr, + id: "test".to_string(), + }; + let requests = vec![ + RemovemappingOne { + moffset: 0x5000, + len: 0x1000, + }, + RemovemappingOne { + moffset: 0x6000, + len: 0x2500, + }, + ]; + assert!(handler.unmap(requests).is_ok()); + + // Invalid argument case. + let requests = vec![RemovemappingOne { + moffset: 0x5000, + len: 0x10000, + }]; + assert!(matches!( + handler.unmap(requests).err().unwrap().kind(), + std::io::ErrorKind::InvalidInput + )); + } + + #[test] + fn test_fs_get_patch_rate_limiters() { + let mut handler = create_fs_epoll_handler(String::from("1")); + let tokenbucket = TokenBucket::new(1, 1, 4); + + handler.get_patch_rate_limiters( + BucketUpdate::None, + BucketUpdate::Update(tokenbucket.clone()), + ); + assert_eq!(handler.rate_limiter.ops().unwrap(), &tokenbucket); + + handler.get_patch_rate_limiters( + BucketUpdate::Update(tokenbucket.clone()), + BucketUpdate::None, + ); + assert_eq!(handler.rate_limiter.bandwidth().unwrap(), &tokenbucket); + + handler.get_patch_rate_limiters(BucketUpdate::None, BucketUpdate::None); + assert_eq!(handler.rate_limiter.ops().unwrap(), &tokenbucket); + + handler.get_patch_rate_limiters(BucketUpdate::None, BucketUpdate::Disabled); + assert_eq!(handler.rate_limiter.ops(), None); + + handler.get_patch_rate_limiters(BucketUpdate::Disabled, BucketUpdate::None); + assert_eq!(handler.rate_limiter.bandwidth(), None); + } + + #[test] + fn test_fs_set_patch_rate_limiters() { + let epoll_manager = EpollManager::default(); + let rate_limiter = RateLimiter::new(100, 0, 300, 10, 0, 300).unwrap(); + let mut fs: VirtioFs> = VirtioFs::new( + TAG, + NUM_QUEUES, + QUEUE_SIZE, + CACHE_SIZE, + CACHE_POLICY, + THREAD_NUM, + WB_CACHE, + NO_OPEN, + KILLPRIV_V2, + XATTR, + DROP_SYS_RSC, + NO_READDIR, + new_dummy_handler_helper(), + epoll_manager, + Some(rate_limiter), + ) + .unwrap(); + + // No sender + assert!(fs + .set_patch_rate_limiters(BucketUpdate::None, 
BucketUpdate::None) + .is_err()); + + // Success + let (sender, receiver) = mpsc::channel(); + fs.sender = Some(sender); + assert!(fs + .set_patch_rate_limiters(BucketUpdate::None, BucketUpdate::None) + .is_ok()); + + // Send error + drop(receiver); + assert!(fs + .set_patch_rate_limiters(BucketUpdate::None, BucketUpdate::None) + .is_err()); + } + + #[test] + fn test_fs_epoll_handler_handle_event() { + let handler = create_fs_epoll_handler("test_1".to_string()); + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_fs_epoll_handler("test_2".to_string()); + + // test for QUEUE_AVAIL_EVENT + let events = Events::with_data(&event_fd, QUEUE_AVAIL_EVENT, event_set); + handler.process(events, &mut event_op); + handler.config.lock().unwrap().queues[0] + .generate_event() + .unwrap(); + handler.process(events, &mut event_op); + + // test for RATE_LIMITER_EVENT + let queues_len = handler.config.lock().unwrap().queues.len() as u32; + let events = Events::with_data(&event_fd, QUEUE_AVAIL_EVENT + queues_len, event_set); + handler.process(events, &mut event_op); + + // test for PATCH_RATE_LIMITER_EVENT + if let Err(e) = handler.patch_rate_limiter_fd.write(1) { + error!( + "{} test: failed to write patch_rate_limiter_fd, {:?}", + VIRTIO_FS_NAME, e + ); + } + let events = Events::with_data(&event_fd, 1 + QUEUE_AVAIL_EVENT + queues_len, event_set); + handler.process(events, &mut event_op); + } + + #[test] + fn test_fs_epoll_handler_handle_unknown_event() { + let handler = create_fs_epoll_handler("test_1".to_string()); + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = 
inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_fs_epoll_handler("test_2".to_string()); + + // test for unknown event + let events = Events::with_data(&event_fd, FS_EVENTS_COUNT + 10, event_set); + handler.process(events, &mut event_op); + } + + #[test] + fn test_fs_epoll_handler_process_queue() { + { + let mut handler = create_fs_epoll_handler("test_1".to_string()); + + let m = &handler.config.lock().unwrap().vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + + handler.config.lock().unwrap().queues = vec![VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + )]; + assert!(handler.process_queue(0).is_ok()); + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/fs/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/fs/mod.rs new file mode 100644 index 000000000..a505bb306 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/fs/mod.rs @@ -0,0 +1,44 @@ +// Copyright 2020 Alibaba Cloud. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +mod handler; +pub(crate) use self::handler::*; +mod device; +pub use self::device::*; + +use std::io::Error as IOError; + +use fuse_backend_rs::transport::Error as FuseTransportError; +use fuse_backend_rs::Error as FuseServerError; +use nix::Error as NixError; + +pub const VIRTIO_FS_NAME: &str = "virtio-fs"; + +/// Error for virtio fs device. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Invalid Virtio descriptor chain. + #[error("invalid descriptorchain: {0}")] + InvalidDescriptorChain(FuseTransportError), + /// Processing queue failed. 
+ #[error("process queue failed: {0}")] + ProcessQueue(FuseServerError), + #[error("invalid data.")] + InvalidData, + /// Failed to attach/detach a backend fs. + #[error("attach/detach a backend filesystem failed:: {0}")] + BackendFs(String), + /// Error from IO error. + #[error("io error: {0}")] + IOError(#[from] IOError), + /// Failed to create memfd + #[error("failed to create memfd: {0}")] + MemFdCreate(NixError), + /// Failed to set file size + #[error("failed to set file size: {0}")] + SetFileSize(IOError), +} + +/// Specialized std::result::Result for Virtio fs device operations. +pub type Result = std::result::Result; diff --git a/src/dragonball/src/dbs_virtio_devices/src/lib.rs b/src/dragonball/src/dbs_virtio_devices/src/lib.rs new file mode 100644 index 000000000..ec5fcdc14 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/lib.rs @@ -0,0 +1,498 @@ +// Copyright 2019-2020 Alibaba Cloud. All rights reserved. +// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Interfaces and implementations of virtio devices. +//! +//! Please refer to [Virtio Specification] +//! (http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-1090002) +//! for more information. 
+ +mod device; +pub use self::device::*; + +mod notifier; +pub use self::notifier::*; + +pub mod epoll_helper; + +#[cfg(feature = "virtio-mmio")] +pub mod mmio; + +#[cfg(feature = "virtio-vsock")] +pub mod vsock; + +#[cfg(feature = "virtio-net")] +pub mod net; + +#[cfg(feature = "virtio-blk")] +pub mod block; + +#[cfg(feature = "virtio-fs")] +pub mod fs; + +#[cfg(feature = "virtio-mem")] +pub mod mem; + +#[cfg(feature = "virtio-balloon")] +pub mod balloon; + +use std::io::Error as IOError; + +use virtio_queue::Error as VqError; +use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemoryError}; + +pub trait DbsGuestAddressSpace: GuestAddressSpace + 'static + Clone + Send + Sync {} + +impl DbsGuestAddressSpace for T where T: GuestAddressSpace + 'static + Clone + Send + Sync {} + +/// Version of virtio specifications supported by PCI virtio devices. +#[allow(non_camel_case_types)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum VirtioVersion { + /// Unknown/non-virtio VFIO device. + VIRTIO_VERSION_UNKNOWN, + /// Virtio specification 0.95(Legacy). + VIRTIO_VERSION_0_95, + /// Virtio specification 1.0/1.1. + VIRTIO_VERSION_1_X, +} + +/// Page size for legacy PCI virtio devices. Assume it's 4K. +pub const VIRTIO_LEGACY_PAGE_SIZE: u32 = 0x1000; + +/// Initial state after device initialization/reset. +pub const DEVICE_INIT: u32 = 0x0; +/// Indicates that the guest OS has found the device and recognized it as a valid virtio device. +pub const DEVICE_ACKNOWLEDGE: u32 = 0x01; +/// Indicates that the guest OS knows how to drive the device. +pub const DEVICE_DRIVER: u32 = 0x02; +/// Indicates that the driver is set up and ready to drive the device. +pub const DEVICE_DRIVER_OK: u32 = 0x04; +/// Indicates that the driver has acknowledged all the features it understands, and feature +/// negotiation is complete. +pub const DEVICE_FEATURES_OK: u32 = 0x08; +/// Indicates that the device has experienced an error from which it can’t recover. 
+pub const DEVICE_NEEDS_RESET: u32 = 0x40; +/// Indicates that something went wrong in the guest, and it has given up on the device. +/// This could be an internal error, or the driver didn’t like the device for some reason, or even +/// a fatal error during device operation. +pub const DEVICE_FAILED: u32 = 0x80; + +/// Virtio network card device. +pub const TYPE_NET: u32 = 1; +/// Virtio block device. +pub const TYPE_BLOCK: u32 = 2; +/// Virtio-rng device. +pub const TYPE_RNG: u32 = 4; +/// Virtio balloon device. +pub const TYPE_BALLOON: u32 = 5; +/// Virtio vsock device. +pub const TYPE_VSOCK: u32 = 19; +/// Virtio mem device. +pub const TYPE_MEM: u32 = 24; +/// Virtio-fs virtual device. +pub const TYPE_VIRTIO_FS: u32 = 26; +/// Virtio-pmem device. +pub const TYPE_PMEM: u32 = 27; + +// Interrupt status flags for legacy interrupts. It happens to be the same for both PCI and MMIO +// virtio devices. +/// Data available in used queue. +pub const VIRTIO_INTR_VRING: u32 = 0x01; +/// Device configuration changed. +pub const VIRTIO_INTR_CONFIG: u32 = 0x02; + +/// Error code for VirtioDevice::activate(). +#[derive(Debug, thiserror::Error)] +pub enum ActivateError { + #[error("Invalid param.")] + InvalidParam, + #[error("Internal error.")] + InternalError, + #[error("Invalid queue config.")] + InvalidQueueConfig, + #[error("IO: {0}.")] + IOError(#[from] IOError), +} + +/// Error code for VirtioDevice::read_config()/write_config(). +#[derive(Debug, thiserror::Error, Eq, PartialEq)] +pub enum ConfigError { + #[error("Invalid offset: {0}.")] + InvalidOffset(u64), + #[error("Offset({0}) plus data length ({0}) overflow.")] + PlusOverflow(u64, u64), + #[error("Invalid offset plus data length: {0}.")] + InvalidOffsetPlusDataLen(u64), +} + +/// Specialized std::result::Result for VirtioDevice::activate(). +pub type ActivateResult = std::result::Result<(), ActivateError>; +/// Specialized std::result::Result for VirtioDevice::read_config()/write_config(). 
+pub type ConfigResult = std::result::Result<(), ConfigError>; + +/// Error for virtio devices to handle requests from guests. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Guest gave us too few descriptors in a descriptor chain. + #[error("not enough descriptors for request.")] + DescriptorChainTooShort, + /// Guest gave us a descriptor that was too short to use. + #[error("descriptor length too small.")] + DescriptorLengthTooSmall, + /// Guest gave us a descriptor that was too big to use. + #[error("descriptor length too big.")] + DescriptorLengthTooBig, + /// Guest gave us a write only descriptor that protocol says to read from. + #[error("unexpected write only descriptor.")] + UnexpectedWriteOnlyDescriptor, + /// Guest gave us a read only descriptor that protocol says to write to. + #[error("unexpected read only descriptor.")] + UnexpectedReadOnlyDescriptor, + /// Invalid input parameter or status. + #[error("invalid input parameter or status.")] + InvalidInput, + /// The requested operation would cause a seek beyond disk end. + #[error("invalid offset.")] + InvalidOffset, + /// Internal unspecific error + #[error("internal unspecific error.")] + InternalError, + /// Device resource doesn't match what requested + #[error("invalid resource.")] + InvalidResource, + /// Generic IO error + #[error("IO: {0}.")] + IOError(#[from] IOError), + /// Error from virtio_queue + #[error("virtio queue error: {0}")] + VirtioQueueError(#[from] VqError), + /// Error from Device activate. + #[error("Device activate error: {0}")] + ActivateError(#[from] ActivateError), + /// Error from Interrupt. + #[error("Interrupt error: {0}")] + InterruptError(IOError), + /// Guest gave us bad memory addresses. + #[error("failed to access guest memory. {0}")] + GuestMemory(GuestMemoryError), + /// Guest gave us an invalid guest memory address. + #[error("invalid guest memory address. {0:?}")] + InvalidGuestAddress(GuestAddress), + /// Failed creating a new MmapRegion instance. 
+ #[error("new mmap region failed: {0}")] + NewMmapRegion(vm_memory::mmap::MmapRegionError), + /// Failed setting kvm user memory region. + #[error("set user memory region failed: {0}")] + SetUserMemoryRegion(kvm_ioctls::Error), + /// Inserting mmap region failed. + #[error("inserting mmap region failed: {0}")] + InsertMmap(vm_memory::mmap::Error), + /// Failed to set madvise on guest memory region. + #[error("failed to set madvice() on guest memory region")] + Madvise(#[source] nix::Error), + + #[cfg(feature = "virtio-vsock")] + #[error("virtio-vsock error: {0}")] + VirtioVsockError(#[from] self::vsock::VsockError), + + #[cfg(feature = "virtio-net")] + #[error("Virtio-net error: {0}")] + VirtioNetError(#[from] crate::net::NetError), + + #[cfg(feature = "virtio-fs")] + /// Error from Virtio fs. + #[error("virtio-fs error: {0}")] + VirtioFs(fs::Error), + + #[cfg(feature = "virtio-mem")] + #[error("Virtio-mem error: {0}")] + VirtioMemError(#[from] mem::MemError), + + #[cfg(feature = "virtio-balloon")] + #[error("Virtio-balloon error: {0}")] + VirtioBalloonError(#[from] balloon::BalloonError), +} + +/// Specialized std::result::Result for Virtio device operations. +pub type Result = std::result::Result; + +#[allow(unused_macros)] +macro_rules! 
warn_or_panic { + ($($arg:tt)*) => { + if cfg!(test) { + panic!($($arg)*) + } else { + log::warn!($($arg)*) + } + } +} +#[allow(unused_imports)] +pub(crate) use warn_or_panic; + +#[cfg(test)] +pub mod tests { + use std::marker::PhantomData; + use std::mem; + use std::sync::Arc; + + use dbs_interrupt::KvmIrqManager; + use kvm_ioctls::{Kvm, VmFd}; + use virtio_queue::{QueueSync, QueueT}; + use vm_memory::{ + Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestUsize, VolatileMemory, + VolatileRef, VolatileSlice, + }; + + pub const VIRTQ_DESC_F_NEXT: u16 = 0x1; + pub const VIRTQ_DESC_F_WRITE: u16 = 0x2; + + pub fn create_vm_and_irq_manager() -> (Arc, Arc) { + let kvm = Kvm::new().unwrap(); + let vmfd = Arc::new(kvm.create_vm().unwrap()); + assert!(vmfd.create_irq_chip().is_ok()); + let irq_manager = Arc::new(KvmIrqManager::new(vmfd.clone())); + assert!(irq_manager.initialize().is_ok()); + + (vmfd, irq_manager) + } + + // Represents a virtio descriptor in guest memory. + pub struct VirtqDesc<'a> { + pub desc: VolatileSlice<'a>, + } + + #[repr(C)] + // Used to calculate field offset + pub struct DescriptorTmp { + addr: vm_memory::Le64, + len: vm_memory::Le32, + flags: vm_memory::Le16, + next: vm_memory::Le16, + } + + macro_rules! 
offset_of { + ($ty:ty, $field:ident) => { + unsafe { + let base = std::mem::MaybeUninit::<$ty>::uninit(); + let base_ptr = base.as_ptr(); + let c = std::ptr::addr_of!((*base_ptr).$field); + (c as usize) - (base_ptr as usize) + } + }; + } + + impl<'a> VirtqDesc<'a> { + fn new(dtable: &'a VolatileSlice<'a>, i: u16) -> Self { + let desc = dtable + .get_slice((i as usize) * Self::dtable_len(1), Self::dtable_len(1)) + .unwrap(); + VirtqDesc { desc } + } + + pub fn addr(&self) -> VolatileRef { + self.desc.get_ref(offset_of!(DescriptorTmp, addr)).unwrap() + } + + pub fn len(&self) -> VolatileRef { + self.desc.get_ref(offset_of!(DescriptorTmp, len)).unwrap() + } + + pub fn flags(&self) -> VolatileRef { + self.desc.get_ref(offset_of!(DescriptorTmp, flags)).unwrap() + } + + pub fn next(&self) -> VolatileRef { + self.desc.get_ref(offset_of!(DescriptorTmp, next)).unwrap() + } + + pub fn set(&self, addr: u64, len: u32, flags: u16, next: u16) { + self.addr().store(addr); + self.len().store(len); + self.flags().store(flags); + self.next().store(next); + } + + fn dtable_len(nelem: u16) -> usize { + 16 * nelem as usize + } + } + + // Represents a virtio queue ring. The only difference between the used and available rings, + // is the ring element type. 
+ pub struct VirtqRing<'a, T> { + pub ring: VolatileSlice<'a>, + pub start: GuestAddress, + pub qsize: u16, + _marker: PhantomData<*const T>, + } + + impl<'a, T> VirtqRing<'a, T> + where + T: vm_memory::ByteValued, + { + fn new( + start: GuestAddress, + mem: &'a GuestMemoryMmap, + qsize: u16, + alignment: GuestUsize, + ) -> Self { + assert_eq!(start.0 & (alignment - 1), 0); + + let (region, addr) = mem.to_region_addr(start).unwrap(); + let size = Self::ring_len(qsize); + let ring = region.get_slice(addr.0 as usize, size).unwrap(); + + let result = VirtqRing { + ring, + start, + qsize, + _marker: PhantomData, + }; + + result.flags().store(0); + result.idx().store(0); + result.event().store(0); + result + } + + pub fn start(&self) -> GuestAddress { + self.start + } + + pub fn end(&self) -> GuestAddress { + self.start.unchecked_add(self.ring.len() as GuestUsize) + } + + pub fn flags(&self) -> VolatileRef { + self.ring.get_ref(0).unwrap() + } + + pub fn idx(&self) -> VolatileRef { + self.ring.get_ref(2).unwrap() + } + + fn ring_offset(i: u16) -> usize { + 4 + mem::size_of::() * (i as usize) + } + + pub fn ring(&self, i: u16) -> VolatileRef { + assert!(i < self.qsize); + self.ring.get_ref(Self::ring_offset(i)).unwrap() + } + + pub fn event(&self) -> VolatileRef { + self.ring.get_ref(Self::ring_offset(self.qsize)).unwrap() + } + + fn ring_len(qsize: u16) -> usize { + Self::ring_offset(qsize) + 2 + } + } + + #[repr(C)] + #[derive(Clone, Copy, Default)] + pub struct VirtqUsedElem { + pub id: u32, + pub len: u32, + } + + unsafe impl vm_memory::ByteValued for VirtqUsedElem {} + + pub type VirtqAvail<'a> = VirtqRing<'a, u16>; + pub type VirtqUsed<'a> = VirtqRing<'a, VirtqUsedElem>; + + trait GuestAddressExt { + fn align_up(&self, x: GuestUsize) -> GuestAddress; + } + impl GuestAddressExt for GuestAddress { + fn align_up(&self, x: GuestUsize) -> GuestAddress { + Self((self.0 + (x - 1)) & !(x - 1)) + } + } + + pub struct VirtQueue<'a> { + pub start: GuestAddress, + pub dtable: 
VolatileSlice<'a>, + pub avail: VirtqAvail<'a>, + pub used: VirtqUsed<'a>, + } + + impl<'a> VirtQueue<'a> { + // We try to make sure things are aligned properly :-s + pub fn new(start: GuestAddress, mem: &'a GuestMemoryMmap, qsize: u16) -> Self { + // power of 2? + assert!(qsize > 0 && qsize & (qsize - 1) == 0); + + let (region, addr) = mem.to_region_addr(start).unwrap(); + let dtable = region + .get_slice(addr.0 as usize, VirtqDesc::dtable_len(qsize)) + .unwrap(); + + const AVAIL_ALIGN: GuestUsize = 2; + + let avail_addr = start + .unchecked_add(VirtqDesc::dtable_len(qsize) as GuestUsize) + .align_up(AVAIL_ALIGN); + let avail = VirtqAvail::new(avail_addr, mem, qsize, AVAIL_ALIGN); + + const USED_ALIGN: GuestUsize = 4; + + let used_addr = avail.end().align_up(USED_ALIGN); + let used = VirtqUsed::new(used_addr, mem, qsize, USED_ALIGN); + + VirtQueue { + start, + dtable, + avail, + used, + } + } + + fn size(&self) -> u16 { + (self.dtable.len() / VirtqDesc::dtable_len(1)) as u16 + } + + pub fn dtable(&self, i: u16) -> VirtqDesc { + VirtqDesc::new(&self.dtable, i) + } + + fn dtable_start(&self) -> GuestAddress { + self.start + } + + fn avail_start(&self) -> GuestAddress { + self.avail.start() + } + + fn used_start(&self) -> GuestAddress { + self.used.start() + } + + // Creates a new QueueSync, using the underlying memory regions represented by the VirtQueue. 
+ pub fn create_queue(&self) -> QueueSync { + let mut q = QueueSync::new(self.size()).unwrap(); + + q.set_size(self.size()); + q.set_ready(true); + let _ = q.lock().try_set_desc_table_address(self.dtable_start()); + let _ = q.lock().try_set_avail_ring_address(self.avail_start()); + let _ = q.lock().try_set_used_ring_address(self.used_start()); + + q + } + + pub fn start(&self) -> GuestAddress { + self.dtable_start() + } + + pub fn end(&self) -> GuestAddress { + self.used.end() + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/mem.rs b/src/dragonball/src/dbs_virtio_devices/src/mem.rs new file mode 100644 index 000000000..d71aa0c40 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/mem.rs @@ -0,0 +1,2061 @@ +// Copyright (C) 2020 Alibaba Cloud Computing. All rights reserved. +// Copyright (c) 2020 Ant Financial +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::any::Any; +use std::cmp; +use std::io::{self, Write}; +use std::marker::PhantomData; +use std::mem::size_of; +use std::ops::Deref; +use std::os::unix::io::RawFd; +use std::sync::{Arc, Mutex}; + +use dbs_device::resources::{DeviceResources, ResourceConstraint}; +use dbs_interrupt::{InterruptNotifier, NoopNotifier}; +use dbs_utils::epoll_manager::{ + EpollManager, EventOps, EventSet, Events, MutEventSubscriber, SubscriberId, +}; +use kvm_ioctls::VmFd; +use log::{debug, error, info, trace, warn}; +use virtio_bindings::bindings::virtio_blk::VIRTIO_F_VERSION_1; +use virtio_queue::{DescriptorChain, QueueOwnedT, QueueSync, QueueT}; +use vm_memory::{ + ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryError, + GuestMemoryRegion, GuestRegionMmap, GuestUsize, MemoryRegionAddress, +}; + +use crate::device::{VirtioDevice, VirtioDeviceConfig, VirtioDeviceInfo}; +use crate::{ + ActivateError, ActivateResult, ConfigResult, DbsGuestAddressSpace, Error, Result, + VirtioSharedMemoryList, TYPE_MEM, +}; + +/// Use 4 MiB alignment because current kernel use it as the subblock_size. +pub const VIRTIO_MEM_DEFAULT_BLOCK_SIZE: u64 = 4 << 20; + +/// The memory block size of guest when initial memory is less than 64GiB. +/// When initial memory is more than 64GiB, the memory block size maybe 1GiB or +/// 2GiB, and the specific algorithm is in +/// `arch/x86/mm/int_64.c:memory_block_size_bytes()`. So if we want to use +/// virtio-mem when initial memory is larger than 64GiB, we should use the +/// algorithm in kernel to get the actual memory block size. +pub const VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT: u64 = 128 * 1024 * 1024; + +const VIRTIO_MEM_MAP_REGION_SHIFT: u64 = 31; +const VIRTIO_MEM_MAP_REGION_SIZE: u64 = 1 << VIRTIO_MEM_MAP_REGION_SHIFT; +const VIRTIO_MEM_MAP_REGION_MASK: u64 = !(std::u64::MAX << VIRTIO_MEM_MAP_REGION_SHIFT); + +/// Max memory block size used in guest kernel. 
+const MAX_MEMORY_BLOCK_SIZE: u64 = 2 << 30; +/// Amount of boot ram to judge whether to use large memory blocks. +const BOOT_MEM_SIZE_FOR_LARGE_BLOCK: u64 = 64 << 30; + +const MEM_DRIVER_NAME: &str = "virtio-mem"; + +const QUEUE_SIZE: u16 = 128; +const NUM_QUEUES: usize = 1; +const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE]; + +// Request processed successfully, applicable for +// - VIRTIO_MEM_REQ_PLUG +// - VIRTIO_MEM_REQ_UNPLUG +// - VIRTIO_MEM_REQ_UNPLUG_ALL +// - VIRTIO_MEM_REQ_STATE +const VIRTIO_MEM_RESP_ACK: u16 = 0; + +// Request denied - e.g. trying to plug more than requested, applicable for +// - VIRTIO_MEM_REQ_PLUG +const VIRTIO_MEM_RESP_NACK: u16 = 1; + +// Request cannot be processed right now, try again later, applicable for +// - VIRTIO_MEM_REQ_PLUG +// - VIRTIO_MEM_REQ_UNPLUG +// - VIRTIO_MEM_REQ_UNPLUG_ALL +// VIRTIO_MEM_RESP_BUSY: u16 = 2; + +// Error in request (e.g. addresses/alignment), applicable for +// - VIRTIO_MEM_REQ_PLUG +// - VIRTIO_MEM_REQ_UNPLUG +// - VIRTIO_MEM_REQ_STATE +const VIRTIO_MEM_RESP_ERROR: u16 = 3; + +// State of memory blocks is "plugged" +const VIRTIO_MEM_STATE_PLUGGED: u16 = 0; +// State of memory blocks is "unplugged" +const VIRTIO_MEM_STATE_UNPLUGGED: u16 = 1; +// State of memory blocks is "mixed" +const VIRTIO_MEM_STATE_MIXED: u16 = 2; + +// request to plug memory blocks +const VIRTIO_MEM_REQ_PLUG: u16 = 0; +// request to unplug memory blocks +const VIRTIO_MEM_REQ_UNPLUG: u16 = 1; +// request to unplug all blocks and shrink the usable size +const VIRTIO_MEM_REQ_UNPLUG_ALL: u16 = 2; +// request information about the plugged state of memory blocks +const VIRTIO_MEM_REQ_STATE: u16 = 3; + +// Virtio features +const VIRTIO_MEM_F_ACPI_PXM: u8 = 0; + +type MapRegions = Arc)>>>; + +type MultiRegions = Option<(MapRegions, Arc>)>; + +#[derive(Debug, thiserror::Error)] +pub enum MemError { + /// Guest gave us bad memory addresses. + #[error("failed to access guest memory. 
{0}")] + GuestMemory(GuestMemoryError), + /// Guest gave us a write only descriptor that protocol says to read from. + #[error("unexpected write only descriptor.")] + UnexpectedWriteOnlyDescriptor, + /// Guest gave us a read only descriptor that protocol says to write to. + #[error("unexpected read only descriptor.")] + UnexpectedReadOnlyDescriptor, + #[error("not enough descriptors for request.")] + /// Guest gave us too few descriptors in a descriptor chain. + DescriptorChainTooShort, + /// Guest gave us a descriptor that was too short to use. + #[error("descriptor length too small.")] + DescriptorLengthTooSmall, + /// Guest sent us invalid request. + #[error("Guest sent us invalid request.")] + InvalidRequest, + /// virtio-mem resize usable region fail + #[error("resize usable region fail: {0}")] + RsizeUsabeRegionFail(String), +} + +/// Specialied std::result::Result for virtio-mem related operations. +pub type MemResult = std::result::Result; + +// Got from qemu/include/standard-headers/linux/virtio_mem.h +// rust union doesn't support std::default::Default that +// need by mem.read_obj. +// Then move virtio_mem_req_plug, virtio_mem_req_unplug and +// virtio_mem_req_state to virtio_mem_req. +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemReq { + req_type: u16, + padding: [u16; 3], + addr: u64, + nb_blocks: u16, +} + +// Safe because it only has data and has no implicit padding. +unsafe impl ByteValued for VirtioMemReq {} + +// Got from qemu/include/standard-headers/linux/virtio_mem.h +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemRespState { + state: u16, +} + +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +struct VirtioMemResp { + resp_type: u16, + padding: [u16; 3], + state: VirtioMemRespState, +} + +// Safe because it only has data and has no implicit padding. 
+unsafe impl ByteValued for VirtioMemResp {} + +// Got from qemu/include/standard-headers/linux/virtio_mem.h +#[repr(C, packed)] +#[derive(Copy, Clone, Debug, Default, PartialEq)] +pub(crate) struct VirtioMemConfig { + /// Block size and alignment. Cannot change. + pub(crate) block_size: u64, + /// Valid with VIRTIO_MEM_F_ACPI_PXM. Cannot change. + pub(crate) node_id: u16, + pub(crate) padding: [u8; 6], + /// Start address of the memory region. Cannot change. + pub(crate) addr: u64, + /// Region size (maximum). Cannot change. + pub(crate) region_size: u64, + /// Currently usable region size. Can grow up to region_size. Can + /// shrink due to VIRTIO_MEM_REQ_UNPLUG_ALL (in which case no config + /// update will be sent). + pub(crate) usable_region_size: u64, + /// Currently used size. Changes due to plug/unplug requests, but no + /// config updates will be sent. + pub(crate) plugged_size: u64, + /// Requested size. New plug requests cannot exceed it. Can change. + pub(crate) requested_size: u64, +} + +// Safe because it only has data and has no implicit padding. +unsafe impl ByteValued for VirtioMemConfig {} + +struct Request { + req: VirtioMemReq, + status_addr: GuestAddress, +} + +impl Request { + fn parse(desc_chain: &mut DescriptorChain<&M>, mem: &M) -> MemResult { + let avail_desc = desc_chain.next().ok_or(MemError::DescriptorChainTooShort)?; + // The head contains the request type which MUST be readable. 
+ if avail_desc.is_write_only() { + return Err(MemError::UnexpectedWriteOnlyDescriptor); + } + if avail_desc.len() as usize != size_of::() { + return Err(MemError::InvalidRequest); + } + let req: VirtioMemReq = mem + .read_obj(avail_desc.addr()) + .map_err(MemError::GuestMemory)?; + + let status_desc = desc_chain.next().ok_or(MemError::DescriptorChainTooShort)?; + + // The status MUST always be writable + if !status_desc.is_write_only() { + return Err(MemError::UnexpectedReadOnlyDescriptor); + } + + if (status_desc.len() as usize) < size_of::() { + return Err(MemError::DescriptorLengthTooSmall); + } + + Ok(Request { + req, + status_addr: status_desc.addr(), + }) + } +} + +struct StateChangeRequest<'a> { + id: &'a str, + config: &'a VirtioMemConfig, + mem_state: &'a mut Vec, + addr: u64, + size: u64, + nb_blocks: u16, + multi_region: bool, + map_regions: MapRegions, + host_fd: Option, + plug: bool, +} + +impl<'a> StateChangeRequest<'a> { + #[allow(clippy::too_many_arguments)] + fn new( + r: &Request, + id: &'a str, + config: &'a VirtioMemConfig, + mem_state: &'a mut Vec, + multi_region: bool, + map_regions: MapRegions, + host_fd: Option, + plug: bool, + ) -> StateChangeRequest<'a> { + let size: u64 = r.req.nb_blocks as u64 * config.block_size; + + StateChangeRequest { + id, + config, + mem_state, + addr: r.req.addr, + size, + nb_blocks: r.req.nb_blocks, + multi_region, + map_regions, + host_fd, + plug, + } + } +} + +/// A hook for the VMM to create memory region for virtio-mem devices. 
+pub trait MemRegionFactory: Send { + fn create_region( + &mut self, + guest_addr: GuestAddress, + region_len: GuestUsize, + kvm_slot: u32, + ) -> std::result::Result, Error>; + + fn restore_region_addr(&self, guest_addr: GuestAddress) -> std::result::Result<*mut u8, Error>; + + fn get_host_numa_node_id(&self) -> Option; + + fn set_host_numa_node_id(&mut self, host_numa_node_id: Option); +} + +struct MemTool {} + +impl MemTool { + fn virtio_mem_valid_range(config: &VirtioMemConfig, addr: u64, size: u64) -> bool { + // address properly aligned? + if addr % config.block_size != 0 || size % config.block_size != 0 { + return false; + } + + // reasonable size + if addr.checked_add(size).is_none() || size == 0 { + return false; + } + + // start address in usable range? + if addr < config.addr || addr >= config.addr + config.usable_region_size { + return false; + } + + // end address in usable range? + if addr + size > config.addr + config.usable_region_size { + return false; + } + + true + } + + fn virtio_mem_check_bitmap( + bit_index: usize, + nb_blocks: u16, + mem_state: &[bool], + plug: bool, + ) -> bool { + for state in mem_state.iter().skip(bit_index).take(nb_blocks as usize) { + if *state != plug { + return false; + } + } + true + } + + fn virtio_mem_set_bitmap(bit_index: usize, nb_blocks: u16, mem_state: &mut [bool], plug: bool) { + for state in mem_state + .iter_mut() + .skip(bit_index) + .take(nb_blocks as usize) + { + *state = plug; + } + } + + fn virtio_mem_state_change_request(r: &mut StateChangeRequest) -> u16 { + if r.plug && (r.config.plugged_size + r.size > r.config.requested_size) { + return VIRTIO_MEM_RESP_NACK; + } + if !MemTool::virtio_mem_valid_range(r.config, r.addr, r.size) { + return VIRTIO_MEM_RESP_ERROR; + } + + let offset = r.addr - r.config.addr; + let bit_index = (offset / r.config.block_size) as usize; + if !MemTool::virtio_mem_check_bitmap(bit_index, r.nb_blocks, r.mem_state, !r.plug) { + return VIRTIO_MEM_RESP_ERROR; + } + + let host_addr 
= if r.multi_region { + // Handle map_region + let map_regions = r.map_regions.lock().unwrap(); + let map_region_index = (offset >> VIRTIO_MEM_MAP_REGION_SHIFT) as usize; + if (offset + r.size - 1) >> VIRTIO_MEM_MAP_REGION_SHIFT != map_region_index as u64 { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: try to change more than one map_region", MEM_DRIVER_NAME, r.id, + ); + return VIRTIO_MEM_RESP_ERROR; + } + if map_region_index >= map_regions.len() { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: map_region index {} is not right {:?}", + MEM_DRIVER_NAME, + r.id, + map_region_index, + map_regions, + ); + return VIRTIO_MEM_RESP_ERROR; + } + + let region_host_addr = if let Some(addr_tuple) = map_regions[map_region_index].1 { + addr_tuple.0 + } else { + error!( + "{}: try to access unmap region offset {} size {}", + MEM_DRIVER_NAME, offset, r.size + ); + return VIRTIO_MEM_RESP_ERROR; + }; + (offset & VIRTIO_MEM_MAP_REGION_MASK) + region_host_addr + } else { + let map_regions = r.map_regions.lock().unwrap(); + if let Some(addr_tuple) = map_regions[0].1 { + addr_tuple.0 + offset + } else { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: try to unplug unmap region", MEM_DRIVER_NAME, r.id + ); + return VIRTIO_MEM_RESP_ERROR; + } + }; + + if !r.plug { + if let Some(fd) = r.host_fd { + let res = unsafe { + libc::fallocate64( + fd, + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + offset as libc::off64_t, + r.size as libc::off64_t, + ) + }; + if res != 0 { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: fallocate64 get error {}", + MEM_DRIVER_NAME, + r.id, + io::Error::last_os_error() + ); + return VIRTIO_MEM_RESP_ERROR; + } + } + let res = unsafe { + libc::madvise( + host_addr as *mut libc::c_void, + r.size as libc::size_t, + libc::MADV_REMOVE, + ) + }; + if res != 0 { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: madvise get error {}", + MEM_DRIVER_NAME, + r.id, + io::Error::last_os_error() + ); + return VIRTIO_MEM_RESP_ERROR; + } + trace!( + target: 
MEM_DRIVER_NAME, + "{}: {}: unplug host_addr {} size {}", + MEM_DRIVER_NAME, + r.id, + host_addr, + r.size, + ); + } else { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: plug host_addr {} size {}", + MEM_DRIVER_NAME, + r.id, + host_addr, + r.size, + ); + } + + MemTool::virtio_mem_set_bitmap(bit_index, r.nb_blocks, r.mem_state, r.plug); + + VIRTIO_MEM_RESP_ACK + } + + #[allow(clippy::too_many_arguments)] + fn virtio_mem_unplug_all( + id: &str, + config: &VirtioMemConfig, + mem_state: &mut Vec, + multi_region: bool, + map_regions: MapRegions, + host_fd: Option, + ) -> u16 { + for x in 0..(config.region_size / config.block_size) as usize { + if mem_state[x] { + let mut request = StateChangeRequest { + id, + config, + addr: config.addr + x as u64 * config.block_size, + size: config.block_size, + nb_blocks: 1, + mem_state, + multi_region, + map_regions: map_regions.clone(), + host_fd, + plug: false, + }; + let resp_type = MemTool::virtio_mem_state_change_request(&mut request); + if resp_type != VIRTIO_MEM_RESP_ACK { + return resp_type; + } + mem_state[x] = false; + } + } + + VIRTIO_MEM_RESP_ACK + } + + fn virtio_mem_state_request( + config: &VirtioMemConfig, + addr: u64, + nb_blocks: u16, + mem_state: &mut [bool], + ) -> (u16, u16) { + let size: u64 = nb_blocks as u64 * config.block_size; + let resp_type = if MemTool::virtio_mem_valid_range(config, addr, size) { + VIRTIO_MEM_RESP_ACK + } else { + VIRTIO_MEM_RESP_ERROR + }; + + let offset = addr - config.addr; + let bit_index = (offset / config.block_size) as usize; + let resp_state = if MemTool::virtio_mem_check_bitmap(bit_index, nb_blocks, mem_state, true) + { + VIRTIO_MEM_STATE_PLUGGED + } else if MemTool::virtio_mem_check_bitmap(bit_index, nb_blocks, mem_state, false) { + VIRTIO_MEM_STATE_UNPLUGGED + } else { + VIRTIO_MEM_STATE_MIXED + }; + + (resp_type, resp_state) + } + + /// The idea of virtio_mem_resize_usable_region is get from QEMU virtio_mem_resize_usable_region + /// use alignment to calculate usable 
extent. + fn virtio_mem_resize_usable_region( + id: &str, + config: &mut VirtioMemConfig, + can_shrink: bool, + alignment: u64, + // map_regions, factory + multi_regions: MultiRegions, + ) -> Result<()> { + let mut newsize = cmp::min(config.region_size, config.requested_size + 2 * alignment); + + /* The usable region size always has to be multiples of the block size. */ + newsize &= !(config.block_size - 1); + + if config.requested_size == 0 { + newsize = 0; + } + + if newsize > config.usable_region_size { + if let Some((map_regions, factory)) = multi_regions { + let mut map_regions = map_regions.lock().unwrap(); + let mut first_index = + (config.usable_region_size >> VIRTIO_MEM_MAP_REGION_SHIFT) as usize; + let mut last_index = (newsize >> VIRTIO_MEM_MAP_REGION_SHIFT) as usize; + if first_index >= map_regions.len() { + first_index = map_regions.len() - 1; + } + if last_index >= map_regions.len() { + last_index = map_regions.len() - 1; + } + // Find the first unmap index + let mut first_unmap_index = None; + for index in first_index..last_index + 1 { + if map_regions[index].1.is_none() { + first_unmap_index = Some(index); + break; + } + } + if let Some(first_index) = first_unmap_index { + let regions_num = (last_index - first_index + 1) as u64; + // Setup a new map region + let mut guest_addr = + config.addr + ((first_index as u64) << VIRTIO_MEM_MAP_REGION_SHIFT); + let region_len = ((regions_num - 1) << VIRTIO_MEM_MAP_REGION_SHIFT) + + if last_index + 1 == map_regions.len() { + config.region_size + - ((last_index as u64) << VIRTIO_MEM_MAP_REGION_SHIFT) + } else { + VIRTIO_MEM_MAP_REGION_SIZE + }; + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: try to get new map_region index {}-{} guest_addr 0x{:x} len 0x{:x} slot {}", + MEM_DRIVER_NAME, + id, + first_index, + last_index, + guest_addr, + region_len, + map_regions[first_index].0, + ); + let region = factory.lock().unwrap().create_region( + GuestAddress(guest_addr), + region_len, + map_regions[first_index].0, + )?; 
+ let mut host_addr = region + .get_host_address(MemoryRegionAddress(0)) + .map_err(|e| MemError::RsizeUsabeRegionFail(format!("{:?}", e)))? + as u64; + info!(target: MEM_DRIVER_NAME, + "{}: {}: new map_region index {}-{} new region guest_addr 0x{:x}-0x{:x} host_addr 0x{:x} len 0x{:x}", + MEM_DRIVER_NAME, id, first_index, last_index, guest_addr, guest_addr + region_len, host_addr, region_len); + for index in first_index..last_index + 1 { + map_regions[index].1 = Some((host_addr, guest_addr)); + host_addr += VIRTIO_MEM_MAP_REGION_SIZE; + guest_addr += VIRTIO_MEM_MAP_REGION_SIZE; + } + } + } + } + if newsize < config.usable_region_size && !can_shrink { + return Ok(()); + } + + let oldsize = config.usable_region_size; + info!( + target: MEM_DRIVER_NAME, + "{}: {}: virtio_mem_resize_usable_region {:?} {:?}", + MEM_DRIVER_NAME, + id, + oldsize, + newsize + ); + config.usable_region_size = newsize; + + Ok(()) + } +} + +pub(crate) struct MemEpollHandler< + AS: GuestAddressSpace, + Q: QueueT + Send = QueueSync, + R: GuestMemoryRegion = GuestRegionMmap, +> { + pub(crate) config: VirtioDeviceConfig, + mem_config: Arc>, + pub(crate) multi_region: bool, + // kvm_slot, Option(host_addr, guest_addr) + pub(crate) map_regions: MapRegions, + host_fd: Option, + pub(crate) mem_state: Vec, + id: String, +} + +impl MemEpollHandler { + fn process_queue(&mut self, queue_index: usize) -> bool { + // Do not expect poisoned lock. + let config = &mut self.mem_config.lock().unwrap(); + let conf = &mut self.config; + let guard = conf.lock_guest_memory(); + let mem = guard.deref(); + let queue = &mut conf.queues[queue_index]; + let mut guard = queue.queue_mut().lock(); + let mut used_desc_heads = Vec::with_capacity(QUEUE_SIZE as usize); + + let mut iter = match guard.iter(mem) { + Err(e) => { + error!( + "{}: {}: failed to process queue. 
{}", + MEM_DRIVER_NAME, self.id, e + ); + return false; + } + Ok(iter) => iter, + }; + + for mut avail_desc in &mut iter { + let len = match Request::parse(&mut avail_desc, mem) { + Err(e) => { + debug!( + target: MEM_DRIVER_NAME, + "{}: {}: failed parse VirtioMemReq, {:?}", MEM_DRIVER_NAME, self.id, e + ); + 0 + } + Ok(r) => match r.req.req_type { + VIRTIO_MEM_REQ_PLUG => { + let mut request = StateChangeRequest::new( + &r, + &self.id, + config, + &mut self.mem_state, + self.multi_region, + self.map_regions.clone(), + self.host_fd, + true, + ); + let resp_type = MemTool::virtio_mem_state_change_request(&mut request); + let size = request.size; + drop(request); + if resp_type == VIRTIO_MEM_RESP_ACK { + config.plugged_size += size; + let new_plugged_size = config.plugged_size; + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: process_queue VIRTIO_MEM_REQ_PLUG {:?} plugged_size {:?}", + MEM_DRIVER_NAME, + self.id, + size, + new_plugged_size + ); + } + Self::send_response(&self.id, mem, r.status_addr, resp_type, 0) + } + VIRTIO_MEM_REQ_UNPLUG => { + let mut request = StateChangeRequest::new( + &r, + &self.id, + config, + &mut self.mem_state, + self.multi_region, + self.map_regions.clone(), + self.host_fd, + false, + ); + let resp_type = MemTool::virtio_mem_state_change_request(&mut request); + let size = request.size; + drop(request); + if resp_type == VIRTIO_MEM_RESP_ACK { + config.plugged_size -= size; + let new_plugged_size = config.plugged_size; + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: process_queue VIRTIO_MEM_REQ_UNPLUG {:?} plugged_size {:?}", + MEM_DRIVER_NAME, self.id, size, new_plugged_size + ); + } + Self::send_response(&self.id, mem, r.status_addr, resp_type, 0) + } + VIRTIO_MEM_REQ_UNPLUG_ALL => { + let resp_type = MemTool::virtio_mem_unplug_all( + &self.id, + config, + &mut self.mem_state, + self.multi_region, + self.map_regions.clone(), + self.host_fd, + ); + if resp_type == VIRTIO_MEM_RESP_ACK { + config.plugged_size = 0; + /* Does not call 
MemTool::virtio_mem_resize_usable_region because current doesn't support unmap region. */ + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: process_queue VIRTIO_MEM_REQ_UNPLUG_ALL", + MEM_DRIVER_NAME, + self.id, + ); + } + Self::send_response(&self.id, mem, r.status_addr, resp_type, 0) + } + VIRTIO_MEM_REQ_STATE => { + let (resp_type, resp_state) = MemTool::virtio_mem_state_request( + config, + r.req.addr, + r.req.nb_blocks, + &mut self.mem_state, + ); + Self::send_response(&self.id, mem, r.status_addr, resp_type, resp_state) + } + _ => { + debug!( + target: MEM_DRIVER_NAME, + "{}: {}: VirtioMemReq unknown request type {:?}", + MEM_DRIVER_NAME, + self.id, + r.req.req_type + ); + 0 + } + }, + }; + + used_desc_heads.push((avail_desc.head_index(), len)); + } + + drop(guard); + + for &(desc_index, len) in &used_desc_heads { + queue.add_used(mem, desc_index, len); + } + + !used_desc_heads.is_empty() + } + + fn send_response( + id: &str, + mem: &AS::M, + status_addr: GuestAddress, + resp_type: u16, + state: u16, + ) -> u32 { + let mut resp = VirtioMemResp { + resp_type, + ..VirtioMemResp::default() + }; + resp.state.state = state; + match mem.write_obj(resp, status_addr) { + Ok(_) => size_of::() as u32, + Err(e) => { + debug!( + target: MEM_DRIVER_NAME, + "{}: {}: bad guest memory address, {}", MEM_DRIVER_NAME, id, e + ); + 0 + } + } + } +} + +impl MutEventSubscriber + for MemEpollHandler +{ + fn process(&mut self, events: Events, _ops: &mut EventOps) { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: MemEpollHandler::process()", + MEM_DRIVER_NAME, + self.id + ); + + let idx = events.data() as usize; + if idx >= self.config.queues.len() { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: invalid queue index {}", MEM_DRIVER_NAME, self.id, idx + ); + return; + } + + if let Err(e) = self.config.queues[idx].consume_event() { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: failed to get queue event, {:?}", MEM_DRIVER_NAME, self.id, e + ); + } else if self.process_queue(idx) { + if 
let Err(e) = self.config.queues[idx].notify() { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: failed to signal used queue, {}", MEM_DRIVER_NAME, self.id, e + ); + } + } + } + + fn init(&mut self, ops: &mut EventOps) { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: MemEpollHandler::init()", + MEM_DRIVER_NAME, + self.id + ); + + for (idx, queue) in self.config.queues.iter().enumerate() { + ops.add(Events::with_data( + queue.eventfd.as_ref(), + idx as u32, + EventSet::IN, + )) + .unwrap_or_else(|_| { + panic!( + "{}: {}: failed to register queue event handler", + MEM_DRIVER_NAME, self.id + ) + }); + } + } +} + +fn get_map_regions_num(region_size: u64) -> usize { + ((region_size >> VIRTIO_MEM_MAP_REGION_SHIFT) + + u64::from(region_size & VIRTIO_MEM_MAP_REGION_MASK > 0)) as usize +} + +/// Virtio device for exposing memory hotplug to the guest OS through virtio. +pub struct Mem { + pub(crate) device_info: VirtioDeviceInfo, + config: Arc>, + capacity: u64, + factory: Arc>, + host_fd: Option, + device_change_notifier: Arc, + subscriber_id: Option, + id: String, + phantom: PhantomData, + alignment: u64, + // used for liveupgrade to record the memory state map in epoll handler + mem_state_map: Option>, + multi_region: bool, + // kvm_slot, Option(host_addr, guest_addr) + map_regions: MapRegions, +} + +impl Mem { + /// Create a new virtio-mem device. 
+ #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + mut capacity: u64, + requested_size_mib: u64, + mut multi_region: bool, + numa_node_id: Option, + epoll_mgr: EpollManager, + factory: Arc>, + boot_mem_byte: u64, + ) -> Result { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: Mem::new()", + MEM_DRIVER_NAME, + id + ); + + let mut avail_features = 1u64 << VIRTIO_F_VERSION_1 as u64; + + // calculate alignment depending on boot memory size + // algorithm is from kernel (arch/x86/mm/init_64.c: probe_memory_block_size()) + let alignment = { + if boot_mem_byte < BOOT_MEM_SIZE_FOR_LARGE_BLOCK { + VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT + } else { + let mut bz = MAX_MEMORY_BLOCK_SIZE; + while bz > VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT { + if boot_mem_byte & (bz - 1) == 0 { + break; + } + bz >>= 1 + } + bz + } + }; + + // Align to 2 * alignment (256MB when boot mem size < 64G). + capacity = capacity * 1024 * 1024; + let usable_extent = 2 * alignment; + capacity = (capacity + usable_extent - 1) & !(usable_extent - 1); + let requested_size = requested_size_mib * 1024 * 1024; + if capacity == 0 + || requested_size > capacity + || requested_size % VIRTIO_MEM_DEFAULT_BLOCK_SIZE != 0 + { + return Err(Error::InvalidInput); + } + + let mut config = VirtioMemConfig::default(); + if let Some(node_id) = numa_node_id { + avail_features |= 1u64 << VIRTIO_MEM_F_ACPI_PXM; + config.node_id = node_id; + } + config.block_size = VIRTIO_MEM_DEFAULT_BLOCK_SIZE; + config.region_size = capacity; + config.requested_size = requested_size; + //config.usable_region_size will be setup in set_resource through virtio_mem_resize_usable_region + + if config.region_size <= VIRTIO_MEM_MAP_REGION_SIZE { + multi_region = false; + } + + // For warning unaligned_references + // adding curly braces means that a copy of the field is made, stored + // in a (properly aligned) temporary, and a reference to that temporary + // is being formatted. 
+ info!(target: MEM_DRIVER_NAME, "{}: {}: new block_size: 0x{:x} region_size: 0x{:x} requested_size: 0x{:x} usable_region_size: 0x{:x} multi_region: {} numa_node_id: {:?}", + MEM_DRIVER_NAME, id, {config.block_size}, {config.region_size}, {config.requested_size}, {config.usable_region_size}, multi_region, numa_node_id); + + let device_info = VirtioDeviceInfo::new( + MEM_DRIVER_NAME.to_string(), + avail_features, + Arc::new(vec![QUEUE_SIZE; NUM_QUEUES]), + config.as_slice().to_vec(), + epoll_mgr, + ); + + Ok(Mem { + device_info, + config: Arc::new(Mutex::new(config)), + capacity, + factory, + device_change_notifier: Arc::new(NoopNotifier::new()), + host_fd: None, + subscriber_id: None, + id, + phantom: PhantomData, + alignment, + mem_state_map: None, + multi_region, + map_regions: Arc::new(Mutex::new(Vec::new())), + }) + } + + /// Set requested size of the memory device. + pub fn set_requested_size(&self, requested_size_mb: u64) -> Result<()> { + // Align to 4MB. + let requested_size = requested_size_mb * 1024 * 1024; + if requested_size > self.capacity || requested_size % VIRTIO_MEM_DEFAULT_BLOCK_SIZE != 0 { + return Err(Error::InvalidInput); + } + + let mem_config = &mut self.config.lock().unwrap(); + /* + * QEMU set config.requested_size after call + * virtio_mem_resize_usable_region. + * But virtio_mem_resize_usable_region of QEMU use new size as + * the requested_size. + * So this part should set requested_size before call + * MemTool::virtio_mem_resize_usable_region. + * Then MemTool::virtio_mem_resize_usable_region will get the new size + * from mem_config.requested_size. 
+ */ + info!( + target: MEM_DRIVER_NAME, + "{}: {}: set_requested_size {} Mib", MEM_DRIVER_NAME, self.id, requested_size_mb + ); + mem_config.requested_size = requested_size; + MemTool::virtio_mem_resize_usable_region( + &self.id, + mem_config, + false, + self.alignment, + if self.multi_region { + Some((self.map_regions.clone(), self.factory.clone())) + } else { + None + }, + )?; + if let Err(e) = self.device_change_notifier.notify() { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: failed to signal device change event: {}", MEM_DRIVER_NAME, self.id, e + ); + return Err(Error::IOError(e)); + } + + Ok(()) + } +} + +impl VirtioDevice for Mem +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Sync + Send + 'static, +{ + fn device_type(&self) -> u32 { + TYPE_MEM + } + + fn queue_max_sizes(&self) -> &[u16] { + QUEUE_SIZES + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: VirtioDevice::set_acked_features({}, 0x{:x})", + MEM_DRIVER_NAME, + self.id, + page, + value + ); + + self.device_info.set_acked_features(page, value) + } + + fn read_config(&mut self, offset: u64, mut data: &mut [u8]) -> ConfigResult { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: VirtioDevice::read_config(0x{:x}, {:?})", + MEM_DRIVER_NAME, + self.id, + offset, + data + ); + + // Do not expect poisoned lock. 
+ let mem_config = self.config.lock().unwrap(); + let config_space = mem_config.as_slice().to_vec(); + let config_len = config_space.len() as u64; + + if offset >= config_len { + debug!( + target: MEM_DRIVER_NAME, + "{}: {}: config space read request out of range, offset {}", + MEM_DRIVER_NAME, + self.id, + offset + ); + } else if let Some(end) = offset.checked_add(data.len() as u64) { + let end = cmp::min(end, config_len) as usize; + // This write can't fail, offset and end are checked against config_len. + let _ = data.write(&config_space[offset as usize..end]).unwrap(); + } + Ok(()) + } + + fn write_config(&mut self, _offset: u64, _data: &[u8]) -> ConfigResult { + debug!( + target: MEM_DRIVER_NAME, + "{}: {}: device configuration is read-only", MEM_DRIVER_NAME, self.id + ); + Ok(()) + } + + fn activate(&mut self, config: VirtioDeviceConfig) -> ActivateResult { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: VirtioDevice::activate()", + MEM_DRIVER_NAME, + self.id + ); + + // Do not support control queue and multi queue. 
+ if config.queues.len() != 1 { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: failed to activate, invalid queue_num {}.", + MEM_DRIVER_NAME, + self.id, + config.queues.len() + ); + return Err(ActivateError::InvalidParam); + } + self.device_info.check_queue_sizes(&config.queues)?; + + self.device_change_notifier = config.device_change_notifier.clone(); + + // Do not expect poisoned lock + let mem_config = self.config.lock().unwrap(); + + let slot_num = if self.multi_region { + get_map_regions_num(mem_config.region_size) + } else { + 1 + }; + + let map_regions_len = self.map_regions.lock().unwrap().len(); + if map_regions_len != slot_num { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: map_region.len {}, slot_num {}", + MEM_DRIVER_NAME, + self.id, + map_regions_len, + slot_num + ); + return Err(ActivateError::InternalError); + } + + let mem_state = self.mem_state_map.take().unwrap_or_else(|| { + vec![false; mem_config.region_size as usize / mem_config.block_size as usize] + }); + + let handler = Box::new(MemEpollHandler { + config, + mem_config: self.config.clone(), + multi_region: self.multi_region, + map_regions: self.map_regions.clone(), + host_fd: self.host_fd, + mem_state, + id: self.id.clone(), + }); + + self.subscriber_id = Some(self.device_info.register_event_handler(handler)); + + Ok(()) + } + + fn remove(&mut self) { + if let Some(subscriber_id) = self.subscriber_id { + // Remove MemEpollHandler from event manager, so it could be dropped and the resources + // could be freed. 
+ match self.device_info.remove_event_handler(subscriber_id) { + Ok(_) => debug!("virtio-mem: removed subscriber_id {:?}", subscriber_id), + Err(e) => { + warn!("virtio-mem: failed to remove event handler: {:?}", e); + } + } + } + self.subscriber_id = None; + } + + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, + ) { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: VirtioDevice::get_resource_requirements()", + MEM_DRIVER_NAME, + self.id + ); + + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + if use_generic_irq { + // Allocate one irq for device configuration change events, and one irq for each queue. + requests.push(ResourceConstraint::GenericIrq { + size: (self.device_info.queue_sizes.len() + 1) as u32, + }); + } + + // Do not expect poisoned lock. + let config = self.config.lock().unwrap(); + + // The memory needs to be 2MiB aligned in order to support huge pages. + // And we also need to align the memory's start address to guest's + // memory block size (usually 128MB), or the virtio-mem driver in guest + // kernel would cause some memory unusable which outside the alignment. + // Then, the memory needs to be above 4G to avoid conflicts with + // lapic/ioapic devices. + requests.push(ResourceConstraint::MemAddress { + range: None, + align: self.alignment, + size: config.region_size, + }); + + // Request for new kvm memory slot. + let slot_num = if self.multi_region { + get_map_regions_num(config.region_size) + } else { + 1 + }; + for _ in 0..slot_num { + requests.push(ResourceConstraint::KvmMemSlot { + slot: None, + size: 1, + }); + } + } + + fn set_resource( + &mut self, + _vm_fd: Arc, + resource: DeviceResources, + ) -> Result>> { + trace!( + target: MEM_DRIVER_NAME, + "{}: {}: VirtioDevice::set_resource()", + MEM_DRIVER_NAME, + self.id + ); + + let mem_res = resource.get_mem_address_ranges(); + let slot_res = resource.get_kvm_mem_slots(); + + // Check if we get memory resource. 
+ if mem_res.is_empty() { + return Err(Error::InvalidResource); + } + + let mut mem_config = self.config.lock().unwrap(); + + let slot_num = if self.multi_region { + get_map_regions_num(mem_config.region_size) + } else { + 1 + }; + + // Make sure we have the correct resource as requested. + if slot_res.len() != slot_num + || mem_res.len() != 1 + || mem_res[0].1 != mem_config.region_size + { + error!( + target: MEM_DRIVER_NAME, + "{}: {}: wrong mem or kvm slot resource ({:?}, {:?})", + MEM_DRIVER_NAME, + self.id, + mem_res.len(), + slot_res.len() + ); + return Err(Error::InvalidResource); + } + + // update mem config's addr + mem_config.addr = mem_res[0].0; + + // Setup map_regions + let mut map_regions = self.map_regions.lock().unwrap(); + if map_regions.is_empty() { + if self.multi_region { + for slot in slot_res { + map_regions.push((slot, None)); + } + } else { + let region = self.factory.lock().unwrap().create_region( + GuestAddress(mem_config.addr), + mem_config.region_size, + slot_res[0], + )?; + let addr = region.get_host_address(MemoryRegionAddress(0)).unwrap() as u64; + map_regions.push((slot_res[0], Some((addr, mem_config.addr)))); + let guest_addr = mem_config.addr; + let size = mem_config.region_size; + info!( + "{}: {}: set_resource new region guest addr 0x{:x}-0x{:x} host addr 0x{:x} size {}", + MEM_DRIVER_NAME, + self.id, + guest_addr, + guest_addr + size, + addr, + size, + ); + } + } + drop(map_regions); + + MemTool::virtio_mem_resize_usable_region( + &self.id, + &mut mem_config, + false, + self.alignment, + if self.multi_region { + Some((self.map_regions.clone(), self.factory.clone())) + } else { + None + }, + )?; + + Ok(None) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::ffi::CString; + use std::fs::File; + use std::os::unix::io::FromRawFd; + + use dbs_device::resources::DeviceResources; + use dbs_interrupt::NoopNotifier; + use 
dbs_utils::epoll_manager::SubscriberOps; + use kvm_ioctls::Kvm; + use nix::sys::memfd; + use virtio_queue::QueueSync; + use vm_memory::{ + FileOffset, GuestAddress, GuestMemoryMmap, GuestRegionMmap, GuestUsize, MmapRegion, + }; + use vmm_sys_util::eventfd::EventFd; + + use super::*; + use crate::tests::{VirtQueue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; + use crate::VirtioQueueConfig; + + struct DummyMemRegionFactory {} + + impl MemRegionFactory for DummyMemRegionFactory { + fn create_region( + &mut self, + guest_addr: GuestAddress, + region_len: GuestUsize, + _kvm_slot: u32, + ) -> std::result::Result, Error> { + let file_offset = { + let fd = memfd::memfd_create( + // safe to unwrap, no nul byte in file name + &CString::new("virtio_fs_mem").unwrap(), + memfd::MemFdCreateFlag::empty(), + ) + .map_err(|_| Error::InvalidInput)?; + let file: File = unsafe { File::from_raw_fd(fd) }; + file.set_len(region_len).map_err(|_| Error::InvalidInput)?; + Some(FileOffset::new(file, 0)) + }; + + // unmap will be handled on MmapRegion'd Drop. 
+ let mmap_region = MmapRegion::build( + file_offset, + region_len as usize, + libc::PROT_NONE, + libc::MAP_ANONYMOUS | libc::MAP_NORESERVE | libc::MAP_PRIVATE, + ) + .map_err(Error::NewMmapRegion)?; + + let region = + Arc::new(GuestRegionMmap::new(mmap_region, guest_addr).map_err(Error::InsertMmap)?); + + Ok(region) + } + + fn restore_region_addr( + &self, + _guest_addr: GuestAddress, + ) -> std::result::Result<*mut u8, Error> { + Err(Error::InvalidInput) + } + + fn get_host_numa_node_id(&self) -> Option { + None + } + + fn set_host_numa_node_id(&mut self, _host_numa_node_id: Option) {} + } + + fn create_mem_epoll_handler(id: String) -> MemEpollHandler> { + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0x0), 0x10000)]).unwrap()); + let queues = vec![VirtioQueueConfig::create(256, 0).unwrap()]; + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::new( + mem, + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + let mem_config = Arc::new(Mutex::new(VirtioMemConfig::default())); + let map_regions = vec![(0, Some((0, 0)))]; + MemEpollHandler { + config, + mem_config, + multi_region: false, + map_regions: Arc::new(Mutex::new(map_regions)), + host_fd: None, + mem_state: Vec::new(), + id, + } + } + + #[test] + fn test_mem_request_parse() { + let m = &GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + + assert!(vq.end().0 < 0x1000); + + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + // write only request type descriptor + { + let mut queue = vq.create_queue(); + let mut q = queue.lock(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_WRITE, 1); + m.write_obj::(114, GuestAddress(0x1000 + 8)).unwrap(); + assert!(matches!( + Request::parse(&mut q.iter(m).unwrap().next().unwrap(), m), + Err(MemError::UnexpectedWriteOnlyDescriptor) + )); + } + // 
desc len error + { + let mut queue = vq.create_queue(); + let mut q = queue.lock(); + vq.dtable(0).flags().store(0); + m.write_obj::(114, GuestAddress(0x1000 + 8)).unwrap(); + assert!(matches!( + Request::parse(&mut q.iter(m).unwrap().next().unwrap(), m), + Err(MemError::InvalidRequest) + )); + } + // desc chain too short + { + let mut queue = vq.create_queue(); + let mut q = queue.lock(); + vq.dtable(0).flags().store(0); + vq.dtable(0).set(0x1000, 0x18, 0, 1); + assert!(matches!( + Request::parse(&mut q.iter(m).unwrap().next().unwrap(), m), + Err(MemError::DescriptorChainTooShort) + )); + } + // unexpected read only descriptor + { + let mut queue = vq.create_queue(); + let mut q = queue.lock(); + vq.dtable(0).set(0x1000, 0x18, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1).set(0x2000, 0x18, VIRTQ_DESC_F_NEXT, 2); + assert!(matches!( + Request::parse(&mut q.iter(m).unwrap().next().unwrap(), m), + Err(MemError::UnexpectedReadOnlyDescriptor) + )); + } + // desc len too short + { + let mut queue = vq.create_queue(); + let mut q = queue.lock(); + vq.dtable(0).set(0x1000, 0x18, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1).set(0x2000, 0x9, VIRTQ_DESC_F_WRITE, 2); + assert!(matches!( + Request::parse(&mut q.iter(m).unwrap().next().unwrap(), m), + Err(MemError::DescriptorLengthTooSmall) + )); + } + // success + { + let mut queue = vq.create_queue(); + let mut q = queue.lock(); + vq.dtable(0).set(0x1000, 0x18, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1).set(0x2000, 0x18, VIRTQ_DESC_F_WRITE, 2); + assert!(Request::parse(&mut q.iter(m).unwrap().next().unwrap(), m).is_ok()); + } + } + + #[test] + fn test_mem_tool_valid_range() { + let config = VirtioMemConfig { + block_size: 0x100, + addr: 0x1000, + usable_region_size: 0x1000, + ..Default::default() + }; + + // address not properly aligned. + assert!(!MemTool::virtio_mem_valid_range(&config, 0x14, 0x100)); + assert!(!MemTool::virtio_mem_valid_range(&config, 0x100, 5)); + + // unreasonable size. 
+ assert!(!MemTool::virtio_mem_valid_range( + &config, + 0x1000, + i32::MAX as u64 + )); + assert!(!MemTool::virtio_mem_valid_range(&config, 0x1000, 0)); + + // start address not in usable range. + assert!(!MemTool::virtio_mem_valid_range(&config, 0x200, 0x200)); + assert!(!MemTool::virtio_mem_valid_range(&config, 0x3000, 0x200),); + + // end address not in usable range. + assert!(!MemTool::virtio_mem_valid_range(&config, 0x1000, 0x2000),); + + // success + assert!(MemTool::virtio_mem_valid_range(&config, 0x1000, 0x500),); + } + + #[test] + fn test_mem_tool_check_bitmap() { + let bit_index = 2; + let nb_blocks = 2; + let mut mem_state = [false, false, false, false]; + let plug = false; + + // true + assert!(MemTool::virtio_mem_check_bitmap( + bit_index, nb_blocks, &mem_state, plug + ),); + + mem_state[2] = true; + // false + assert!(!MemTool::virtio_mem_check_bitmap( + bit_index, nb_blocks, &mem_state, plug + ),); + } + + #[test] + fn test_mem_tool_set_bitmap() { + let bit_index = 2; + let nb_blocks = 2; + let mut mem_state = vec![false, false, false, false]; + let plug = true; + + MemTool::virtio_mem_set_bitmap(bit_index, nb_blocks, &mut mem_state, plug); + assert!(mem_state[2]); + assert!(mem_state[3]); + } + + #[test] + fn test_mem_tool_state_request() { + let config = VirtioMemConfig { + block_size: 0x100, + addr: 0x1000, + usable_region_size: 0x1000, + ..Default::default() + }; + let mut mem_state = vec![false, false, false, false]; + + // invalid range. + let (resp_type, resp_state) = + MemTool::virtio_mem_state_request(&config, 0x2000, 0, &mut mem_state); + assert_eq!(resp_type, VIRTIO_MEM_RESP_ERROR); + assert_eq!(resp_state, VIRTIO_MEM_STATE_PLUGGED); + + // valid range & unplugged. + let (resp_type, resp_state) = + MemTool::virtio_mem_state_request(&config, 0x1200, 2, &mut mem_state); + assert_eq!(resp_type, VIRTIO_MEM_RESP_ACK); + assert_eq!(resp_state, VIRTIO_MEM_STATE_UNPLUGGED); + + // mixed mem state. 
+ mem_state = vec![false, false, true, false]; + let (resp_type, resp_state) = + MemTool::virtio_mem_state_request(&config, 0x1200, 2, &mut mem_state); + assert_eq!(resp_type, VIRTIO_MEM_RESP_ACK); + assert_eq!(resp_state, VIRTIO_MEM_STATE_MIXED); + + // plugged. + mem_state = vec![true, true, true, true]; + let (resp_type, resp_state) = + MemTool::virtio_mem_state_request(&config, 0x1200, 2, &mut mem_state); + assert_eq!(resp_type, VIRTIO_MEM_RESP_ACK); + assert_eq!(resp_state, VIRTIO_MEM_STATE_PLUGGED); + } + + #[test] + fn test_mem_tool_resize_usable_region() { + use std::ptr::{addr_of, read_unaligned}; + + let mut config = VirtioMemConfig { + region_size: 0x200, + block_size: 0x100, + usable_region_size: 0x1000, + requested_size: 0, + ..Default::default() + }; + + let id = "mem0".to_string(); + + // unshrink. + MemTool::virtio_mem_resize_usable_region( + &id, + &mut config, + false, + VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT, + None, + ) + .unwrap(); + assert_eq!( + unsafe { read_unaligned(addr_of!(config.usable_region_size)) }, + 0x1000 + ); + + // request size is 0. + MemTool::virtio_mem_resize_usable_region( + &id, + &mut config, + true, + VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT, + None, + ) + .unwrap(); + assert_eq!( + unsafe { read_unaligned(addr_of!(config.usable_region_size)) }, + 0 + ); + + // shrink. + config.requested_size = 0x5; + MemTool::virtio_mem_resize_usable_region( + &id, + &mut config, + true, + VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT, + None, + ) + .unwrap(); + assert_eq!( + unsafe { read_unaligned(addr_of!(config.usable_region_size)) }, + 0x200 + ); + + // test alignment + config.region_size = 2 << 30; + config.requested_size = 1 << 30; + // alignment unchanged. + MemTool::virtio_mem_resize_usable_region( + &id, + &mut config, + true, + VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT, + None, + ) + .unwrap(); + assert_eq!( + unsafe { read_unaligned(addr_of!(config.usable_region_size)) }, + (1 << 30) + 2 * VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT + ); + // alignemnt changed. 
+ MemTool::virtio_mem_resize_usable_region( + &id, + &mut config, + true, + MAX_MEMORY_BLOCK_SIZE, + None, + ) + .unwrap(); + assert_eq!( + unsafe { read_unaligned(addr_of!(config.usable_region_size)) }, + 2 << 30 + ); + } + + #[test] + fn test_mem_virtio_device_normal() { + let epoll_mgr = EpollManager::default(); + let id = "mem0".to_string(); + let factory = Arc::new(Mutex::new(DummyMemRegionFactory {})); + let mut dev = + Mem::>::new(id, 200, 200, false, None, epoll_mgr, factory, 200) + .unwrap(); + + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::device_type(&dev), + TYPE_MEM + ); + let queue_size = vec![128]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::queue_max_sizes( + &dev + ), + &queue_size[..] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 0), + dev.device_info.get_avail_features(0) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 1), + dev.device_info.get_avail_features(1) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + dev.device_info.get_avail_features(2) + ); + VirtioDevice::>, QueueSync, GuestRegionMmap>::set_acked_features( + &mut dev, 2, 0, + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + 0, + ); + + let mut data: [u8; 8] = [1; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut dev, 0, &mut data, + ) + .unwrap(); + let config: [u8; 8] = [0; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::write_config( + &mut dev, 0, &config, + ) + .unwrap(); + let mut data2: [u8; 8] = [1; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut dev, 0, &mut data2, + ) + .unwrap(); + assert_eq!(data, data2); + } + + #[test] + fn test_mem_virtio_device_get_resource_requirements() { + let epoll_mgr = EpollManager::default(); + let id = "mem0".to_string(); + let factory = Arc::new(Mutex::new(DummyMemRegionFactory 
{})); + let dev = Mem::>::new( + id, 0x100, 0x100, false, None, epoll_mgr, factory, 0xc0000000, + ) + .unwrap(); + let mut requirements = vec![ + ResourceConstraint::new_mmio(0x1000), + ResourceConstraint::new_mmio(0x1000), + ]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_resource_requirements( + &dev, &mut requirements, true, + ); + assert_eq!(requirements[2], ResourceConstraint::LegacyIrq { irq: None }); + assert_eq!(requirements[3], ResourceConstraint::GenericIrq { size: 2 }); + assert_eq!( + requirements[4], + ResourceConstraint::MemAddress { + range: None, + align: VIRTIO_MEM_DEFAULT_BLOCK_ALIGNMENT, + size: 0x100 << 20, + } + ); + assert_eq!( + requirements[5], + ResourceConstraint::KvmMemSlot { + slot: None, + size: 1 + } + ); + } + + #[test] + fn test_mem_virtio_device_set_resource() { + let epoll_mgr = EpollManager::default(); + let id = "mem0".to_string(); + let factory = Arc::new(Mutex::new(DummyMemRegionFactory {})); + + // enable multi-region in virtio-mem + { + let mut dev = Mem::>::new( + id.clone(), + 0xc00, + 0xc00, + true, + None, + epoll_mgr.clone(), + factory.clone(), + 0xc0000000, + ) + .unwrap(); + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let mut resources = DeviceResources::new(); + let entry = dbs_device::resources::Resource::MemAddressRange { + base: 0x100000000, + size: 0xc00 << 20, + }; + resources.append(entry); + let entry = dbs_device::resources::Resource::KvmMemSlot(0); + resources.append(entry); + let entry = dbs_device::resources::Resource::KvmMemSlot(1); + resources.append(entry); + let content = + VirtioDevice::>, QueueSync, GuestRegionMmap>::set_resource( + &mut dev, vm_fd, resources, + ) + .unwrap(); + assert!(content.is_none()); + } + + // disable multi-region in virtio-mem + { + let mut dev = Mem::>::new( + id, 0xc00, 0xc00, false, None, epoll_mgr, factory, 0xc0000000, + ) + .unwrap(); + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let 
mut resources = DeviceResources::new(); + let entry = dbs_device::resources::Resource::MemAddressRange { + base: 0x100000000, + size: 0xc00 << 20, + }; + resources.append(entry); + let entry = dbs_device::resources::Resource::KvmMemSlot(0); + resources.append(entry); + let content = + VirtioDevice::>, QueueSync, GuestRegionMmap>::set_resource( + &mut dev, vm_fd, resources, + ) + .unwrap(); + assert!(content.is_none()); + } + } + + #[test] + fn test_mem_virtio_device_spec() { + let epoll_mgr = EpollManager::default(); + let id = "mem0".to_string(); + let factory = Arc::new(Mutex::new(DummyMemRegionFactory {})); + let dev = + Mem::>::new(id, 200, 200, false, None, epoll_mgr, factory, 200) + .unwrap(); + assert!(dev.set_requested_size(200).is_ok()); + } + + #[test] + fn test_mem_virtio_device_activate() { + let epoll_mgr = EpollManager::default(); + let id = "mem0".to_string(); + let factory = Arc::new(Mutex::new(DummyMemRegionFactory {})); + // queue length error + { + let mut dev = Mem::>::new( + id.clone(), + 200, + 200, + false, + None, + epoll_mgr.clone(), + factory.clone(), + 200, + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![ + VirtioQueueConfig::::create(16, 0).unwrap(), + VirtioQueueConfig::::create(16, 0).unwrap(), + ]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + let result = dev.activate(config); + assert!(matches!(result, Err(ActivateError::InvalidParam))); + } + // fail because map_regions should not be empty + { + let mut dev = Mem::>::new( + id.clone(), + 200, + 200, + false, + None, + epoll_mgr.clone(), + factory.clone(), + 200, + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = 
vec![VirtioQueueConfig::::create(128, 0).unwrap()]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + let result = dev.activate(config); + assert!(matches!(result, Err(ActivateError::InternalError))); + } + // test activate mem device is correct + { + let mut dev = Mem::>::new( + id, 200, 200, false, None, epoll_mgr, factory, 200, + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![VirtioQueueConfig::::create(128, 0).unwrap()]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + dev.map_regions.lock().unwrap().push((0, None)); + assert!(dev.activate(config).is_ok()); + } + } + + #[test] + fn test_mem_virtio_device_remove() { + let epoll_mgr = EpollManager::default(); + let id = "mem0".to_string(); + let factory = Arc::new(Mutex::new(DummyMemRegionFactory {})); + let mut dev = + Mem::>::new(id, 200, 200, false, None, epoll_mgr, factory, 200) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![VirtioQueueConfig::::create(128, 0).unwrap()]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + dev.map_regions.lock().unwrap().push((0, None)); + + // test activate mem device is correct + assert!(dev.activate(config).is_ok()); + assert!(dev.subscriber_id.is_some()); + // test remove mem device is correct 
+ VirtioDevice::>, QueueSync, GuestRegionMmap>::remove(&mut dev); + assert!(dev.subscriber_id.is_none()); + } + + #[test] + fn test_mem_epoll_handler_handle_event() { + let handler = create_mem_epoll_handler("test_1".to_string()); + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_mem_epoll_handler("test_2".to_string()); + + //invalid queue index + let events = Events::with_data(&event_fd, 1024, event_set); + handler.config.queues[0].generate_event().unwrap(); + handler.process(events, &mut event_op); + //valid + let events = Events::with_data(&event_fd, 0, event_set); + handler.config.queues[0].generate_event().unwrap(); + handler.process(events, &mut event_op); + } + + #[test] + fn test_mem_epoll_handler_process_queue() { + let mut handler = create_mem_epoll_handler("test_1".to_string()); + let m = &handler.config.vm_as.clone(); + // fail to parse available descriptor chain + { + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x400, VIRTQ_DESC_F_NEXT, 1); + handler.config.queues = vec![VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + )]; + handler.config.queues[0].generate_event().unwrap(); + assert!(handler.process_queue(0)); + } + // success + { + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x4, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1).set(0x2000, 0x4, VIRTQ_DESC_F_WRITE, 2); + handler.config.queues = vec![VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + )]; + 
handler.config.queues[0].generate_event().unwrap(); + assert!(handler.process_queue(0)); + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/mmio/dragonball.rs b/src/dragonball/src/dbs_virtio_devices/src/mmio/dragonball.rs new file mode 100644 index 000000000..7cceb2094 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/mmio/dragonball.rs @@ -0,0 +1,203 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 or BSD-3-Clause + +//! Related to Dragonball MMIO extension. + +/// Device Vendor ID for virtio devices emulated by Dragonball. +/// The upper 24 bits are used as vendor id, and the lower 8 bits are used as features. +pub const MMIO_VENDOR_ID_DRAGONBALL: u32 = 0xdbfcdb00; + +/// Mask for feature flags in the vendor id field +pub const DRAGONBALL_FEATURE_MASK: u32 = 0xff; + +/// Assume `MMIO_INT_VRING` is always set in the interrupt status register when handling interrupts. +/// With this feature available, the device driver may optimize the way to handle interrupts. +pub const DRAGONBALL_FEATURE_INTR_USED: u32 = 0x1; + +/// The device supports Message Signaled Interrupt. +pub const DRAGONBALL_FEATURE_MSI_INTR: u32 = 0x2; + +/// The device implements per-queue notification register. +/// If this feature bit is set, the VIRTIO_MMIO_QUEUE_NOTIFY register becomes read-write. +/// On reading, the lower 16-bit contains doorbell base offset starting from the MMIO window base, +/// and the upper 16-bit contains scale for the offset. The notification register address for +/// virtque is: +/// offset = base + doorbell_base + doorbell_scale * queue_idx +pub const DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY: u32 = 0x4; + +/// PVDMA feature enabled +pub const DRAGONBALL_FEATURE_PVDMA: u32 = 0x08; + +/// Default size resrved for virtio-mmio doorbell address space. 
+/// +/// This represents the size of the mmio device reserved for doorbell which used to per queue notify, +/// we need to request resource with the `MMIO_DEFAULT_CFG_SIZE + DRAGONBALL_MMIO_DOORBELL_SIZE` +pub const DRAGONBALL_MMIO_DOORBELL_SIZE: u64 = 0x1000; + +/// Default offset of the mmio doorbell +pub const DRAGONBALL_MMIO_DOORBELL_OFFSET: u64 = 0x1000; + +/// Max queue num when the `fast-mmio` enabled, because we only reserved 0x200 memory region for +/// per queue notify +pub const DRAGONBALL_MMIO_MAX_QUEUE_NUM: u64 = 255; + +/// Scale of the doorbell for per queue notify +pub const DRAGONBALL_MMIO_DOORBELL_SCALE: u64 = 0x04; + +/// This represents the offset at which the device should call DeviceIo::write in order to write +/// to its configuration space. +pub const MMIO_CFG_SPACE_OFF: u64 = 0x100; + +// The format of the 16-bit MSI Control and Status register. +// On read: +// - bit 15: 1 if MSI is supported, 0 if MSI is not supported. +// - bit 0-14: reserved, read as zero. +// On write: +// - bit 15: 1 to enable MSI, 0 to disable MSI. +// - bit 0-14: ignored. + +/// Message Signaled Interrupt is supported when reading from the CSR. +pub const MMIO_MSI_CSR_SUPPORTED: u16 = 0x8000; + +/// Enable MSI if this bit is set when writing to the CSR, otherwise disable MSI. +pub const MMIO_MSI_CSR_ENABLED: u16 = 0x8000; + +// The format of the 16-bit write-only MSI Command register. +// - bit 12-15: command code +// - bit 0-11: command parameter + +/// Mask for the command code in the MSI command register. +pub const MMIO_MSI_CMD_CODE_MASK: u16 = 0xf000; + +/// Mask for the command argument in the MSI command register. +pub const MMIO_MSI_CMD_ARG_MASK: u16 = 0x0fff; + +/// Command code to update MSI entry configuration. +/// The argument is the MSI vector number to update. 
+pub const MMIO_MSI_CMD_CODE_UPDATE: u16 = 0x1000; +/// Comamnd to mask and unmask msi interrupt +pub const MMIO_MSI_CMD_CODE_INT_MASK: u16 = 0x2000; +pub const MMIO_MSI_CMD_CODE_INT_UNMASK: u16 = 0x3000; + +// Define a 16-byte area to control MMIO MSI + +// MSI control/status register offset +pub const REG_MMIO_MSI_CSR: u64 = 0x0c0; +// MSI command register offset +pub const REG_MMIO_MSI_COMMAND: u64 = 0x0c2; +// MSI address_lo register offset +pub const REG_MMIO_MSI_ADDRESS_L: u64 = 0x0c4; +// MSI address_hi register offset +pub const REG_MMIO_MSI_ADDRESS_H: u64 = 0x0c8; +// MSI data register offset +pub const REG_MMIO_MSI_DATA: u64 = 0x0cc; + +// RW: MSI feature enabled +pub const REG_MMIO_MSI_CSR_ENABLE: u64 = 0x8000; +// RO: Maximum queue size available +pub const REG_MMIO_MSI_CSR_QMASK: u64 = 0x07ff; +// Reserved +pub const REG_MMIO_MSI_CSR_RESERVED: u64 = 0x7800; + +pub const REG_MMIO_MSI_CMD_UPDATE: u64 = 0x1; + +/// Defines the offset and scale of the mmio doorbell. +/// +/// Support per-virtque doorbell, so the guest kernel may directly write to the doorbells provided +/// by hardware virtio devices. +#[derive(Default, Debug, PartialEq, Eq)] +pub struct DoorBell { + offset: u32, + scale: u32, +} + +impl DoorBell { + /// Creates a Doorbell. + pub fn new(offset: u32, scale: u32) -> Self { + Self { offset, scale } + } + + /// Returns the offset. + pub fn offset(&self) -> u32 { + self.offset + } + + /// Returns the scale. + pub fn scale(&self) -> u32 { + self.scale + } + + /// Returns the offset with the specified index of virtio queue. + pub fn queue_offset(&self, queue_index: usize) -> u64 { + (self.offset as u64) + (self.scale as u64) * (queue_index as u64) + } + + /// Returns the register data. + pub fn register_data(&self) -> u32 { + self.offset | (self.scale << 16) + } +} + +/// MSI interrupts. 
+#[derive(Default, Debug, PartialEq, Eq)] +pub struct Msi { + pub index_select: u32, + pub address_low: u32, + pub address_high: u32, + pub data: u32, +} + +impl Msi { + /// Sets index select. + pub fn set_index_select(&mut self, v: u32) { + self.index_select = v; + } + /// Sets address low. + pub fn set_address_low(&mut self, v: u32) { + self.address_low = v; + } + /// Sets address high. + pub fn set_address_high(&mut self, v: u32) { + self.address_high = v; + } + /// Sets msi data. + pub fn set_data(&mut self, v: u32) { + self.data = v; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_doorbell() { + let door = DoorBell::new( + DRAGONBALL_MMIO_DOORBELL_OFFSET as u32, + DRAGONBALL_MMIO_DOORBELL_SCALE as u32, + ); + assert_eq!(door.offset(), DRAGONBALL_MMIO_DOORBELL_OFFSET as u32); + assert_eq!(door.scale(), DRAGONBALL_MMIO_DOORBELL_SCALE as u32); + assert_eq!(door.queue_offset(0), DRAGONBALL_MMIO_DOORBELL_OFFSET); + assert_eq!(door.queue_offset(4), 0x1010); + assert_eq!(door.register_data(), 0x1000 | 0x40000); + } + + #[test] + fn test_msi() { + let mut msi = Msi::default(); + msi.set_index_select(1); + msi.set_address_low(2); + msi.set_address_high(3); + msi.set_data(4); + assert_eq!( + msi, + Msi { + index_select: 1, + address_low: 2, + address_high: 3, + data: 4 + } + ); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/mmio/mmio_state.rs b/src/dragonball/src/dbs_virtio_devices/src/mmio/mmio_state.rs new file mode 100644 index 000000000..434be51a9 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/mmio/mmio_state.rs @@ -0,0 +1,665 @@ +// Copyright (C) 2019 Alibaba Cloud Computing. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +/////////////////////////////////////////////////////////////// +// TODO: we really need better support of device reset, error recovery, exceptions handling. 
+/////////////////////////////////////////////////////////////// + +use std::ops::Deref; +use std::sync::Arc; + +use dbs_device::resources::DeviceResources; +use dbs_interrupt::{DeviceInterruptManager, DeviceInterruptMode, InterruptIndex, KvmIrqManager}; +use kvm_bindings::kvm_userspace_memory_region; +use kvm_ioctls::{IoEventAddress, NoDatamatch, VmFd}; +use log::{debug, error, info, warn}; +use virtio_queue::QueueT; +use vm_memory::{GuestAddressSpace, GuestMemoryRegion}; + +use crate::{ + mmio::*, warn_or_panic, ActivateError, Error, Result, VirtioDevice, VirtioDeviceConfig, + VirtioQueueConfig, VirtioSharedMemory, VirtioSharedMemoryList, DEVICE_DRIVER_OK, DEVICE_FAILED, +}; + +/// The state of Virtio Mmio device. +pub struct MmioV2DeviceState { + device: Box>, + vm_fd: Arc, + vm_as: AS, + intr_mgr: DeviceInterruptManager>, + device_resources: DeviceResources, + queues: Vec>, + + mmio_base: u64, + has_ctrl_queue: bool, + device_activated: bool, + ioevent_registered: bool, + + features_select: u32, + acked_features_select: u32, + queue_select: u32, + + msi: Option, + doorbell: Option, + + shm_region_id: u32, + shm_regions: Option>, +} + +impl MmioV2DeviceState +where + AS: GuestAddressSpace + Clone, + Q: QueueT + Clone, + R: GuestMemoryRegion, +{ + /// Returns a reference to the internal device object. + pub fn get_inner_device(&self) -> &dyn VirtioDevice { + self.device.as_ref() + } + + /// Returns a mutable reference to the internal device object. 
+ pub fn get_inner_device_mut(&mut self) -> &mut dyn VirtioDevice { + self.device.as_mut() + } + + pub(crate) fn new( + mut device: Box>, + vm_fd: Arc, + vm_as: AS, + irq_manager: Arc, + device_resources: DeviceResources, + mmio_base: u64, + doorbell_enabled: bool, + ) -> Result { + let intr_mgr = + DeviceInterruptManager::new(irq_manager, &device_resources).map_err(Error::IOError)?; + + let (queues, has_ctrl_queue) = Self::create_queues(device.as_ref())?; + + // Assign requested device resources back to virtio device and let it do necessary setups, + // as only virtio device knows how to use such resources. And if there's + // VirtioSharedMemoryList returned, assigned it to MmioV2DeviceState + let shm_regions = device + .set_resource(vm_fd.clone(), device_resources.clone()) + .map_err(|e| { + error!("Failed to assign device resource to virtio device: {}", e); + e + })?; + + let doorbell = if doorbell_enabled { + Some(DoorBell::new( + DRAGONBALL_MMIO_DOORBELL_OFFSET as u32, + DRAGONBALL_MMIO_DOORBELL_SCALE as u32, + )) + } else { + None + }; + + Ok(MmioV2DeviceState { + device, + vm_fd, + vm_as, + intr_mgr, + device_resources, + queues, + mmio_base, + has_ctrl_queue, + ioevent_registered: false, + device_activated: false, + features_select: 0, + acked_features_select: 0, + queue_select: 0, + doorbell, + msi: None, + shm_region_id: 0, + shm_regions, + }) + } + + pub(crate) fn activate(&mut self, device: &MmioV2Device) -> Result<()> { + if self.device_activated { + return Ok(()); + } + + // If the driver incorrectly sets up the queues, the following check will fail and take + // the device into an unusable state. 
+ if !self.check_queues_valid() { + return Err(Error::ActivateError(ActivateError::InvalidQueueConfig)); + } + + self.register_ioevent()?; + + self.intr_mgr.enable()?; + + let config = self.create_device_config(device)?; + + self.device + .activate(config) + .map(|_| self.device_activated = true) + .map_err(|e| { + error!("device activate error: {:?}", e); + Error::ActivateError(e) + }) + } + + fn create_queues( + device: &dyn VirtioDevice, + ) -> Result<(Vec>, bool)> { + let mut queues = Vec::new(); + for (idx, size) in device.queue_max_sizes().iter().enumerate() { + queues.push(VirtioQueueConfig::create(*size, idx as u16)?); + } + + // The ctrl queue must be append to Queue Vec, because the guest will + // configure it which is same with other queues. + let has_ctrl_queue = device.ctrl_queue_max_sizes() > 0; + if has_ctrl_queue { + queues.push(VirtioQueueConfig::create( + device.ctrl_queue_max_sizes(), + queues.len() as u16, + )?); + } + + Ok((queues, has_ctrl_queue)) + } + + fn create_queue_config( + &mut self, + device: &MmioV2Device, + ) -> Result>> { + // Safe because we have just called self.intr_mgr.enable(). + let group = self.intr_mgr.get_group().unwrap(); + let mut queues = Vec::new(); + for queue in self.queues.iter() { + //The first interrupt index is device config change. + let queue_notifier = crate::notifier::create_queue_notifier( + group.clone(), + device.interrupt_status(), + queue.index() as InterruptIndex + 1, + ); + queues.push(VirtioQueueConfig::new( + queue.queue.clone(), + queue.eventfd.clone(), + queue_notifier, + queue.index(), + )); + } + Ok(queues) + } + + fn create_device_config( + &mut self, + device: &MmioV2Device, + ) -> Result> { + let mut queues = self.create_queue_config(device)?; + let ctrl_queue = if self.has_ctrl_queue { + queues.pop() + } else { + None + }; + + // Safe because we have just called self.intr_mgr.enable(). + let group = self.intr_mgr.get_group().unwrap(); + //The first interrupt index is device config change. 
+ let notifier = crate::notifier::create_device_notifier(group, device.interrupt_status(), 0); + + let mut config = VirtioDeviceConfig::new( + self.vm_as.clone(), + self.vm_fd.clone(), + self.device_resources.clone(), + queues, + ctrl_queue, + notifier, + ); + if let Some(shm_regions) = self.shm_regions.as_ref() { + config.set_shm_regions((*shm_regions).clone()); + } + Ok(config) + } + + fn register_ioevent(&mut self) -> Result<()> { + for (i, queue) in self.queues.iter().enumerate() { + if let Some(doorbell) = self.doorbell.as_ref() { + let io_addr = IoEventAddress::Mmio(self.mmio_base + doorbell.queue_offset(i)); + if let Err(e) = self + .vm_fd + .register_ioevent(&queue.eventfd, &io_addr, NoDatamatch) + { + self.revert_ioevent(i, &io_addr, true); + return Err(Error::IOError(std::io::Error::from_raw_os_error(e.errno()))); + } + } + // always register ioeventfd in MMIO_NOTIFY_REG_OFFSET to avoid guest kernel which not support doorbell + let io_addr = IoEventAddress::Mmio(self.mmio_base + MMIO_NOTIFY_REG_OFFSET as u64); + if let Err(e) = self + .vm_fd + .register_ioevent(&queue.eventfd, &io_addr, i as u32) + { + self.unregister_ioevent_doorbell(); + self.revert_ioevent(i, &io_addr, false); + return Err(Error::IOError(std::io::Error::from_raw_os_error(e.errno()))); + } + } + self.ioevent_registered = true; + + Ok(()) + } + + #[inline] + #[allow(dead_code)] + pub(crate) fn queues(&self) -> &Vec> { + &self.queues + } + + #[inline] + #[allow(dead_code)] + pub(crate) fn queues_mut(&mut self) -> &mut Vec> { + &mut self.queues + } + + #[inline] + pub(crate) fn features_select(&self) -> u32 { + self.features_select + } + + #[inline] + pub(crate) fn set_features_select(&mut self, v: u32) { + self.features_select = v; + } + + #[inline] + #[allow(dead_code)] + pub(crate) fn acked_features_select(&mut self) -> u32 { + self.acked_features_select + } + + #[inline] + pub(crate) fn set_acked_features_select(&mut self, v: u32) { + self.acked_features_select = v; + } + + #[inline] + 
#[allow(dead_code)] + pub(crate) fn queue_select(&mut self) -> u32 { + self.queue_select + } + + #[inline] + pub(crate) fn set_queue_select(&mut self, v: u32) { + self.queue_select = v; + } + + #[inline] + pub(crate) fn set_acked_features(&mut self, v: u32) { + self.device + .set_acked_features(self.acked_features_select, v) + } + + #[inline] + pub(crate) fn set_shm_region_id(&mut self, v: u32) { + self.shm_region_id = v; + } + + #[inline] + pub(crate) fn set_msi_address_low(&mut self, v: u32) { + if let Some(m) = self.msi.as_mut() { + m.set_address_low(v) + } + } + + #[inline] + pub(crate) fn set_msi_address_high(&mut self, v: u32) { + if let Some(m) = self.msi.as_mut() { + m.set_address_high(v) + } + } + + #[inline] + pub(crate) fn set_msi_data(&mut self, v: u32) { + if let Some(m) = self.msi.as_mut() { + m.set_data(v) + } + } + + #[inline] + pub(crate) fn shm_regions(&self) -> Option<&VirtioSharedMemoryList> { + self.shm_regions.as_ref() + } + + #[inline] + pub(crate) fn device_activated(&self) -> bool { + self.device_activated + } + + #[inline] + pub(crate) fn doorbell(&self) -> Option<&DoorBell> { + self.doorbell.as_ref() + } + + pub(crate) fn deactivate(&mut self) { + if self.device_activated { + self.device_activated = false; + } + } + + pub(crate) fn reset(&mut self) -> Result<()> { + if self.device_activated { + warn!("reset device while it's still in active state"); + Ok(()) + } else { + // . Keep interrupt_evt and queue_evts as is. There may be pending + // notifications in those eventfds, but nothing will happen other + // than supurious wakeups. + // . Do not reset config_generation and keep it monotonically increasing + for queue in self.queues.iter_mut() { + let new_queue = Q::new(queue.queue.max_size()); + if let Err(e) = new_queue { + warn!("reset device failed because new virtio-queue could not be created due to {:?}", e); + return Err(Error::VirtioQueueError(e)); + } else { + // unwrap is safe here since we have checked new_queue result above. 
+ queue.queue = new_queue.unwrap(); + } + } + + let _ = self.intr_mgr.reset(); + self.unregister_ioevent(); + self.features_select = 0; + self.acked_features_select = 0; + self.queue_select = 0; + self.msi = None; + self.doorbell = None; + Ok(()) + } + } + + fn unregister_ioevent(&mut self) { + if self.ioevent_registered { + let io_addr = IoEventAddress::Mmio(self.mmio_base + MMIO_NOTIFY_REG_OFFSET as u64); + for (i, queue) in self.queues.iter().enumerate() { + let _ = self + .vm_fd + .unregister_ioevent(&queue.eventfd, &io_addr, i as u32); + self.ioevent_registered = false; + } + } + } + + fn revert_ioevent(&mut self, num: usize, io_addr: &IoEventAddress, wildcard: bool) { + assert!(num < self.queues.len()); + let mut idx = num; + while idx > 0 { + let datamatch = if wildcard { + NoDatamatch.into() + } else { + idx as u64 + }; + idx -= 1; + let _ = self + .vm_fd + .unregister_ioevent(&self.queues[idx].eventfd, io_addr, datamatch); + } + } + + fn unregister_ioevent_doorbell(&mut self) { + if let Some(doorbell) = self.doorbell.as_ref() { + for (i, queue) in self.queues.iter().enumerate() { + let io_addr = IoEventAddress::Mmio(self.mmio_base + doorbell.queue_offset(i)); + let _ = self + .vm_fd + .unregister_ioevent(&queue.eventfd, &io_addr, NoDatamatch); + } + } + } + + pub(crate) fn check_queues_valid(&self) -> bool { + let mem = self.vm_as.memory(); + // All queues must have been enabled, we doesn't allow disabled queues. 
+ self.queues.iter().all(|c| c.queue.is_valid(mem.deref())) + } + + pub(crate) fn with_queue(&self, d: U, f: F) -> U + where + F: FnOnce(&Q) -> U, + { + match self.queues.get(self.queue_select as usize) { + Some(config) => f(&config.queue), + None => d, + } + } + + pub(crate) fn with_queue_mut(&mut self, f: F) -> bool { + if let Some(config) = self.queues.get_mut(self.queue_select as usize) { + f(&mut config.queue); + true + } else { + false + } + } + + pub(crate) fn get_shm_field(&mut self, d: U, f: F) -> U + where + F: FnOnce(&VirtioSharedMemory) -> U, + { + if let Some(regions) = self.shm_regions.as_ref() { + match regions.region_list.get(self.shm_region_id as usize) { + Some(region) => f(region), + None => d, + } + } else { + d + } + } + + pub(crate) fn update_msi_enable(&mut self, v: u16, device: &MmioV2Device) { + // Can't switch interrupt mode once the device has been activated. + if device.driver_status() & DEVICE_DRIVER_OK != 0 { + if device.driver_status() & DEVICE_FAILED == 0 { + debug!("mmio_v2: can not switch interrupt mode for active device"); + device.set_driver_failed(); + } + return; + } + + if v & MMIO_MSI_CSR_ENABLED != 0 { + // Guest enable msi interrupt + if self.msi.is_none() { + debug!("mmio_v2: switch to MSI interrupt mode"); + match self + .intr_mgr + .set_working_mode(DeviceInterruptMode::GenericMsiIrq) + { + Ok(_) => self.msi = Some(Msi::default()), + Err(e) => { + warn!("mmio_v2: failed to switch to MSI interrupt mode: {:?}", e); + device.set_driver_failed(); + } + } + } + } else if self.msi.is_some() { + // Guest disable msi interrupt + match self + .intr_mgr + .set_working_mode(DeviceInterruptMode::LegacyIrq) + { + Ok(_) => self.msi = None, + Err(e) => { + warn!( + "mmio_v2: failed to switch to legacy interrupt mode: {:?}", + e + ); + device.set_driver_failed(); + } + } + } + } + + fn update_msi_cfg(&mut self, v: u16) -> Result<()> { + if let Some(msi) = self.msi.as_mut() { + msi.index_select = v as u32; + self.intr_mgr + 
.set_msi_low_address(msi.index_select, msi.address_low) + .map_err(Error::InterruptError)?; + self.intr_mgr + .set_msi_high_address(msi.index_select, msi.address_high) + .map_err(Error::InterruptError)?; + self.intr_mgr + .set_msi_data(msi.index_select, msi.data) + .map_err(Error::InterruptError)?; + if self.intr_mgr.is_enabled() { + self.intr_mgr + .update(msi.index_select) + .map_err(Error::InterruptError)?; + } + } + + Ok(()) + } + + fn mask_msi_int(&mut self, index: u32, mask: bool) -> Result<()> { + if self.intr_mgr.is_enabled() { + if let Some(group) = self.intr_mgr.get_group() { + let old_mask = self + .intr_mgr + .get_msi_mask(index) + .map_err(Error::InterruptError)?; + debug!("mmio_v2 old mask {}, mask {}", old_mask, mask); + + if !old_mask && mask { + group.mask(index)?; + self.intr_mgr + .set_msi_mask(index, true) + .map_err(Error::InterruptError)?; + } else if old_mask && !mask { + group.unmask(index)?; + self.intr_mgr + .set_msi_mask(index, false) + .map_err(Error::InterruptError)?; + } + } + } + + Ok(()) + } + + pub(crate) fn handle_msi_cmd(&mut self, v: u16, device: &MmioV2Device) { + let arg = v & MMIO_MSI_CMD_ARG_MASK; + match v & MMIO_MSI_CMD_CODE_MASK { + MMIO_MSI_CMD_CODE_UPDATE => { + if arg > self.device.queue_max_sizes().len() as u16 { + info!("mmio_v2: configure interrupt for invalid vector {}", v,); + } else if let Err(e) = self.update_msi_cfg(arg) { + warn_or_panic!("mmio_v2: failed to configure vector {}, {:?}", v, e); + } + } + MMIO_MSI_CMD_CODE_INT_MASK => { + if let Err(e) = self.mask_msi_int(arg as u32, true) { + warn_or_panic!("mmio_v2: failed to mask {}, {:?}", v, e); + } + } + MMIO_MSI_CMD_CODE_INT_UNMASK => { + if let Err(e) = self.mask_msi_int(arg as u32, false) { + warn_or_panic!("mmio_v2: failed to unmask {}, {:?}", v, e); + } + } + _ => { + warn!("mmio_v2: unknown msi command: 0x{:x}", v); + device.set_driver_failed(); + } + } + } +} + +impl Drop for MmioV2DeviceState +where + AS: GuestAddressSpace + Clone, + Q: QueueT, + R: 
GuestMemoryRegion, +{ + fn drop(&mut self) { + if let Some(memlist) = &self.shm_regions { + let mmio_res = self.device_resources.get_mmio_address_ranges(); + let slots_res = self.device_resources.get_kvm_mem_slots(); + let shm_regions_num = mmio_res.len(); + let slots_num = slots_res.len(); + assert_eq!((shm_regions_num, slots_num), (1, 1)); + let kvm_mem_region = kvm_userspace_memory_region { + slot: slots_res[0], + flags: 0, + guest_phys_addr: memlist.guest_addr.0, + memory_size: 0, + userspace_addr: memlist.host_addr, + }; + unsafe { + self.vm_fd.set_user_memory_region(kvm_mem_region).unwrap(); + } + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::{GuestAddress, GuestMemoryMmap, GuestRegionMmap}; + + use super::*; + use crate::mmio::mmio_v2::tests::*; + + pub fn get_mmio_state( + have_msi: bool, + doorbell: bool, + ctrl_queue_size: u16, + ) -> MmioV2DeviceState, QueueSync, GuestRegionMmap> { + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap()); + + let mmio_base = 0; + let device_resources = get_device_resource(have_msi, false); + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + vm_fd.create_irq_chip().unwrap(); + + let irq_manager = Arc::new(KvmIrqManager::new(vm_fd.clone())); + irq_manager.initialize().unwrap(); + + let device = MmioDevice::new(ctrl_queue_size); + + MmioV2DeviceState::new( + Box::new(device), + vm_fd, + mem, + irq_manager, + device_resources, + mmio_base, + doorbell, + ) + .unwrap() + } + + #[test] + fn test_virtio_mmio_state_new() { + let mut state = get_mmio_state(false, false, 1); + + assert_eq!(state.queues.len(), 3); + assert!(!state.check_queues_valid()); + + state.queue_select = 0; + assert_eq!(state.with_queue(0, |q| q.max_size()), 16); + assert!(state.with_queue_mut(|q| q.set_size(16))); + assert_eq!(state.queues[state.queue_select as usize].queue.size(), 16); + + state.queue_select = 1; + 
assert_eq!(state.with_queue(0, |q| q.max_size()), 32); + assert!(state.with_queue_mut(|q| q.set_size(8))); + assert_eq!(state.queues[state.queue_select as usize].queue.size(), 8); + + state.queue_select = 3; + assert_eq!(state.with_queue(0xff, |q| q.max_size()), 0xff); + assert!(!state.with_queue_mut(|q| q.set_size(16))); + + assert!(!state.check_queues_valid()); + + drop(state); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/mmio/mmio_v2.rs b/src/dragonball/src/dbs_virtio_devices/src/mmio/mmio_v2.rs new file mode 100644 index 000000000..2b6df1b21 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/mmio/mmio_v2.rs @@ -0,0 +1,1237 @@ +// Copyright (C) 2019 Alibaba Cloud Computing. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex, MutexGuard}; + +use byteorder::{ByteOrder, LittleEndian}; +use dbs_device::resources::{DeviceResources, Resource}; +use dbs_device::{DeviceIo, IoAddress}; +use dbs_interrupt::{InterruptStatusRegister32, KvmIrqManager}; +use kvm_ioctls::VmFd; +use log::{debug, info, warn}; +use virtio_queue::QueueT; +use vm_memory::{GuestAddressSpace, GuestMemoryRegion}; + +use crate::{ + mmio::*, Error, Result, VirtioDevice, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, + DEVICE_FAILED, DEVICE_FEATURES_OK, DEVICE_INIT, VIRTIO_INTR_VRING, +}; + +const DEVICE_STATUS_INIT: u32 = DEVICE_INIT; +const DEVICE_STATUS_ACKNOWLEDE: u32 = DEVICE_STATUS_INIT | DEVICE_ACKNOWLEDGE; +const DEVICE_STATUS_DRIVER: u32 = DEVICE_STATUS_ACKNOWLEDE | DEVICE_DRIVER; +const DEVICE_STATUS_FEATURE_OK: u32 = DEVICE_STATUS_DRIVER | DEVICE_FEATURES_OK; +const DEVICE_STATUS_DRIVER_OK: u32 = DEVICE_STATUS_FEATURE_OK | DEVICE_DRIVER_OK; + +/// Implements the +/// [MMIO](http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-1090002) +/// transport for virtio devices. 
+/// +/// This requires 3 points of installation to work with a VM: +/// +/// 1. Mmio reads and writes must be sent to this device at what is referred to here as MMIO base. +/// 1. `Mmio::queue_evts` must be installed at `MMIO_NOTIFY_REG_OFFSET` offset from the MMIO +/// base. Each event in the array must be signaled if the index is written at that offset. +/// 1. `Mmio::interrupt_evt` must signal an interrupt that the guest driver is listening to when it +/// is written to. +/// +/// Typically one page (4096 bytes) of MMIO address space is sufficient to handle this transport +/// and inner virtio device. +pub struct MmioV2Device { + state: Mutex>, + assigned_resources: DeviceResources, + mmio_cfg_res: Resource, + device_vendor: u32, + driver_status: AtomicU32, + config_generation: AtomicU32, + interrupt_status: Arc, +} + +impl MmioV2Device +where + AS: GuestAddressSpace + Clone, + Q: QueueT + Clone, + R: GuestMemoryRegion, +{ + /// Constructs a new MMIO transport for the given virtio device. 
+ pub fn new( + vm_fd: Arc, + vm_as: AS, + irq_manager: Arc, + device: Box>, + resources: DeviceResources, + mut features: Option, + ) -> Result { + let mut device_resources = DeviceResources::new(); + let mut mmio_cfg_resource = None; + let mut mmio_base = 0; + let mut doorbell_enabled = false; + + for res in resources.iter() { + if let Resource::MmioAddressRange { base, size } = res { + if mmio_cfg_resource.is_none() + && *size == MMIO_DEFAULT_CFG_SIZE + DRAGONBALL_MMIO_DOORBELL_SIZE + { + mmio_base = *base; + mmio_cfg_resource = Some(res.clone()); + continue; + } + } + device_resources.append(res.clone()); + } + let mmio_cfg_res = match mmio_cfg_resource { + Some(v) => v, + None => return Err(Error::InvalidInput), + }; + + let msi_feature = if resources.get_generic_msi_irqs().is_some() { + DRAGONBALL_FEATURE_MSI_INTR + } else { + 0 + }; + + if let Some(ref mut ft) = features { + if (*ft & DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY != 0) + && vm_fd.check_extension(kvm_ioctls::Cap::IoeventfdNoLength) + { + doorbell_enabled = true; + } else { + *ft &= !DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY; + } + } + + debug!("mmiov2: fast-mmio enabled: {}", doorbell_enabled); + + let state = MmioV2DeviceState::new( + device, + vm_fd, + vm_as, + irq_manager, + device_resources, + mmio_base, + doorbell_enabled, + )?; + + let mut device_vendor = MMIO_VENDOR_ID_DRAGONBALL | msi_feature; + if let Some(ft) = features { + debug!("mmiov2: feature bit is 0x{:0X}", ft); + device_vendor |= ft & DRAGONBALL_FEATURE_MASK; + } + + Ok(MmioV2Device { + state: Mutex::new(state), + assigned_resources: resources, + mmio_cfg_res, + device_vendor, + driver_status: AtomicU32::new(DEVICE_INIT), + config_generation: AtomicU32::new(0), + interrupt_status: Arc::new(InterruptStatusRegister32::new()), + }) + } + + /// Acquires the state while holding the lock. + pub fn state(&self) -> MutexGuard> { + // Safe to unwrap() because we don't expect poisoned lock here. 
+ self.state.lock().unwrap() + } + + /// Removes device. + pub fn remove(&self) { + self.state().get_inner_device_mut().remove(); + } + + /// Returns the Resource. + pub fn get_mmio_cfg_res(&self) -> Resource { + self.mmio_cfg_res.clone() + } + + /// Returns the type of device. + pub fn get_device_type(&self) -> u32 { + self.state().get_inner_device().device_type() + } + + pub(crate) fn interrupt_status(&self) -> Arc { + self.interrupt_status.clone() + } + + #[inline] + /// Atomic sets the drive state to fail. + pub(crate) fn set_driver_failed(&self) { + self.driver_status.fetch_or(DEVICE_FAILED, Ordering::SeqCst); + } + + #[inline] + pub(crate) fn driver_status(&self) -> u32 { + self.driver_status.load(Ordering::SeqCst) + } + + #[inline] + fn check_driver_status(&self, set: u32, clr: u32) -> bool { + self.driver_status() & (set | clr) == set + } + + #[inline] + fn exchange_driver_status(&self, old: u32, new: u32) -> std::result::Result { + self.driver_status + .compare_exchange(old, new, Ordering::SeqCst, Ordering::SeqCst) + } + + /// Update driver status according to the state machine defined by VirtIO Spec 1.0. + /// Please refer to VirtIO Spec 1.0, section 2.1.1 and 3.1.1. + /// + /// The driver MUST update device status, setting bits to indicate the completed steps + /// of the driver initialization sequence specified in 3.1. The driver MUST NOT clear + /// a device status bit. If the driver sets the FAILED bit, the driver MUST later reset + /// the device before attempting to re-initialize. + fn update_driver_status(&self, v: u32) { + // Serialize to update device state. 
+ let mut state = self.state(); + let mut result = Err(DEVICE_FAILED); + if v == DEVICE_STATUS_ACKNOWLEDE { + result = self.exchange_driver_status(DEVICE_STATUS_INIT, DEVICE_STATUS_ACKNOWLEDE); + } else if v == DEVICE_STATUS_DRIVER { + result = self.exchange_driver_status(DEVICE_STATUS_ACKNOWLEDE, DEVICE_STATUS_DRIVER); + } else if v == DEVICE_STATUS_FEATURE_OK { + result = self.exchange_driver_status(DEVICE_STATUS_DRIVER, DEVICE_STATUS_FEATURE_OK); + } else if v == DEVICE_STATUS_DRIVER_OK { + result = self.exchange_driver_status(DEVICE_STATUS_FEATURE_OK, DEVICE_STATUS_DRIVER_OK); + if result.is_ok() { + if let Err(e) = state.activate(self) { + // Reset internal status to initial state on failure. + // Error is ignored since the device will go to DEVICE_FAILED status. + let _ = state.reset(); + warn!("failed to activate MMIO Virtio device: {:?}", e); + result = Err(DEVICE_FAILED); + } + } + } else if v == 0 { + if self.driver_status() == DEVICE_INIT { + result = Ok(0); + } else if state.device_activated() { + let ret = state.get_inner_device_mut().reset(); + if ret.is_err() { + warn!("failed to reset MMIO Virtio device: {:?}.", ret); + } else { + state.deactivate(); + // it should reset the device's status to init, otherwise, the guest would + // get the wrong device's status. + if let Err(e) = state.reset() { + warn!("failed to reset device state due to {:?}", e); + result = Err(DEVICE_FAILED); + } else { + result = self + .exchange_driver_status(DEVICE_STATUS_DRIVER_OK, DEVICE_STATUS_INIT); + } + } + } + } else if v == self.driver_status() { + // No real state change, nothing to do. + result = Ok(0); + } else if v & DEVICE_FAILED != 0 { + // Guest driver marks device as failed. 
+ self.set_driver_failed(); + result = Ok(0); + } + + if result.is_err() { + warn!( + "invalid virtio driver status transition: 0x{:x} -> 0x{:x}", + self.driver_status(), + v + ); + // TODO: notify backend driver to stop the device + self.set_driver_failed(); + } + } + + fn update_queue_field(&self, f: F) { + // Use mutex for state to protect device.write_config() + let mut state = self.state(); + if self.check_driver_status(DEVICE_FEATURES_OK, DEVICE_DRIVER_OK | DEVICE_FAILED) { + state.with_queue_mut(f); + } else { + info!( + "update virtio queue in invalid state 0x{:x}", + self.driver_status() + ); + } + } + + fn tweak_intr_flags(&self, flags: u32) -> u32 { + // The MMIO virtio transport layer only supports legacy IRQs. And the typical way to + // inject interrupt into the guest is: + // 1) the vhost-user-net slave sends notifcaticaiton to dragonball by writing to eventfd. + // 2) dragonball consumes the notification by read the eventfd. + // 3) dragonball updates interrupt status register. + // 4) dragonball injects interrupt to the guest by writing to an irqfd. + // + // We play a trick here to always report "descriptor ready in the used virtque". + // This trick doesn't break the virtio spec because it allow virtio devices to inject + // supurous interrupts. By applying this trick, the way to inject interrupts gets + // simplified as: + // 1) the vhost-user-net slave sends interrupt to the guest by writing to the irqfd. 
+ if self.device_vendor & DRAGONBALL_FEATURE_INTR_USED != 0 { + flags | VIRTIO_INTR_VRING + } else { + flags + } + } + + fn device_features(&self) -> u32 { + let state = self.state(); + let features_select = state.features_select(); + let mut features = state.get_inner_device().get_avail_features(features_select); + if features_select == 1 { + features |= 0x1; // enable support of VirtIO Version 1 + } + features + } + + fn set_acked_features(&self, v: u32) { + // Use mutex for state to protect device.ack_features() + let mut state = self.state(); + if self.check_driver_status(DEVICE_DRIVER, DEVICE_FEATURES_OK | DEVICE_FAILED) { + state.set_acked_features(v); + } else { + info!( + "ack virtio features in invalid state 0x{:x}", + self.driver_status() + ); + } + } + + fn get_device_config(&self, offset: u64, data: &mut [u8]) { + // Use mutex for state to protect device.write_config() + let mut state = self.state(); + if self.check_driver_status(DEVICE_DRIVER, DEVICE_FAILED) { + if let Err(e) = state.get_inner_device_mut().read_config(offset, data) { + warn!("device read config err: {}", e); + } + } else { + info!("can not read from device config data area before driver is ready"); + } + } + + fn set_device_config(&self, offset: u64, data: &[u8]) { + // Use mutex for state to protect device.write_config() + let mut state = self.state(); + if self.check_driver_status(DEVICE_DRIVER, DEVICE_FAILED) { + if let Err(e) = state.get_inner_device_mut().write_config(offset, data) { + warn!("device write config err: {}", e); + } + } else { + info!("can not write to device config data area before driver is ready"); + } + } + + fn get_shm_base_low(&self) -> u32 { + let mut state = self.state(); + let guest_addr: u64 = match state.shm_regions() { + Some(regions) => regions.guest_addr.0, + None => 0, + }; + state.get_shm_field(0xffff_ffff, |s| (s.offset + guest_addr) as u32) + } + + fn get_shm_base_high(&self) -> u32 { + let mut state = self.state(); + let guest_addr: u64 = match 
state.shm_regions() { + Some(regions) => regions.guest_addr.0, + None => 0, + }; + state.get_shm_field(0xffff_ffff, |s| ((s.offset + guest_addr) >> 32) as u32) + } +} + +impl DeviceIo for MmioV2Device +where + AS: 'static + GuestAddressSpace + Send + Sync + Clone, + Q: 'static + QueueT + Send + Clone, + R: 'static + GuestMemoryRegion + Send + Sync, +{ + fn read(&self, _base: IoAddress, offset: IoAddress, data: &mut [u8]) { + let offset = offset.raw_value(); + + if offset >= MMIO_CFG_SPACE_OFF { + self.get_device_config(offset - MMIO_CFG_SPACE_OFF, data); + } else if data.len() == 4 { + let v = match offset { + REG_MMIO_MAGIC_VALUE => MMIO_MAGIC_VALUE, + REG_MMIO_VERSION => MMIO_VERSION_2, + REG_MMIO_DEVICE_ID => self.state().get_inner_device().device_type(), + REG_MMIO_VENDOR_ID => self.device_vendor, + REG_MMIO_DEVICE_FEATURE => self.device_features(), + REG_MMIO_QUEUE_NUM_MA => self.state().with_queue(0, |q| q.max_size() as u32), + REG_MMIO_QUEUE_READY => self.state().with_queue(0, |q| q.ready() as u32), + REG_MMIO_QUEUE_NOTIF if self.state().doorbell().is_some() => { + // Safe to unwrap() because we have determined the option is a Some value. 
+ self.state() + .doorbell() + .map(|doorbell| doorbell.register_data()) + .unwrap() + } + REG_MMIO_INTERRUPT_STAT => self.tweak_intr_flags(self.interrupt_status.read()), + REG_MMIO_STATUS => self.driver_status(), + REG_MMIO_SHM_LEN_LOW => self.state().get_shm_field(0xffff_ffff, |s| s.len as u32), + REG_MMIO_SHM_LEN_HIGH => self + .state() + .get_shm_field(0xffff_ffff, |s| (s.len >> 32) as u32), + REG_MMIO_SHM_BASE_LOW => self.get_shm_base_low(), + REG_MMIO_SHM_BASE_HIGH => self.get_shm_base_high(), + REG_MMIO_CONFIG_GENERATI => self.config_generation.load(Ordering::SeqCst), + _ => { + info!("unknown virtio mmio readl at 0x{:x}", offset); + return; + } + }; + LittleEndian::write_u32(data, v); + } else if data.len() == 2 { + let v = match offset { + REG_MMIO_MSI_CSR => { + if (self.device_vendor & DRAGONBALL_FEATURE_MSI_INTR) != 0 { + MMIO_MSI_CSR_SUPPORTED + } else { + 0 + } + } + _ => { + info!("unknown virtio mmio readw from 0x{:x}", offset); + return; + } + }; + LittleEndian::write_u16(data, v); + } else { + info!( + "unknown virtio mmio register read: 0x{:x}/0x{:x}", + offset, + data.len() + ); + } + } + + fn write(&self, _base: IoAddress, offset: IoAddress, data: &[u8]) { + let offset = offset.raw_value(); + // Write to the device configuration area. 
+ if (MMIO_CFG_SPACE_OFF..DRAGONBALL_MMIO_DOORBELL_OFFSET).contains(&offset) { + self.set_device_config(offset - MMIO_CFG_SPACE_OFF, data); + } else if data.len() == 4 { + let v = LittleEndian::read_u32(data); + match offset { + REG_MMIO_DEVICE_FEATURES_S => self.state().set_features_select(v), + REG_MMIO_DRIVER_FEATURE => self.set_acked_features(v), + REG_MMIO_DRIVER_FEATURES_S => self.state().set_acked_features_select(v), + REG_MMIO_QUEUE_SEL => self.state().set_queue_select(v), + REG_MMIO_QUEUE_NUM => self.update_queue_field(|q| q.set_size(v as u16)), + REG_MMIO_QUEUE_READY => self.update_queue_field(|q| q.set_ready(v == 1)), + REG_MMIO_INTERRUPT_AC => self.interrupt_status.clear_bits(v), + REG_MMIO_STATUS => self.update_driver_status(v), + REG_MMIO_QUEUE_DESC_LOW => { + self.update_queue_field(|q| q.set_desc_table_address(Some(v), None)) + } + REG_MMIO_QUEUE_DESC_HIGH => { + self.update_queue_field(|q| q.set_desc_table_address(None, Some(v))) + } + REG_MMIO_QUEUE_AVAIL_LOW => { + self.update_queue_field(|q| q.set_avail_ring_address(Some(v), None)) + } + REG_MMIO_QUEUE_AVAIL_HIGH => { + self.update_queue_field(|q| q.set_avail_ring_address(None, Some(v))) + } + REG_MMIO_QUEUE_USED_LOW => { + self.update_queue_field(|q| q.set_used_ring_address(Some(v), None)) + } + REG_MMIO_QUEUE_USED_HIGH => { + self.update_queue_field(|q| q.set_used_ring_address(None, Some(v))) + } + REG_MMIO_SHM_SEL => self.state().set_shm_region_id(v), + REG_MMIO_MSI_ADDRESS_L => self.state().set_msi_address_low(v), + REG_MMIO_MSI_ADDRESS_H => self.state().set_msi_address_high(v), + REG_MMIO_MSI_DATA => self.state().set_msi_data(v), + _ => info!("unknown virtio mmio writel to 0x{:x}", offset), + } + } else if data.len() == 2 { + let v = LittleEndian::read_u16(data); + match offset { + REG_MMIO_MSI_CSR => self.state().update_msi_enable(v, self), + REG_MMIO_MSI_COMMAND => self.state().handle_msi_cmd(v, self), + _ => { + info!("unknown virtio mmio writew to 0x{:x}", offset); + } + } + } else { + 
info!( + "unknown virtio mmio register write: 0x{:x}/0x{:x}", + offset, + data.len() + ); + } + } + + fn get_assigned_resources(&self) -> DeviceResources { + self.assigned_resources.clone() + } + + fn get_trapped_io_resources(&self) -> DeviceResources { + let mut resources = DeviceResources::new(); + + resources.append(self.mmio_cfg_res.clone()); + + resources + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::any::Any; + use std::sync::Mutex; + + use byteorder::{ByteOrder, LittleEndian}; + use dbs_device::resources::{MsiIrqType, Resource, ResourceConstraint}; + use dbs_device::{DeviceIo, IoAddress}; + use dbs_utils::epoll_manager::EpollManager; + use kvm_bindings::kvm_userspace_memory_region; + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::{ + GuestAddress, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, MemoryRegionAddress, + MmapRegion, + }; + + use super::*; + use crate::{ + ActivateResult, ConfigResult, Error, VirtioDeviceConfig, VirtioDeviceInfo, + VirtioSharedMemory, VirtioSharedMemoryList, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, + DEVICE_FEATURES_OK, + }; + + pub struct MmioDevice { + state: Mutex, + config: Mutex>>>, + ctrl_queue_size: u16, + } + + impl MmioDevice { + pub fn new(ctrl_queue_size: u16) -> Self { + let epoll_mgr = EpollManager::default(); + let state = VirtioDeviceInfo::new( + "dummy".to_string(), + 0xf, + Arc::new(vec![16u16, 32u16]), + vec![0xffu8; 256], + epoll_mgr, + ); + MmioDevice { + state: Mutex::new(state), + config: Mutex::new(None), + ctrl_queue_size, + } + } + } + + impl VirtioDevice, QueueSync, GuestRegionMmap> for MmioDevice { + fn device_type(&self) -> u32 { + 123 + } + + fn queue_max_sizes(&self) -> &[u16] { + &[16, 32] + } + + fn ctrl_queue_max_sizes(&self) -> u16 { + self.ctrl_queue_size + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + self.state.lock().unwrap().set_acked_features(page, value); + } + + fn read_config(&mut 
self, offset: u64, data: &mut [u8]) -> ConfigResult { + self.state.lock().unwrap().read_config(offset, data) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + self.state.lock().unwrap().write_config(offset, data) + } + + fn activate(&mut self, config: VirtioDeviceConfig>) -> ActivateResult { + self.config.lock().unwrap().replace(config); + Ok(()) + } + + fn reset(&mut self) -> ActivateResult { + Ok(()) + } + + fn set_resource( + &mut self, + vm_fd: Arc, + resource: DeviceResources, + ) -> Result>> { + let mmio_res = resource.get_mmio_address_ranges(); + let slot_res = resource.get_kvm_mem_slots(); + + if mmio_res.is_empty() || slot_res.is_empty() { + return Ok(None); + } + + let guest_addr = mmio_res[0].0; + let len = mmio_res[0].1; + + let mmap_region = GuestRegionMmap::new( + MmapRegion::new(len as usize).unwrap(), + GuestAddress(guest_addr), + ) + .unwrap(); + let host_addr: u64 = mmap_region + .get_host_address(MemoryRegionAddress(0)) + .unwrap() as u64; + let kvm_mem_region = kvm_userspace_memory_region { + slot: slot_res[0], + flags: 0, + guest_phys_addr: guest_addr, + memory_size: len, + userspace_addr: host_addr, + }; + unsafe { vm_fd.set_user_memory_region(kvm_mem_region).unwrap() }; + Ok(Some(VirtioSharedMemoryList { + host_addr, + guest_addr: GuestAddress(guest_addr), + len, + kvm_userspace_memory_region_flags: 0, + kvm_userspace_memory_region_slot: slot_res[0], + region_list: vec![VirtioSharedMemory { + offset: 0x40_0000, + len, + }], + mmap_region: Arc::new(mmap_region), + })) + } + + fn get_resource_requirements( + &self, + _requests: &mut Vec, + _use_generic_irq: bool, + ) { + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + } + + pub fn set_driver_status( + d: &mut MmioV2Device, QueueSync, GuestRegionMmap>, + status: u32, + ) { + let mut buf = vec![0; 4]; + LittleEndian::write_u32(&mut buf[..], status); + d.write(IoAddress(0), IoAddress(REG_MMIO_STATUS), 
&buf[..]); + } + + pub fn get_device_resource(have_msi_feature: bool, shared_memory: bool) -> DeviceResources { + let mut resources = DeviceResources::new(); + resources.append(Resource::MmioAddressRange { + base: 0, + size: MMIO_DEFAULT_CFG_SIZE + DRAGONBALL_MMIO_DOORBELL_SIZE, + }); + resources.append(Resource::LegacyIrq(5)); + if have_msi_feature { + resources.append(Resource::MsiIrq { + ty: MsiIrqType::GenericMsi, + base: 24, + size: 1, + }); + } + if shared_memory { + resources.append(Resource::MmioAddressRange { + base: 0x1_0000_0000, + size: 0x1000, + }); + + resources.append(Resource::KvmMemSlot(1)); + } + resources + } + + pub fn get_mmio_device_inner( + doorbell: bool, + ctrl_queue_size: u16, + resources: DeviceResources, + ) -> MmioV2Device, QueueSync, GuestRegionMmap> { + let device = MmioDevice::new(ctrl_queue_size); + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap()); + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + vm_fd.create_irq_chip().unwrap(); + let irq_manager = Arc::new(KvmIrqManager::new(vm_fd.clone())); + irq_manager.initialize().unwrap(); + + let features = if doorbell { + Some(DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY) + } else { + None + }; + + MmioV2Device::new( + vm_fd, + mem, + irq_manager, + Box::new(device), + resources, + features, + ) + .unwrap() + } + + pub fn get_mmio_device() -> MmioV2Device, QueueSync, GuestRegionMmap> { + let resources = get_device_resource(false, false); + get_mmio_device_inner(false, 0, resources) + } + + #[test] + fn test_virtio_mmio_v2_device_new() { + // test create error. 
+ let resources = DeviceResources::new(); + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap()); + let device = MmioDevice::new(0); + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + vm_fd.create_irq_chip().unwrap(); + let irq_manager = Arc::new(KvmIrqManager::new(vm_fd.clone())); + irq_manager.initialize().unwrap(); + let ret = MmioV2Device::new(vm_fd, mem, irq_manager, Box::new(device), resources, None); + assert!(matches!(ret, Err(Error::InvalidInput))); + + // test create without msi + let mut d = get_mmio_device(); + + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE); + assert_eq!(d.driver_status(), DEVICE_STATUS_ACKNOWLEDE); + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE | DEVICE_DRIVER); + assert_eq!(d.driver_status(), DEVICE_STATUS_DRIVER); + set_driver_status( + &mut d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK, + ); + assert_eq!(d.driver_status(), DEVICE_STATUS_FEATURE_OK); + + set_driver_status( + &mut d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK | DEVICE_STATUS_DRIVER_OK, + ); + assert_ne!(d.driver_status() & DEVICE_FAILED, 0); + + // test create with msi + let d_mmio_feature = get_mmio_device_inner(false, 0, get_device_resource(true, false)); + assert_ne!( + d_mmio_feature.device_vendor & DRAGONBALL_FEATURE_MSI_INTR, + 0 + ); + + // test create with doorbell features + let d_doorbell = get_mmio_device_inner(true, 0, get_device_resource(false, false)); + assert_ne!( + d_doorbell.device_vendor & DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY, + 0 + ); + + // test ctrl queue + let d_ctrl = get_mmio_device_inner(true, 1, get_device_resource(false, false)); + assert_eq!(d_ctrl.state().queues().len(), 3); + } + + #[test] + fn test_bus_device_read() { + let mut d = get_mmio_device(); + + let mut buf = vec![0xff, 0, 0xfe, 0]; + let buf_copy = buf.to_vec(); + + // The following read shouldn't be valid, because the length of the buf is not 4. 
+ buf.push(0); + d.read(IoAddress(0), IoAddress(0), &mut buf[..]); + assert_eq!(buf[..4], buf_copy[..]); + + // the length is ok again + buf.pop(); + + let mut dev_cfg = vec![0; 4]; + d.read( + IoAddress(0), + IoAddress(MMIO_CFG_SPACE_OFF), + &mut dev_cfg[..], + ); + assert_eq!(LittleEndian::read_u32(&dev_cfg[..]), 0x0); + + // Now we test that reading at various predefined offsets works as intended. + d.read(IoAddress(0), IoAddress(REG_MMIO_MAGIC_VALUE), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), MMIO_MAGIC_VALUE); + + d.read(IoAddress(0), IoAddress(REG_MMIO_VERSION), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), MMIO_VERSION_2); + + d.read(IoAddress(0), IoAddress(REG_MMIO_DEVICE_ID), &mut buf[..]); + assert_eq!( + LittleEndian::read_u32(&buf[..]), + d.state().get_inner_device().device_type() + ); + + d.read(IoAddress(0), IoAddress(REG_MMIO_VENDOR_ID), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), MMIO_VENDOR_ID_DRAGONBALL); + + d.state().set_features_select(0); + d.read( + IoAddress(0), + IoAddress(REG_MMIO_DEVICE_FEATURE), + &mut buf[..], + ); + assert_eq!( + LittleEndian::read_u32(&buf[..]), + d.state().get_inner_device().get_avail_features(0) + ); + + d.state().set_features_select(1); + d.read( + IoAddress(0), + IoAddress(REG_MMIO_DEVICE_FEATURE), + &mut buf[..], + ); + assert_eq!( + LittleEndian::read_u32(&buf[..]), + d.state().get_inner_device().get_avail_features(0) | 0x1 + ); + + d.read(IoAddress(0), IoAddress(REG_MMIO_QUEUE_NUM_MA), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), 16); + + d.read(IoAddress(0), IoAddress(REG_MMIO_QUEUE_READY), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), false as u32); + + d.read( + IoAddress(0), + IoAddress(REG_MMIO_INTERRUPT_STAT), + &mut buf[..], + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0); + + d.read(IoAddress(0), IoAddress(REG_MMIO_STATUS), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0); + + 
d.config_generation.store(5, Ordering::SeqCst); + d.read( + IoAddress(0), + IoAddress(REG_MMIO_CONFIG_GENERATI), + &mut buf[..], + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 5); + + // This read shouldn't do anything, as it's past the readable generic registers, and + // before the device specific configuration space. Btw, reads from the device specific + // conf space are going to be tested a bit later, alongside writes. + buf = buf_copy.to_vec(); + d.read(IoAddress(0), IoAddress(0xfd), &mut buf[..]); + assert_eq!(buf[..], buf_copy[..]); + + // Read from an invalid address in generic register range. + d.read(IoAddress(0), IoAddress(0xfb), &mut buf[..]); + assert_eq!(buf[..], buf_copy[..]); + + // Read from an invalid length in generic register range. + d.read(IoAddress(0), IoAddress(0xfc), &mut buf[..3]); + assert_eq!(buf[..], buf_copy[..]); + + // test for no msi_feature + let mut buf = vec![0; 2]; + d.read(IoAddress(0), IoAddress(REG_MMIO_MSI_CSR), &mut buf[..]); + assert_eq!(LittleEndian::read_u16(&buf[..]), 0); + + // test for msi_feature + d.device_vendor |= DRAGONBALL_FEATURE_MSI_INTR; + let mut buf = vec![0; 2]; + d.read(IoAddress(0), IoAddress(REG_MMIO_MSI_CSR), &mut buf[..]); + assert_eq!(LittleEndian::read_u16(&buf[..]), MMIO_MSI_CSR_SUPPORTED); + + let mut dev_cfg = vec![0; 4]; + assert_eq!( + d.exchange_driver_status(0, DEVICE_DRIVER | DEVICE_INIT) + .unwrap(), + 0 + ); + d.read( + IoAddress(0), + IoAddress(MMIO_CFG_SPACE_OFF), + &mut dev_cfg[..], + ); + assert_eq!(LittleEndian::read_u32(&dev_cfg[..]), 0xffffffff); + } + + #[test] + fn test_bus_device_write() { + let mut d = get_mmio_device(); + + let mut buf = vec![0; 5]; + LittleEndian::write_u32(&mut buf[..4], 1); + + // Nothing should happen, because the slice len > 4. 
+ d.state().set_features_select(0); + d.write( + IoAddress(0), + IoAddress(REG_MMIO_DEVICE_FEATURES_S), + &buf[..], + ); + assert_eq!(d.state().features_select(), 0); + + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE); + assert_eq!(d.driver_status(), DEVICE_STATUS_ACKNOWLEDE); + set_driver_status(&mut d, DEVICE_STATUS_DRIVER); + assert_eq!(d.driver_status(), DEVICE_STATUS_DRIVER); + + let mut buf = vec![0; 4]; + buf[0] = 0xa5; + d.write(IoAddress(0), IoAddress(MMIO_CFG_SPACE_OFF), &buf[..]); + buf[0] = 0; + d.read(IoAddress(0), IoAddress(MMIO_CFG_SPACE_OFF), &mut buf[..]); + assert_eq!(buf[0], 0xa5); + assert_eq!(buf[1], 0); + + // Acking features in invalid state shouldn't take effect. + d.state().set_acked_features_select(0x0); + LittleEndian::write_u32(&mut buf[..], 1); + d.write(IoAddress(0), IoAddress(REG_MMIO_DRIVER_FEATURE), &buf[..]); + // TODO: find a way to check acked features + + // now writes should work + d.state().set_features_select(0); + LittleEndian::write_u32(&mut buf[..], 1); + d.write( + IoAddress(0), + IoAddress(REG_MMIO_DEVICE_FEATURES_S), + &buf[..], + ); + assert_eq!(d.state().features_select(), 1); + + d.state().set_acked_features_select(0x123); + LittleEndian::write_u32(&mut buf[..], 1); + d.write(IoAddress(0), IoAddress(REG_MMIO_DRIVER_FEATURE), &buf[..]); + // TODO: find a way to check acked features + + d.state().set_acked_features_select(0); + LittleEndian::write_u32(&mut buf[..], 2); + d.write( + IoAddress(0), + IoAddress(REG_MMIO_DRIVER_FEATURES_S), + &buf[..], + ); + assert_eq!(d.state().acked_features_select(), 2); + + set_driver_status(&mut d, DEVICE_STATUS_FEATURE_OK); + assert_eq!(d.driver_status(), DEVICE_STATUS_FEATURE_OK); + + // Setup queues + d.state().set_queue_select(0); + LittleEndian::write_u32(&mut buf[..], 3); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_SEL), &buf[..]); + assert_eq!(d.state().queue_select(), 3); + + d.state().set_queue_select(0); + assert_eq!(d.state().queues()[0].queue.size(), 16); + 
LittleEndian::write_u32(&mut buf[..], 8); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_NUM), &buf[..]); + assert_eq!(d.state().queues()[0].queue.size(), 8); + + assert!(!d.state().queues()[0].queue.ready()); + LittleEndian::write_u32(&mut buf[..], 1); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_READY), &buf[..]); + assert!(d.state().queues()[0].queue.ready()); + + LittleEndian::write_u32(&mut buf[..], 0b111); + d.write(IoAddress(0), IoAddress(REG_MMIO_INTERRUPT_AC), &buf[..]); + + assert_eq!(d.state().queues_mut()[0].queue.lock().desc_table(), 0); + + // When write descriptor, descriptor table will judge like this: + // if desc_table.mask(0xf) != 0 { + // virtio queue descriptor table breaks alignment constraints + // return + // desc_table is the data that will be written. + LittleEndian::write_u32(&mut buf[..], 0x120); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_DESC_LOW), &buf[..]); + assert_eq!(d.state().queues_mut()[0].queue.lock().desc_table(), 0x120); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_DESC_HIGH), &buf[..]); + assert_eq!( + d.state().queues_mut()[0].queue.lock().desc_table(), + 0x120 + (0x120 << 32) + ); + + assert_eq!(d.state().queues_mut()[0].queue.lock().avail_ring(), 0); + LittleEndian::write_u32(&mut buf[..], 124); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_AVAIL_LOW), &buf[..]); + assert_eq!(d.state().queues_mut()[0].queue.lock().avail_ring(), 124); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_AVAIL_HIGH), &buf[..]); + assert_eq!( + d.state().queues_mut()[0].queue.lock().avail_ring(), + 124 + (124 << 32) + ); + + assert_eq!(d.state().queues_mut()[0].queue.lock().used_ring(), 0); + LittleEndian::write_u32(&mut buf[..], 128); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_USED_LOW), &buf[..]); + assert_eq!(d.state().queues_mut()[0].queue.lock().used_ring(), 128); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_USED_HIGH), &buf[..]); + assert_eq!( + d.state().queues_mut()[0].queue.lock().used_ring(), + 128 + (128 
<< 32) + ); + + // Write to an invalid address in generic register range. + LittleEndian::write_u32(&mut buf[..], 0xf); + d.config_generation.store(0, Ordering::SeqCst); + d.write(IoAddress(0), IoAddress(0xfb), &buf[..]); + assert_eq!(d.config_generation.load(Ordering::SeqCst), 0); + + // Write to an invalid length in generic register range. + d.write(IoAddress(0), IoAddress(REG_MMIO_CONFIG_GENERATI), &buf[..2]); + assert_eq!(d.config_generation.load(Ordering::SeqCst), 0); + } + + #[test] + fn test_bus_device_activate() { + // invalid state transition should failed + let mut d = get_mmio_device(); + + assert!(!d.state().check_queues_valid()); + assert!(!d.state().device_activated()); + assert_eq!(d.driver_status(), DEVICE_INIT); + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE); + assert_eq!(d.driver_status(), DEVICE_ACKNOWLEDGE); + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE | DEVICE_DRIVER); + assert_eq!(d.driver_status(), DEVICE_ACKNOWLEDGE | DEVICE_DRIVER); + // Invalid state set + set_driver_status( + &mut d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK, + ); + assert_eq!( + d.driver_status(), + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FAILED + ); + + // valid state transition + let mut d = get_mmio_device(); + + assert!(!d.state().check_queues_valid()); + assert!(!d.state().device_activated()); + assert_eq!(d.driver_status(), DEVICE_INIT); + + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE); + assert_eq!(d.driver_status(), DEVICE_ACKNOWLEDGE); + set_driver_status(&mut d, DEVICE_ACKNOWLEDGE | DEVICE_DRIVER); + assert_eq!(d.driver_status(), DEVICE_ACKNOWLEDGE | DEVICE_DRIVER); + + set_driver_status( + &mut d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK, + ); + assert_eq!( + d.driver_status(), + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK + ); + + let mut buf = vec![0; 4]; + let size = d.state().queues().len(); + for q in 0..size { + d.state().set_queue_select(q as u32); + LittleEndian::write_u32(&mut buf[..], 16); + 
d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_NUM), &buf[..]); + LittleEndian::write_u32(&mut buf[..], 1); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_READY), &buf[..]); + } + assert!(d.state().check_queues_valid()); + assert!(!d.state().device_activated()); + + // Device should be ready for activation now. + + // A couple of invalid writes; will trigger warnings; shouldn't activate the device. + d.write(IoAddress(0), IoAddress(0xa8), &buf[..]); + assert!(!d.state().device_activated()); + + set_driver_status( + &mut d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK | DEVICE_DRIVER_OK, + ); + assert_eq!( + d.driver_status(), + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK | DEVICE_DRIVER_OK + ); + assert!(d.state().device_activated()); + + // activate again + set_driver_status( + &mut d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK | DEVICE_DRIVER_OK, + ); + assert!(d.state().device_activated()); + + // A write which changes the size of a queue after activation; currently only triggers + // a warning path and have no effect on queue state. 
+ LittleEndian::write_u32(&mut buf[..], 0); + d.state().set_queue_select(0); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_READY), &buf[..]); + d.read(IoAddress(0), IoAddress(REG_MMIO_QUEUE_READY), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), 1); + } + + fn activate_device(d: &mut MmioV2Device, QueueSync, GuestRegionMmap>) { + set_driver_status(d, DEVICE_ACKNOWLEDGE); + set_driver_status(d, DEVICE_ACKNOWLEDGE | DEVICE_DRIVER); + set_driver_status(d, DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK); + + // Setup queue data structures + let mut buf = vec![0; 4]; + let size = d.state().queues().len(); + for q in 0..size { + d.state().set_queue_select(q as u32); + LittleEndian::write_u32(&mut buf[..], 16); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_NUM), &buf[..]); + LittleEndian::write_u32(&mut buf[..], 1); + d.write(IoAddress(0), IoAddress(REG_MMIO_QUEUE_READY), &buf[..]); + } + assert!(d.state().check_queues_valid()); + assert!(!d.state().device_activated()); + + // Device should be ready for activation now. 
+ set_driver_status( + d, + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK | DEVICE_DRIVER_OK, + ); + assert_eq!( + d.driver_status(), + DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_FEATURES_OK | DEVICE_DRIVER_OK + ); + assert!(d.state().device_activated()); + } + + #[test] + fn test_bus_device_reset() { + let resources = get_device_resource(false, false); + let mut d = get_mmio_device_inner(true, 0, resources); + let mut buf = vec![0; 4]; + + assert!(!d.state().check_queues_valid()); + assert!(!d.state().device_activated()); + assert_eq!(d.driver_status(), 0); + activate_device(&mut d); + + // Marking device as FAILED should not affect device_activated state + LittleEndian::write_u32(&mut buf[..], 0x8f); + d.write(IoAddress(0), IoAddress(REG_MMIO_STATUS), &buf[..]); + assert_eq!(d.driver_status(), 0x8f); + assert!(d.state().device_activated()); + + // Nothing happens when backend driver doesn't support reset + LittleEndian::write_u32(&mut buf[..], 0x0); + d.write(IoAddress(0), IoAddress(REG_MMIO_STATUS), &buf[..]); + assert_eq!(d.driver_status(), 0x8f); + assert!(!d.state().device_activated()); + + // test for reactivate device + // but device don't support reactivate now + d.state().deactivate(); + assert!(!d.state().device_activated()); + } + + #[test] + fn test_mmiov2_device_resources() { + let d = get_mmio_device(); + + let resources = d.get_assigned_resources(); + assert_eq!(resources.len(), 2); + let resources = d.get_trapped_io_resources(); + assert_eq!(resources.len(), 1); + let mmio_cfg_res = resources.get_mmio_address_ranges(); + assert_eq!(mmio_cfg_res.len(), 1); + assert_eq!( + mmio_cfg_res[0].1, + MMIO_DEFAULT_CFG_SIZE + DRAGONBALL_MMIO_DOORBELL_SIZE + ); + } + + #[test] + fn test_mmio_v2_device_msi() { + let resources = get_device_resource(true, false); + let mut d = get_mmio_device_inner(true, 0, resources); + + let mut buf = vec![0; 4]; + LittleEndian::write_u32(&mut buf[..], 0x1234); + d.write(IoAddress(0), 
IoAddress(REG_MMIO_MSI_ADDRESS_L), &buf[..]); + LittleEndian::write_u32(&mut buf[..], 0x5678); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_ADDRESS_H), &buf[..]); + LittleEndian::write_u32(&mut buf[..], 0x11111111); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_DATA), &buf[..]); + + // Enable msi + LittleEndian::write_u16(&mut buf[..], MMIO_MSI_CSR_ENABLED); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_CSR), &buf[..2]); + + // Activate the device, it will enable interrupts. + activate_device(&mut d); + + // update msi index + LittleEndian::write_u16(&mut buf[..], MMIO_MSI_CMD_CODE_UPDATE); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_COMMAND), &buf[..2]); + + // update msi int mask + LittleEndian::write_u16(&mut buf[..], MMIO_MSI_CMD_CODE_INT_MASK); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_COMMAND), &buf[..2]); + + // update msi int unmask + LittleEndian::write_u16(&mut buf[..], MMIO_MSI_CMD_CODE_INT_UNMASK); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_COMMAND), &buf[..2]); + + // unknown msi command + LittleEndian::write_u16(&mut buf[..], 0x4000); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_COMMAND), &buf[..2]); + assert_ne!(d.driver_status() & DEVICE_FAILED, 0); + + // Disable msi + LittleEndian::write_u16(&mut buf[..], 0); + d.write(IoAddress(0), IoAddress(REG_MMIO_MSI_CSR), &buf[..2]); + } + + #[test] + fn test_mmio_shared_memory() { + let resources = get_device_resource(true, true); + let d = get_mmio_device_inner(true, 0, resources); + + let mut buf = vec![0; 4]; + + // shm select 0 + d.write(IoAddress(0), IoAddress(REG_MMIO_SHM_SEL), &buf[..]); + + d.read(IoAddress(0), IoAddress(REG_MMIO_SHM_LEN_LOW), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0x1000); + + d.read(IoAddress(0), IoAddress(REG_MMIO_SHM_LEN_HIGH), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0x0); + + d.read(IoAddress(0), IoAddress(REG_MMIO_SHM_BASE_LOW), &mut buf[..]); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0x40_0000); + + d.read( + 
IoAddress(0), + IoAddress(REG_MMIO_SHM_BASE_HIGH), + &mut buf[..], + ); + assert_eq!(LittleEndian::read_u32(&buf[..]), 0x1); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/mmio/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/mmio/mod.rs new file mode 100644 index 000000000..d22082411 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/mmio/mod.rs @@ -0,0 +1,137 @@ +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// +// Portions Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. + +//! Implementations of the Virtio MMIO Transport Layer. +//! +//! The Virtio specifications have defined two versions for the Virtio MMIO transport layer. The +//! version 1 is called legacy mode, and the version 2 is preferred currently. The common parts +//! of both versions are defined here. + +mod mmio_state; +pub use self::mmio_state::*; + +mod mmio_v2; +pub use self::mmio_v2::*; + +mod dragonball; +pub use self::dragonball::*; + +/// Magic number for MMIO virtio devices. +/// Required by the virtio mmio device register layout at offset 0 from base +pub const MMIO_MAGIC_VALUE: u32 = 0x74726976; + +/// Version number for legacy MMIO virito devices. +pub const MMIO_VERSION_1: u32 = 1; + +/// Current version specified by the mmio standard. +pub const MMIO_VERSION_2: u32 = 2; + +/// Offset from the base MMIO address of a virtio device used by the guest to notify the device of +/// queue events. +pub const MMIO_NOTIFY_REG_OFFSET: u32 = 0x50; + +/// Default size for MMIO device configuration address space. 
+/// +/// This represents the size of the mmio device specified to the kernel as a cmdline option +/// It has to be larger than 0x100 (the offset where the configuration space starts from +/// the beginning of the memory mapped device registers) + the size of the configuration space +/// Currently hardcoded to 4K +pub const MMIO_DEFAULT_CFG_SIZE: u64 = 0x1000; + +/// +/// Control registers + +// Magic value ("virt" string) - Read Only +pub const REG_MMIO_MAGIC_VALUE: u64 = 0x000; + +// Virtio device version - Read Only +pub const REG_MMIO_VERSION: u64 = 0x004; + +// Virtio device ID - Read Only +pub const REG_MMIO_DEVICE_ID: u64 = 0x008; + +// Virtio vendor ID - Read Only +pub const REG_MMIO_VENDOR_ID: u64 = 0x00c; + +// Bitmask of the features supported by the device (host) +// (32 bits per set) - Read Only +pub const REG_MMIO_DEVICE_FEATURE: u64 = 0x010; + +// Device (host) features set selector - Write Only +pub const REG_MMIO_DEVICE_FEATURES_S: u64 = 0x014; + +// Bitmask of features activated by the driver (guest) +// (32 bits per set) - Write Only +pub const REG_MMIO_DRIVER_FEATURE: u64 = 0x020; + +// Activated features set selector - Write Only */ +pub const REG_MMIO_DRIVER_FEATURES_S: u64 = 0x024; + +// Guest's memory page size in bytes - Write Only +pub const REG_MMIO_GUEST_PAGE_SIZ: u64 = 0x028; + +// Queue selector - Write Only +pub const REG_MMIO_QUEUE_SEL: u64 = 0x030; + +// Maximum size of the currently selected queue - Read Only +pub const REG_MMIO_QUEUE_NUM_MA: u64 = 0x034; + +// Queue size for the currently selected queue - Write Only +pub const REG_MMIO_QUEUE_NUM: u64 = 0x038; + +// Used Ring alignment for the currently selected queue - Write Only +pub const REG_MMIO_QUEUE_ALIGN: u64 = 0x03c; + +// Guest's PFN for the currently selected queue - Read Write +pub const REG_MMIO_QUEUE_PFN: u64 = 0x040; + +// Ready bit for the currently selected queue - Read Write +pub const REG_MMIO_QUEUE_READY: u64 = 0x044; + +// Queue notifier - Write Only +pub const 
REG_MMIO_QUEUE_NOTIF: u64 = 0x050; + +// Interrupt status - Read Only +pub const REG_MMIO_INTERRUPT_STAT: u64 = 0x060; + +// Interrupt acknowledge - Write Only +pub const REG_MMIO_INTERRUPT_AC: u64 = 0x064; + +// Device status register - Read Write +pub const REG_MMIO_STATUS: u64 = 0x070; + +// Selected queue's Descriptor Table address, 64 bits in two halves +pub const REG_MMIO_QUEUE_DESC_LOW: u64 = 0x080; +pub const REG_MMIO_QUEUE_DESC_HIGH: u64 = 0x084; + +// Selected queue's Available Ring address, 64 bits in two halves +pub const REG_MMIO_QUEUE_AVAIL_LOW: u64 = 0x090; +pub const REG_MMIO_QUEUE_AVAIL_HIGH: u64 = 0x094; + +// Selected queue's Used Ring address, 64 bits in two halves +pub const REG_MMIO_QUEUE_USED_LOW: u64 = 0x0a0; +pub const REG_MMIO_QUEUE_USED_HIGH: u64 = 0x0a4; + +// Shared memory region id +pub const REG_MMIO_SHM_SEL: u64 = 0x0ac; + +// Shared memory region length, 64 bits in two halves +pub const REG_MMIO_SHM_LEN_LOW: u64 = 0x0b0; +pub const REG_MMIO_SHM_LEN_HIGH: u64 = 0x0b4; + +// Shared memory region base address, 64 bits in two halves +pub const REG_MMIO_SHM_BASE_LOW: u64 = 0x0b8; +pub const REG_MMIO_SHM_BASE_HIGH: u64 = 0x0bc; + +// Configuration atomicity value +pub const REG_MMIO_CONFIG_GENERATI: u64 = 0x0fc; + +// The config space is defined by each driver +// the per-driver configuration space - Read Write +pub const REG_MMIO_CONFIG: u64 = 0x100; diff --git a/src/dragonball/src/dbs_virtio_devices/src/net.rs b/src/dragonball/src/dbs_virtio_devices/src/net.rs new file mode 100644 index 000000000..bbae070c3 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/net.rs @@ -0,0 +1,1448 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::any::Any; +use std::cmp; +use std::io::{self, Read, Write}; +use std::marker::PhantomData; +use std::mem; +use std::ops::Deref; +use std::os::unix::io::AsRawFd; +use std::sync::{mpsc, Arc}; + +use dbs_device::resources::ResourceConstraint; +use dbs_utils::epoll_manager::{ + EpollManager, EventOps, EventSet, Events, MutEventSubscriber, SubscriberId, +}; +use dbs_utils::metric::{IncMetric, SharedIncMetric}; +use dbs_utils::net::{net_gen, MacAddr, Tap, MAC_ADDR_LEN}; +use dbs_utils::rate_limiter::{BucketUpdate, RateLimiter, TokenType}; +use libc; +use log::{debug, error, info, trace, warn}; +use serde::Serialize; +use virtio_bindings::bindings::virtio_net::*; +use virtio_queue::{QueueOwnedT, QueueSync, QueueT}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryRegion, GuestRegionMmap}; +use vmm_sys_util::eventfd::EventFd; + +use crate::device::{VirtioDeviceConfig, VirtioDeviceInfo}; +use crate::{ + ActivateError, ActivateResult, ConfigResult, DbsGuestAddressSpace, Error, Result, VirtioDevice, + VirtioQueueConfig, TYPE_NET, +}; + +const NET_DRIVER_NAME: &str = "virtio-net"; + +/// The maximum buffer size when segmentation offload is enabled. This +/// includes the 12-byte virtio net header. +/// http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html#x1-1740003 +const MAX_BUFFER_SIZE: usize = 65562; + +// A frame is available for reading from the tap device to receive in the guest. +const RX_TAP_EVENT: u32 = 0; +// The guest has made a buffer available to receive a frame into. +const RX_QUEUE_EVENT: u32 = 1; +// The transmit queue has a frame that is ready to send from the guest. +const TX_QUEUE_EVENT: u32 = 2; +// rx rate limiter budget is now available. +const RX_RATE_LIMITER_EVENT: u32 = 3; +// tx rate limiter budget is now available. +const TX_RATE_LIMITER_EVENT: u32 = 4; +// patch request of rate limiters has arrived +const PATCH_RATE_LIMITER_EVENT: u32 = 5; +// Number of DeviceEventT events supported by this implementation. 
+pub const NET_EVENTS_COUNT: u32 = 6; + +/// Error for virtio-net devices to handle requests from guests. +#[derive(Debug, thiserror::Error)] +pub enum NetError { + /// Open tap device failed. + #[error("open tap device failed: {0}")] + TapOpen(#[source] dbs_utils::net::TapError), + /// Setting tap interface offload flags failed. + #[error("set tap device vnet header size failed: {0}")] + TapSetOffload(#[source] dbs_utils::net::TapError), + /// Setting vnet header size failed. + #[error("set tap device vnet header size failed: {0}")] + TapSetVnetHdrSize(#[source] dbs_utils::net::TapError), +} + +/// Metrics specific to the net device. +#[derive(Default, Serialize)] +pub struct NetDeviceMetrics { + /// Number of times when handling events on a network device. + pub event_count: SharedIncMetric, + /// Number of times when activate failed on a network device. + pub activate_fails: SharedIncMetric, + /// Number of times when interacting with the space config of a network device failed. + pub cfg_fails: SharedIncMetric, + /// Number of times when handling events on a network device failed. + pub event_fails: SharedIncMetric, + /// Number of events associated with the receiving queue. + pub rx_queue_event_count: SharedIncMetric, + /// Number of events associated with the rate limiter installed on the receiving path. + pub rx_event_rate_limiter_count: SharedIncMetric, + /// Number of events received on the associated tap. + pub rx_tap_event_count: SharedIncMetric, + /// Number of bytes received. + pub rx_bytes_count: SharedIncMetric, + /// Number of packets received. + pub rx_packets_count: SharedIncMetric, + /// Number of errors while receiving data. + pub rx_fails: SharedIncMetric, + /// Number of transmitted bytes. + pub tx_bytes_count: SharedIncMetric, + /// Number of errors while transmitting data. + pub tx_fails: SharedIncMetric, + /// Number of transmitted packets. 
+ pub tx_packets_count: SharedIncMetric, + /// Number of events associated with the transmitting queue. + pub tx_queue_event_count: SharedIncMetric, + /// Number of events associated with the rate limiter installed on the transmitting path. + pub tx_rate_limiter_event_count: SharedIncMetric, +} + +struct TxVirtio { + queue: VirtioQueueConfig, + rate_limiter: RateLimiter, + iovec: Vec<(GuestAddress, usize)>, + used_desc_heads: Vec, + frame_buf: [u8; MAX_BUFFER_SIZE], +} + +impl TxVirtio { + fn new(queue: VirtioQueueConfig, rate_limiter: RateLimiter) -> Self { + let tx_queue_max_size = queue.max_size() as usize; + + TxVirtio { + queue, + rate_limiter, + iovec: Vec::with_capacity(tx_queue_max_size), + used_desc_heads: vec![0u16; tx_queue_max_size], + frame_buf: [0u8; MAX_BUFFER_SIZE], + } + } +} + +struct RxVirtio { + queue: VirtioQueueConfig, + rate_limiter: RateLimiter, + deferred_frame: bool, + deferred_irqs: bool, + bytes_read: usize, + frame_buf: [u8; MAX_BUFFER_SIZE], +} + +impl RxVirtio { + fn new(queue: VirtioQueueConfig, rate_limiter: RateLimiter) -> Self { + RxVirtio { + queue, + rate_limiter, + deferred_frame: false, + deferred_irqs: false, + bytes_read: 0, + frame_buf: [0u8; MAX_BUFFER_SIZE], + } + } +} + +fn vnet_hdr_len() -> usize { + mem::size_of::() +} + +#[allow(dead_code)] +pub(crate) struct NetEpollHandler< + AS: GuestAddressSpace, + Q: QueueT + Send = QueueSync, + R: GuestMemoryRegion = GuestRegionMmap, +> { + tap: Tap, + rx: RxVirtio, + tx: TxVirtio, + config: VirtioDeviceConfig, + id: String, + patch_rate_limiter_fd: EventFd, + receiver: Option>, + metrics: Arc, +} + +impl NetEpollHandler { + // Attempts to copy a single frame into the guest if there is enough rate limiting budget. + // Returns true on successful frame delivery. + fn rate_limited_rx_single_frame(&mut self, mem: &AS::M) -> bool { + // If limiter.consume() fails it means there is no more TokenType::Ops + // budget and rate limiting is in effect. 
+ if !self.rx.rate_limiter.consume(1, TokenType::Ops) { + return false; + } + // If limiter.consume() fails it means there is no more TokenType::Bytes + // budget and rate limiting is in effect. + if !self + .rx + .rate_limiter + .consume(self.rx.bytes_read as u64, TokenType::Bytes) + { + // revert the OPS consume() + self.rx.rate_limiter.manual_replenish(1, TokenType::Ops); + return false; + } + + // Attempt frame delivery. + let success = self.rx_single_frame(mem); + + // Undo the tokens consumption if guest delivery failed. + if !success { + self.rx.rate_limiter.manual_replenish(1, TokenType::Ops); + self.rx + .rate_limiter + .manual_replenish(self.rx.bytes_read as u64, TokenType::Bytes); + } + + success + } + + // Copies a single frame from `self.rx.frame_buf` into the guest. + // + // Returns true if a buffer was used, and false if the frame must be deferred until a buffer + // is made available by the driver. + fn rx_single_frame(&mut self, mem: &AS::M) -> bool { + let mut next_desc; + let mut desc_chain; + let mut write_count = 0; + + { + let queue = &mut self.rx.queue.queue_mut().lock(); + let mut iter = match queue.iter(mem) { + Err(e) => { + error!("{}: failed to process queue. {}", self.id, e); + return false; + } + Ok(iter) => iter, + }; + desc_chain = match iter.next() { + Some(v) => v, + None => return false, + }; + next_desc = desc_chain.next(); + + // Copy from frame into buffer, which may span multiple descriptors. 
+ loop { + match next_desc { + Some(desc) => { + if !desc.is_write_only() { + self.metrics.rx_fails.inc(); + debug!("{}: receiving buffer is not write-only", self.id); + break; + } + + let limit = cmp::min(write_count + desc.len() as usize, self.rx.bytes_read); + let source_slice = &self.rx.frame_buf[write_count..limit]; + match mem.write(source_slice, desc.addr()) { + Ok(sz) => write_count += sz, + Err(e) => { + self.metrics.rx_fails.inc(); + debug!("{}: failed to write guest memory slice, {:?}", self.id, e); + break; + } + }; + + if write_count >= self.rx.bytes_read { + break; + } + next_desc = desc_chain.next(); + } + None => { + self.metrics.rx_fails.inc(); + debug!("{}: receiving buffer is too small", self.id); + break; + } + } + } + } + self.rx + .queue + .add_used(mem, desc_chain.head_index(), write_count as u32); + + // Mark that we have at least one pending packet and we need to interrupt the guest. + self.rx.deferred_irqs = true; + + // Current descriptor chain is too small, need a bigger one. + if write_count < self.rx.bytes_read { + return false; + } + + self.metrics.rx_bytes_count.add(write_count); + self.metrics.rx_packets_count.inc(); + true + } + + // Sends frame to the host TAP. + // + // `frame_buf` should contain the frame bytes in a slice of exact length. + // Returns whether MMDS consumed the frame. + fn write_to_tap(frame_buf: &[u8], tap: &mut Tap, metrics: &Arc) { + match tap.write(frame_buf) { + Ok(_) => { + metrics.tx_bytes_count.add(frame_buf.len()); + metrics.tx_packets_count.inc(); + } + Err(e) => { + metrics.tx_fails.inc(); + error!("{}: failed to write to tap, {:?}", NET_DRIVER_NAME, e); + } + } + } + + // Read from regular network packets. + fn read_from_tap(&mut self) -> io::Result { + self.tap.read(&mut self.rx.frame_buf) + } + + fn process_rx(&mut self, mem: &AS::M) -> Result<()> { + // Read as many frames as possible. 
+ loop { + match self.read_from_tap() { + Ok(count) => { + self.rx.bytes_read = count; + if !self.rate_limited_rx_single_frame(mem) { + self.rx.deferred_frame = true; + break; + } + } + Err(e) => { + // The tap device is non-blocking, so any error aside from EAGAIN is unexpected. + match e.raw_os_error() { + Some(err) if err == libc::EAGAIN => (), + _ => { + self.metrics.rx_fails.inc(); + error!("{}: failed to read tap: {:?}", self.id, e); + return Err(e.into()); + } + }; + break; + } + } + } + + if self.rx.deferred_irqs { + self.rx.deferred_irqs = false; + self.rx.queue.notify() + } else { + Ok(()) + } + } + + fn resume_rx(&mut self, mem: &AS::M) -> Result<()> { + if self.rx.deferred_frame { + if self.rate_limited_rx_single_frame(mem) { + self.rx.deferred_frame = false; + // process_rx() was interrupted possibly before consuming all + // packets in the tap; try continuing now. + self.process_rx(mem) + } else if self.rx.deferred_irqs { + self.rx.deferred_irqs = false; + self.rx.queue.notify() + } else { + Ok(()) + } + } else { + Ok(()) + } + } + + fn process_tx(&mut self, mem: &AS::M) -> Result<()> { + let mut rate_limited = false; + let mut used_count = 0; + { + let queue = &mut self.tx.queue.queue_mut().lock(); + + let mut iter = match queue.iter(mem) { + Err(e) => { + return Err(Error::VirtioQueueError(e)); + } + Ok(iter) => iter, + }; + + for desc_chain in &mut iter { + // If limiter.consume() fails it means there is no more TokenType::Ops + // budget and rate limiting is in effect. + if !self.tx.rate_limiter.consume(1, TokenType::Ops) { + rate_limited = true; + // Stop processing the queue. 
+ + break; + } + + let mut read_count = 0; + let header_index = desc_chain.head_index(); + self.tx.iovec.clear(); + + for desc in desc_chain { + if desc.is_write_only() { + break; + } + self.tx.iovec.push((desc.addr(), desc.len() as usize)); + read_count += desc.len() as usize; + } + + // If limiter.consume() fails it means there is no more TokenType::Bytes + // budget and rate limiting is in effect. + if !self + .tx + .rate_limiter + .consume(read_count as u64, TokenType::Bytes) + { + rate_limited = true; + // revert the OPS consume() + self.tx.rate_limiter.manual_replenish(1, TokenType::Ops); + // stop processing the queue + break; + } + + read_count = 0; + // Copy buffer from across multiple descriptors. + // TODO(performance - Issue #420): change this to use `writev()` instead of `write()` + // and get rid of the intermediate buffer. + for (desc_addr, desc_len) in self.tx.iovec.drain(..) { + let limit = cmp::min(read_count + desc_len, self.tx.frame_buf.len()); + + let read_result = + mem.read(&mut self.tx.frame_buf[read_count..limit], desc_addr); + match read_result { + Ok(sz) => read_count += sz, + Err(e) => { + self.metrics.tx_fails.inc(); + error!("{}: failed to read slice: {:?}", self.id, e); + break; + } + } + } + + Self::write_to_tap( + &self.tx.frame_buf[..read_count], + &mut self.tap, + &self.metrics, + ); + + self.tx.used_desc_heads[used_count] = header_index; + used_count += 1; + } + if rate_limited { + // If rate limiting kicked in, queue had advanced one element that we aborted + // processing; go back one element so it can be processed next time. + iter.go_to_previous_position(); + } + } + if used_count != 0 { + // TODO(performance - Issue #425): find a way around RUST mutability enforcements to + // allow calling queue.add_used() inside the loop. This would lead to better distribution + // of descriptor usage between the dragonball thread and the guest tx thread. + // One option to do this is to call queue.add_used() from a static function. 
+ for &desc_index in &self.tx.used_desc_heads[..used_count] { + self.tx.queue.add_used(mem, desc_index, 0); + } + + if let Err(e) = self.tx.queue.notify() { + error!("{}: failed to send tx interrupt to guest, {:?}", self.id, e); + } + } + Ok(()) + } + + pub fn get_patch_rate_limiters( + &mut self, + rx_bytes: BucketUpdate, + rx_ops: BucketUpdate, + tx_bytes: BucketUpdate, + tx_ops: BucketUpdate, + ) { + self.rx.rate_limiter.update_buckets(rx_bytes, rx_ops); + self.tx.rate_limiter.update_buckets(tx_bytes, tx_ops); + info!("{}: Update rate limiters", self.id); + } +} + +impl MutEventSubscriber + for NetEpollHandler +{ + fn process(&mut self, events: Events, _ops: &mut EventOps) { + let guard = self.config.lock_guest_memory(); + let mem = guard.deref(); + self.metrics.event_count.inc(); + match events.data() { + RX_QUEUE_EVENT => { + self.metrics.rx_queue_event_count.inc(); + if let Err(e) = self.rx.queue.consume_event() { + self.metrics.event_fails.inc(); + error!("{}: failed to get rx queue event, {:?}", self.id, e); + } else if !self.rx.rate_limiter.is_blocked() { + // If the limiter is not blocked, resume the receiving of bytes. + // There should be a buffer available now to receive the frame into. + if let Err(e) = self.resume_rx(mem) { + self.metrics.event_fails.inc(); + error!("{}: failed to resume rx_queue event, {:?}", self.id, e); + } + } + } + RX_TAP_EVENT => { + self.metrics.rx_tap_event_count.inc(); + + // While limiter is blocked, don't process any more incoming. + if self.rx.rate_limiter.is_blocked() { + // TODO: this may cause busy loop when rate limiting. + // Process a deferred frame first if available. Don't read from tap again + // until we manage to receive this deferred frame. + } else if self.rx.deferred_frame { + if self.rate_limited_rx_single_frame(mem) { + self.rx.deferred_frame = false; + // Process more packats from the tap device. 
+ if let Err(e) = self.process_rx(mem) { + self.metrics.event_fails.inc(); + error!("{}: failed to process rx queue, {:?}", self.id, e); + } + } else if self.rx.deferred_irqs { + self.rx.deferred_irqs = false; + if let Err(e) = self.rx.queue.notify() { + error!("{}: failed to send rx interrupt to guest, {:?}", self.id, e); + } + } + } else if let Err(e) = self.process_rx(mem) { + error!("{}: failed to process rx queue, {:?}", self.id, e); + } + } + TX_QUEUE_EVENT => { + self.metrics.tx_queue_event_count.inc(); + if let Err(e) = self.tx.queue.consume_event() { + self.metrics.event_fails.inc(); + error!("{}: failed to get tx queue event: {:?}", self.id, e); + // If the limiter is not blocked, continue transmitting bytes. + } else if !self.tx.rate_limiter.is_blocked() { + if let Err(e) = self.process_tx(mem) { + self.metrics.event_fails.inc(); + error!("{}: failed to process tx queue, {:?}", self.id, e); + } + } + } + RX_RATE_LIMITER_EVENT => { + // Upon rate limiter event, call the rate limiter handler and restart processing + // the rx queue. + self.metrics.rx_event_rate_limiter_count.inc(); + match self.rx.rate_limiter.event_handler() { + // There might be enough budget now to receive the frame. + Ok(_) => { + if let Err(e) = self.resume_rx(mem) { + self.metrics.event_fails.inc(); + error!("{}: failed to resume rx, {:?}", self.id, e); + } + } + Err(e) => { + self.metrics.event_fails.inc(); + error!("{}: failed to get rx rate-limiter event: {:?}", self.id, e); + } + } + } + TX_RATE_LIMITER_EVENT => { + // Upon rate limiter event, call the rate limiter handler and restart processing + // the tx queue. + self.metrics.tx_rate_limiter_event_count.inc(); + match self.tx.rate_limiter.event_handler() { + // There might be enough budget now to send the frame. 
+ Ok(_) => { + if let Err(e) = self.process_tx(mem) { + self.metrics.event_fails.inc(); + error!("{}: failed to resume tx, {:?}", self.id, e); + } + } + Err(e) => { + self.metrics.event_fails.inc(); + error!("{}: failed to get tx rate-limiter event, {:?}", self.id, e); + } + } + } + PATCH_RATE_LIMITER_EVENT => { + if let Some(receiver) = &self.receiver { + if let Ok((rx_bytes, rx_ops, tx_bytes, tx_ops)) = receiver.try_recv() { + self.get_patch_rate_limiters(rx_bytes, rx_ops, tx_bytes, tx_ops); + if let Err(e) = self.patch_rate_limiter_fd.read() { + error!("{}: failed to get patch event, {:?}", self.id, e); + } + } + } + } + _ => error!("{}: unknown epoll event slot {}", self.id, events.data()), + } + } + + fn init(&mut self, ops: &mut EventOps) { + trace!(target: "virtio-net", "{}: NetEpollHandler::init()", self.id); + + let events = Events::with_data(&self.tap, RX_TAP_EVENT, EventSet::IN); + if let Err(e) = ops.add(events) { + error!("{}: failed to register TAP RX event, {:?}", self.id, e); + } + + let events = + Events::with_data(self.rx.queue.eventfd.as_ref(), RX_QUEUE_EVENT, EventSet::IN); + if let Err(e) = ops.add(events) { + error!("{}: failed to register RX queue event, {:?}", self.id, e); + } + + let events = + Events::with_data(self.tx.queue.eventfd.as_ref(), TX_QUEUE_EVENT, EventSet::IN); + if let Err(e) = ops.add(events) { + error!("{}: failed to register TX queue event, {:?}", self.id, e); + } + + let rx_rate_limiter_fd = self.rx.rate_limiter.as_raw_fd(); + if rx_rate_limiter_fd >= 0 { + let events = + Events::with_data_raw(rx_rate_limiter_fd, RX_RATE_LIMITER_EVENT, EventSet::IN); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register RX rate limit event, {:?}", + self.id, e + ); + } + } + + let tx_rate_limiter_fd = self.tx.rate_limiter.as_raw_fd(); + if tx_rate_limiter_fd >= 0 { + let events = + Events::with_data_raw(tx_rate_limiter_fd, TX_RATE_LIMITER_EVENT, EventSet::IN); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to 
register TX rate limit event, {:?}", + self.id, e + ); + } + } + + let events = Events::with_data( + &self.patch_rate_limiter_fd, + PATCH_RATE_LIMITER_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register rate limiter patch event, {:?}", + self.id, e + ); + } + } +} + +pub struct Net { + pub(crate) device_info: VirtioDeviceInfo, + pub tap: Option, + pub queue_sizes: Arc>, + pub rx_rate_limiter: Option, + pub tx_rate_limiter: Option, + pub subscriber_id: Option, + id: String, + phantom: PhantomData, + patch_rate_limiter_fd: EventFd, + sender: Option>, + metrics: Arc, +} + +impl Net { + /// Create a new virtio network device with the given TAP interface. + pub fn new_with_tap( + tap: Tap, + guest_mac: Option<&MacAddr>, + queue_sizes: Arc>, + event_mgr: EpollManager, + rx_rate_limiter: Option, + tx_rate_limiter: Option, + ) -> Result { + trace!(target: "virtio-net", "{}: Net::new_with_tap()", NET_DRIVER_NAME); + + // Set offload flags to match the virtio features below. + tap.set_offload( + net_gen::TUN_F_CSUM | net_gen::TUN_F_UFO | net_gen::TUN_F_TSO4 | net_gen::TUN_F_TSO6, + ) + .map_err(NetError::TapSetOffload)?; + + let vnet_hdr_size = vnet_hdr_len() as i32; + tap.set_vnet_hdr_size(vnet_hdr_size) + .map_err(NetError::TapSetVnetHdrSize)?; + info!("net tap set finished"); + + let mut avail_features = 1u64 << VIRTIO_NET_F_GUEST_CSUM + | 1u64 << VIRTIO_NET_F_CSUM + | 1u64 << VIRTIO_NET_F_GUEST_TSO4 + | 1u64 << VIRTIO_NET_F_GUEST_UFO + | 1u64 << VIRTIO_NET_F_HOST_TSO4 + | 1u64 << VIRTIO_NET_F_HOST_UFO + | 1u64 << VIRTIO_F_VERSION_1; + + let mut config_space = Vec::new(); + if let Some(mac) = guest_mac { + config_space.resize(MAC_ADDR_LEN, 0); + config_space[..].copy_from_slice(mac.get_bytes()); + // When this feature isn't available, the driver generates a random MAC address. + // Otherwise, it should attempt to read the device MAC address from the config space. 
+ avail_features |= 1u64 << VIRTIO_NET_F_MAC; + } + + let device_info = VirtioDeviceInfo::new( + NET_DRIVER_NAME.to_string(), + avail_features, + queue_sizes.clone(), + config_space, + event_mgr, + ); + let id = device_info.driver_name.clone(); + Ok(Net { + tap: Some(tap), + device_info, + queue_sizes, + rx_rate_limiter, + tx_rate_limiter, + subscriber_id: None, + id, + phantom: PhantomData, + patch_rate_limiter_fd: EventFd::new(0).unwrap(), + sender: None, + metrics: Arc::new(NetDeviceMetrics::default()), + }) + } + + /// Create a new virtio network device with the given Host Device Name + pub fn new( + host_dev_name: String, + guest_mac: Option<&MacAddr>, + queue_sizes: Arc>, + epoll_mgr: EpollManager, + rx_rate_limiter: Option, + tx_rate_limiter: Option, + ) -> Result { + info!("open net tap {}", host_dev_name); + let tap = Tap::open_named(host_dev_name.as_str(), false).map_err(NetError::TapOpen)?; + info!("net tap opened"); + + Self::new_with_tap( + tap, + guest_mac, + queue_sizes, + epoll_mgr, + rx_rate_limiter, + tx_rate_limiter, + ) + } + + pub fn metrics(&self) -> Arc { + self.metrics.clone() + } +} + +impl Net { + pub fn set_patch_rate_limiters( + &self, + rx_bytes: BucketUpdate, + rx_ops: BucketUpdate, + tx_bytes: BucketUpdate, + tx_ops: BucketUpdate, + ) -> Result<()> { + if let Some(sender) = &self.sender { + if sender.send((rx_bytes, rx_ops, tx_bytes, tx_ops)).is_ok() { + if let Err(e) = self.patch_rate_limiter_fd.write(1) { + error!( + "virtio-net: failed to write rate-limiter patch event {:?}", + e + ); + Err(Error::InternalError) + } else { + Ok(()) + } + } else { + error!("virtio-net: failed to send rate-limiter patch data"); + Err(Error::InternalError) + } + } else { + error!("virtio-net: failed to establish channel to send rate-limiter patch data"); + Err(Error::InternalError) + } + } +} + +impl VirtioDevice for Net +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Sync + Send + 'static, +{ + fn 
device_type(&self) -> u32 { + TYPE_NET + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.queue_sizes + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + trace!(target: "virtio-net", "{}: VirtioDevice::set_acked_features({}, 0x{:x})", + self.id, page, value); + self.device_info.set_acked_features(page, value) + } + + fn read_config(&mut self, offset: u64, data: &mut [u8]) -> ConfigResult { + trace!(target: "virtio-net", "{}: VirtioDevice::read_config(0x{:x}, {:?})", + self.id, offset, data); + self.device_info.read_config(offset, data).map_err(|e| { + self.metrics.cfg_fails.inc(); + e + }) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + trace!(target: "virtio-net", "{}: VirtioDevice::write_config(0x{:x}, {:?})", + self.id, offset, data); + self.device_info.write_config(offset, data).map_err(|e| { + self.metrics.cfg_fails.inc(); + e + }) + } + + fn activate(&mut self, mut config: VirtioDeviceConfig) -> ActivateResult { + trace!(target: "virtio-net", "{}: VirtioDevice::activate()", self.id); + // Do not support control queue and multi queue. 
+ if config.queues.len() != 2 { + self.metrics.activate_fails.inc(); + return Err(ActivateError::InvalidParam); + } + + self.device_info + .check_queue_sizes(&config.queues[..]) + .map_err(|e| { + self.metrics.activate_fails.inc(); + e + })?; + let tap = self.tap.take().ok_or_else(|| { + self.metrics.activate_fails.inc(); + ActivateError::InvalidParam + })?; + let (sender, receiver) = mpsc::channel(); + self.sender = Some(sender); + let rx_queue = config.queues.remove(0); + let tx_queue = config.queues.remove(0); + let rx = RxVirtio::::new(rx_queue, self.rx_rate_limiter.take().unwrap_or_default()); + let tx = TxVirtio::::new(tx_queue, self.tx_rate_limiter.take().unwrap_or_default()); + let patch_rate_limiter_fd = self.patch_rate_limiter_fd.try_clone().unwrap(); + + let handler = Box::new(NetEpollHandler { + tap, + rx, + tx, + config, + id: self.id.clone(), + patch_rate_limiter_fd, + receiver: Some(receiver), + metrics: self.metrics.clone(), + }); + + self.subscriber_id = Some(self.device_info.register_event_handler(handler)); + Ok(()) + } + + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, + ) { + trace!(target: "virtio-net", "{}: VirtioDevice::get_resource_requirements()", self.id); + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + if use_generic_irq { + requests.push(ResourceConstraint::GenericIrq { + size: (self.queue_sizes.len() + 1) as u32, + }); + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn remove(&mut self) { + let subscriber_id = self.subscriber_id.take(); + if let Some(subscriber_id) = subscriber_id { + match self.device_info.remove_event_handler(subscriber_id) { + Ok(_) => debug!("virtio-net: removed subscriber_id {:?}", subscriber_id), + Err(err) => warn!("virtio-net: failed to remove event handler: {:?}", err), + }; + } else { + self.tap.take(); + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, 
Ordering}; + use std::thread; + use std::time::Duration; + + use dbs_device::resources::DeviceResources; + use dbs_interrupt::NoopNotifier; + use dbs_utils::epoll_manager::SubscriberOps; + use dbs_utils::rate_limiter::TokenBucket; + use kvm_ioctls::Kvm; + use vm_memory::{GuestAddress, GuestMemoryMmap}; + + use super::*; + use crate::tests::{VirtQueue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; + use crate::ConfigError; + + static NEXT_IP: AtomicUsize = AtomicUsize::new(1); + + #[allow(dead_code)] + const MAX_REQ_SIZE: u32 = 0x10000; + + fn create_net_epoll_handler(id: String) -> NetEpollHandler> { + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let rx = RxVirtio::new( + VirtioQueueConfig::create(256, 0).unwrap(), + RateLimiter::default(), + ); + let tx = TxVirtio::new( + VirtioQueueConfig::create(256, 0).unwrap(), + RateLimiter::default(), + ); + let mem = Arc::new(GuestMemoryMmap::from_ranges(&[(GuestAddress(0x0), 0x10000)]).unwrap()); + let queues = vec![VirtioQueueConfig::create(256, 0).unwrap()]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::new( + mem, + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + NetEpollHandler { + tap, + rx, + tx, + config, + id, + patch_rate_limiter_fd: EventFd::new(0).unwrap(), + receiver: None, + metrics: Arc::new(NetDeviceMetrics::default()), + } + } + + #[test] + fn test_net_virtio_device_normal() { + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let epoll_mgr = EpollManager::default(); + + let mut dev = Net::>::new_with_tap( + tap, + None, + Arc::new(vec![128]), + epoll_mgr, + None, + None, + ) + .unwrap(); + + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::device_type(&dev), + TYPE_NET + ); + let queue_size = 
vec![128]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::queue_max_sizes( + &dev + ), + &queue_size[..] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 0), + dev.device_info.get_avail_features(0) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 1), + dev.device_info.get_avail_features(1) + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + dev.device_info.get_avail_features(2) + ); + VirtioDevice::>, QueueSync, GuestRegionMmap>::set_acked_features( + &mut dev, 2, 0, + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features(&dev, 2), + 0 + ); + // device config length is 0 because guest_mac is None + let mut config: [u8; 1] = [0]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut dev, + 0, + &mut config, + ) + .unwrap_err(), + ConfigError::InvalidOffset(0) + ); + let config: [u8; 16] = [0; 16]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::write_config( + &mut dev, 0, &config, + ) + .unwrap_err(), + ConfigError::InvalidOffset(0) + ); + } + + #[test] + fn test_net_virtio_device_active() { + let epoll_mgr = EpollManager::default(); + { + // config queue size is not 2 + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let mut dev = Net::>::new_with_tap( + tap, + None, + Arc::new(vec![128]), + epoll_mgr.clone(), + None, + None, + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = Vec::new(); + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = + VirtioDeviceConfig::>, QueueSync, GuestRegionMmap>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + 
matches!(dev.activate(config), Err(ActivateError::InvalidParam)); + } + { + // check queue sizes error + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let mut dev = Net::>::new_with_tap( + tap, + None, + Arc::new(vec![128]), + epoll_mgr.clone(), + None, + None, + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![ + VirtioQueueConfig::create(2, 0).unwrap(), + VirtioQueueConfig::create(2, 0).unwrap(), + ]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = + VirtioDeviceConfig::>, QueueSync, GuestRegionMmap>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + matches!(dev.activate(config), Err(ActivateError::InvalidParam)); + } + { + // test no tap + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let mut dev = Net::>::new_with_tap( + tap, + None, + Arc::new(vec![128, 128]), + epoll_mgr.clone(), + None, + None, + ) + .unwrap(); + dev.tap = None; + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![ + VirtioQueueConfig::create(128, 0).unwrap(), + VirtioQueueConfig::create(128, 0).unwrap(), + ]; + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = + VirtioDeviceConfig::>, QueueSync, GuestRegionMmap>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + matches!(dev.activate(config), Err(ActivateError::InvalidParam)); + } + { + // Ok + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let mut dev = Net::>::new_with_tap( + tap, + None, + 
Arc::new(vec![128, 128]), + epoll_mgr, + None, + None, + ) + .unwrap(); + + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![ + VirtioQueueConfig::create(128, 0).unwrap(), + VirtioQueueConfig::create(128, 0).unwrap(), + ]; + + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = + VirtioDeviceConfig::>, QueueSync, GuestRegionMmap>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + assert!(dev.activate(config).is_ok()); + } + } + + #[test] + fn test_net_set_patch_rate_limiters() { + let next_ip = NEXT_IP.fetch_add(1, Ordering::SeqCst); + let tap = Tap::open_named(&format!("tap{next_ip}"), false).unwrap(); + let epoll_mgr = EpollManager::default(); + + let mut dev = Net::>::new_with_tap( + tap, + None, + Arc::new(vec![128]), + epoll_mgr, + None, + None, + ) + .unwrap(); + + //No sender + assert!(dev + .set_patch_rate_limiters( + BucketUpdate::None, + BucketUpdate::None, + BucketUpdate::None, + BucketUpdate::None + ) + .is_err()); + + let (sender, _receiver) = mpsc::channel(); + dev.sender = Some(sender); + assert!(dev + .set_patch_rate_limiters( + BucketUpdate::None, + BucketUpdate::None, + BucketUpdate::None, + BucketUpdate::None + ) + .is_ok()); + } + + #[test] + fn test_net_get_patch_rate_limiters() { + let mut handler = create_net_epoll_handler("test_1".to_string()); + let tokenbucket = TokenBucket::new(1, 1, 4); + + //update rx + handler.get_patch_rate_limiters( + BucketUpdate::None, + BucketUpdate::Update(tokenbucket.clone()), + BucketUpdate::None, + BucketUpdate::None, + ); + assert_eq!(handler.rx.rate_limiter.ops().unwrap(), &tokenbucket); + + //update tx + handler.get_patch_rate_limiters( + BucketUpdate::None, + BucketUpdate::None, + BucketUpdate::None, + BucketUpdate::Update(tokenbucket.clone()), + ); + assert_eq!(handler.tx.rate_limiter.ops().unwrap(), &tokenbucket); + 
} + + #[test] + fn test_net_epoll_handler_handle_event() { + let handler = create_net_epoll_handler("test_1".to_string()); + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_net_epoll_handler("test_2".to_string()); + + // test for RX_QUEUE_EVENT + let events = Events::with_data(&event_fd, RX_QUEUE_EVENT, event_set); + handler.process(events, &mut event_op); + handler.config.queues[0].generate_event().unwrap(); + handler.process(events, &mut event_op); + + // test for TX_QUEUE_EVENT + let events = Events::with_data(&event_fd, TX_QUEUE_EVENT, event_set); + handler.process(events, &mut event_op); + handler.config.queues[0].generate_event().unwrap(); + handler.process(events, &mut event_op); + + // test for RX_TAP_EVENT + let events = Events::with_data(&event_fd, RX_TAP_EVENT, event_set); + handler.process(events, &mut event_op); + + // test for RX&TX RATE_LIMITER_EVENT + let events = Events::with_data(&event_fd, RX_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + let events = Events::with_data(&event_fd, TX_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + + // test for PATCH_RATE_LIMITER_EVENT + let events = Events::with_data(&event_fd, PATCH_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + } + + #[test] + fn test_net_epoll_handler_handle_unknown_event() { + let handler = create_net_epoll_handler("test_1".to_string()); + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = 
create_net_epoll_handler("test_2".to_string()); + + // test for unknown event + let events = Events::with_data(&event_fd, NET_EVENTS_COUNT + 10, event_set); + handler.process(events, &mut event_op); + } + + #[test] + fn test_net_epoll_handler_process_queue() { + { + let mut handler = create_net_epoll_handler("test_1".to_string()); + + let m = &handler.config.vm_as.clone(); + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + let q = vq.create_queue(); + vq.dtable(0).set(0x1000, 0x1000, VIRTQ_DESC_F_NEXT, 1); + vq.dtable(1) + .set(0x2000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); + vq.dtable(2).set(0x3000, 1, VIRTQ_DESC_F_WRITE, 1); + + handler.config.queues = vec![VirtioQueueConfig::new( + q, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + 0, + )]; + assert!(handler.process_rx(m).is_ok()); + } + } + + #[test] + fn test_net_bandwidth_rate_limiter() { + let handler = create_net_epoll_handler("test_1".to_string()); + + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_net_epoll_handler("test_2".to_string()); + let m = &handler.config.vm_as.clone(); + + // Test TX bandwidth rate limiting + { + // create bandwidth rate limiter + let mut rl = RateLimiter::new(0x1000, 0, 100, 0, 0, 0).unwrap(); + // use up the budget + assert!(rl.consume(0x1000, TokenType::Bytes)); + + // set this tx rate limiter to be used + handler.tx.rate_limiter = rl; + // try doing TX + let vq = VirtQueue::new(GuestAddress(0), m, 16); + + let q = vq.create_queue(); + + vq.avail.idx().store(1); + vq.avail.ring(0).store(0); + vq.dtable(0).set(0x2000, 0x1000, 0, 0); + handler.tx.queue.queue = q; + + let events = Events::with_data(&event_fd, TX_QUEUE_EVENT, event_set); 
+ assert!(handler.tx.queue.generate_event().is_ok()); + handler.process(events, &mut event_op); + assert!(handler.tx.rate_limiter.is_blocked()); + + thread::sleep(Duration::from_millis(200)); + + let events = Events::with_data(&event_fd, TX_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + assert!(!handler.tx.rate_limiter.is_blocked()); + } + // Test RX bandwidth rate limiting + { + // create bandwidth rate limiter + let mut rl = RateLimiter::new(0x1000, 0, 100, 0, 0, 0).unwrap(); + // use up the budget + assert!(rl.consume(0x1000, TokenType::Bytes)); + + // set this rx rate limiter to be used + handler.rx.rate_limiter = rl; + // try doing RX + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + vq.dtable(0).set(0x2000, 0x1000, VIRTQ_DESC_F_WRITE, 0); + + let q = vq.create_queue(); + handler.rx.queue.queue = q; + + handler.rx.deferred_frame = true; + handler.rx.bytes_read = 0x1000; + + let events = Events::with_data(&event_fd, RX_QUEUE_EVENT, event_set); + assert!(handler.rx.queue.generate_event().is_ok()); + handler.process(events, &mut event_op); + assert!(handler.rx.rate_limiter.is_blocked()); + + thread::sleep(Duration::from_millis(200)); + + let events = Events::with_data(&event_fd, RX_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + assert!(!handler.rx.rate_limiter.is_blocked()); + } + } + + #[test] + fn test_net_ops_rate_limiter() { + let handler = create_net_epoll_handler("test_1".to_string()); + + let event_fd = EventFd::new(0).unwrap(); + let mgr = EpollManager::default(); + let id = mgr.add_subscriber(Box::new(handler)); + let mut inner_mgr = mgr.mgr.lock().unwrap(); + let mut event_op = inner_mgr.event_ops(id).unwrap(); + let event_set = EventSet::EDGE_TRIGGERED; + let mut handler = create_net_epoll_handler("test_2".to_string()); + let m = &handler.config.vm_as.clone(); + + // Test TX ops rate limiting + { + // create ops rate limiter + let mut rl 
= RateLimiter::new(0, 0, 0, 2, 0, 100).unwrap(); + // use up the budget + assert!(rl.consume(2, TokenType::Ops)); + + // set this tx rate limiter to be used + handler.tx.rate_limiter = rl; + // try doing TX + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + + let q = vq.create_queue(); + handler.tx.queue.queue = q; + + let events = Events::with_data(&event_fd, TX_QUEUE_EVENT, event_set); + assert!(handler.tx.queue.generate_event().is_ok()); + handler.process(events, &mut event_op); + assert!(handler.tx.rate_limiter.is_blocked()); + + thread::sleep(Duration::from_millis(100)); + + let events = Events::with_data(&event_fd, TX_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + assert!(!handler.tx.rate_limiter.is_blocked()); + } + // Test RX ops rate limiting + { + // create ops rate limiter + let mut rl = RateLimiter::new(0, 0, 0, 2, 0, 100).unwrap(); + // use up the budget + assert!(rl.consume(2, TokenType::Ops)); + + // set this rx rate limiter to be used + handler.rx.rate_limiter = rl; + // try doing RX + let vq = VirtQueue::new(GuestAddress(0), m, 16); + vq.avail.ring(0).store(0); + vq.avail.idx().store(1); + + let q = vq.create_queue(); + handler.rx.queue.queue = q; + + handler.rx.deferred_frame = true; + + let events = Events::with_data(&event_fd, RX_QUEUE_EVENT, event_set); + assert!(handler.rx.queue.generate_event().is_ok()); + handler.process(events, &mut event_op); + assert!(handler.rx.rate_limiter.is_blocked()); + + thread::sleep(Duration::from_millis(100)); + + let events = Events::with_data(&event_fd, RX_RATE_LIMITER_EVENT, event_set); + handler.process(events, &mut event_op); + assert!(!handler.rx.rate_limiter.is_blocked()); + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/notifier.rs b/src/dragonball/src/dbs_virtio_devices/src/notifier.rs new file mode 100644 index 000000000..4688a395a --- /dev/null +++ 
b/src/dragonball/src/dbs_virtio_devices/src/notifier.rs @@ -0,0 +1,89 @@ +// Copyright 2019 Alibaba Cloud. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Wrappers over `InterruptNotifier` to support virtio device interrupt management. + +use std::sync::Arc; + +use dbs_interrupt::{ + InterruptIndex, InterruptNotifier, InterruptSourceGroup, InterruptSourceType, + InterruptStatusRegister32, LegacyNotifier, MsiNotifier, +}; + +use crate::{VIRTIO_INTR_CONFIG, VIRTIO_INTR_VRING}; + +/// Create an interrupt notifier for virtio device change events. +pub fn create_device_notifier( + group: Arc>, + intr_status: Arc, + intr_index: InterruptIndex, +) -> Arc { + match group.interrupt_type() { + InterruptSourceType::LegacyIrq => { + Arc::new(LegacyNotifier::new(group, intr_status, VIRTIO_INTR_CONFIG)) + } + InterruptSourceType::MsiIrq => Arc::new(MsiNotifier::new(group, intr_index)), + } +} + +/// Create an interrupt notifier for virtio queue notification events. 
+pub fn create_queue_notifier( + group: Arc>, + intr_status: Arc, + intr_index: InterruptIndex, +) -> Arc { + match group.interrupt_type() { + InterruptSourceType::LegacyIrq => { + Arc::new(LegacyNotifier::new(group, intr_status, VIRTIO_INTR_VRING)) + } + InterruptSourceType::MsiIrq => Arc::new(MsiNotifier::new(group, intr_index)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use dbs_interrupt::InterruptManager; + + #[test] + fn test_create_virtio_legacy_notifier() { + let (_vmfd, irq_manager) = crate::tests::create_vm_and_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::LegacyIrq, 0, 1) + .unwrap(); + let status = Arc::new(InterruptStatusRegister32::new()); + assert_eq!(status.read(), 0); + + let notifer = create_queue_notifier(group.clone(), status.clone(), 0); + notifer.notify().unwrap(); + assert!(notifer.notifier().is_some()); + + assert_eq!(status.read(), VIRTIO_INTR_VRING); + status.clear_bits(VIRTIO_INTR_VRING); + assert_eq!(status.read(), 0); + let eventfd = notifer.notifier().unwrap(); + eventfd.write(2).unwrap(); + assert_eq!(eventfd.read().unwrap(), 3); + } + + #[test] + fn test_create_virtio_msi_notifier() { + let (_vmfd, irq_manager) = crate::tests::create_vm_and_irq_manager(); + let group = irq_manager + .create_group(InterruptSourceType::MsiIrq, 0, 3) + .unwrap(); + let status = Arc::new(InterruptStatusRegister32::new()); + + let notifier1 = create_device_notifier(group.clone(), status.clone(), 1); + let notifier2 = create_queue_notifier(group.clone(), status.clone(), 2); + let notifier3 = create_queue_notifier(group.clone(), status, 3); + assert!(notifier1.notifier().is_some()); + assert!(notifier2.notifier().is_some()); + assert!(notifier3.notifier().is_none()); + notifier1.notify().unwrap(); + notifier1.notify().unwrap(); + notifier2.notify().unwrap(); + assert_eq!(notifier1.notifier().unwrap().read().unwrap(), 2); + assert_eq!(notifier2.notifier().unwrap().read().unwrap(), 1); + } +} diff --git 
a/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/inner.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/inner.rs new file mode 100644 index 000000000..1cecc0fa4 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/inner.rs @@ -0,0 +1,923 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::any::Any; +use std::io::{Error, ErrorKind, Read, Result, Write}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::{channel, Receiver, RecvTimeoutError, Sender, TryRecvError}; +use std::sync::Arc; +use std::time::Duration; + +use log::error; +use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK, EFD_SEMAPHORE}; + +use super::{VsockBackend, VsockBackendType, VsockStream}; + +#[derive(Debug)] +enum InnerStreamRole { + Internal, + External, +} + +/// The stream implementation of vsock inner backend. It can be used like a +/// normal unix stream. +/// +/// When working with epoll, VsockInnerStream only can be used with +/// `level-trigged` mode. 
+pub struct VsockInnerStream { + stream_event: Arc, + peer_event: Arc, + writer: Sender>, + reader: Receiver>, + read_buf: Option<(Vec, usize)>, + stream_nonblocking: Arc, + peer_nonblocking: Arc, + read_timeout: Option, + role: InnerStreamRole, +} + +impl VsockInnerStream { + fn new( + stream_event: Arc, + peer_event: Arc, + writer: Sender>, + reader: Receiver>, + stream_nonblocking: Arc, + peer_nonblocking: Arc, + role: InnerStreamRole, + ) -> Self { + VsockInnerStream { + stream_event, + peer_event, + writer, + reader, + read_buf: None, + stream_nonblocking, + peer_nonblocking, + read_timeout: None, + role, + } + } + + fn recv_msg_from_channel( + &mut self, + buf: &mut [u8], + msg: Vec, + total_read_len: &mut usize, + ) -> Result { + let read_len = Self::read_msg_from_vec(buf, &msg, *total_read_len, 0); + let mut read_finish = false; + *total_read_len += read_len; + + if read_len < msg.len() { + // buf is full, but msg is not fully read, save it in read_buf (the + // previous read_buf should have been read through before) + self.read_buf = Some((msg, read_len)); + read_finish = true; + } else { + // if msg is fully read, consume one event, and go + // on read next message + self.consume_event()?; + } + + Ok(read_finish) + } + + fn trigger_peer_event(&self) -> Result<()> { + self.peer_event.write(1).map_err(|e| { + error!( + "vsock inner stream {:?}: trigger peer event failed: {:?}", + self.role, e + ); + e + })?; + + Ok(()) + } + + fn consume_event(&self) -> Result<()> { + self.stream_event.read().map_err(|e| { + error!( + "vsock inner stream {:?}: consume event failed: {:?}", + self.role, e + ); + e + })?; + + Ok(()) + } + + fn read_msg_from_vec(buf: &mut [u8], msg: &[u8], buf_start: usize, msg_start: usize) -> usize { + let min_len = std::cmp::min(buf.len() - buf_start, msg.len() - msg_start); + buf[buf_start..buf_start + min_len].copy_from_slice(&msg[msg_start..msg_start + min_len]); + min_len + } +} + +impl AsRawFd for VsockInnerStream { + fn 
as_raw_fd(&self) -> RawFd { + self.stream_event.as_raw_fd() + } +} + +impl Read for VsockInnerStream { + fn read(&mut self, buf: &mut [u8]) -> Result { + let mut total_read_len = 0; + // if read_buf is not empty, get data from read_buf first + if let Some((read_buf, buf_read_len)) = self.read_buf.as_mut() { + let read_len = Self::read_msg_from_vec(buf, read_buf, total_read_len, *buf_read_len); + total_read_len += read_len; + *buf_read_len += read_len; + + // if read_buf is all read, consume one event + if *buf_read_len == read_buf.len() { + self.consume_event()?; + self.read_buf.take(); + } + } + + // if buf is full, just return + if total_read_len == buf.len() { + return Ok(total_read_len); + } + + // continously fetch data from channel to fill the buf, until the buf is + // full + loop { + // fetch data from channel + match self.reader.try_recv() { + Ok(msg) => { + if self.recv_msg_from_channel(buf, msg, &mut total_read_len)? { + return Ok(total_read_len); + } + } + // this arm indicates there's no more data can fetch from + // channel + Err(TryRecvError::Empty) => { + if total_read_len > 0 { + return Ok(total_read_len); + } else { + // - non-blocking mode: return `WouldBlock` directly + // - blocking mode: use channel's `recv`/`recv_timeout` + // function to block until channel have new data again + if self.stream_nonblocking.load(Ordering::SeqCst) { + return Err(Error::from(ErrorKind::WouldBlock)); + } else { + // - no read timeout: use channel's `recv` function + // to block until a message comes + // - have read timeout: use channel's `recv_timeout` + // to block until a message comes or reach the + // timeout time + if let Some(dur) = self.read_timeout { + match self.reader.recv_timeout(dur) { + Ok(msg) => { + if self.recv_msg_from_channel( + buf, + msg, + &mut total_read_len, + )? 
{ + return Ok(total_read_len); + } + } + Err(RecvTimeoutError::Timeout) => { + return Err(Error::from(ErrorKind::TimedOut)) + } + Err(RecvTimeoutError::Disconnected) => { + return Err(Error::from(ErrorKind::ConnectionReset)) + } + } + } else { + match self.reader.recv() { + Ok(msg) => { + if self.recv_msg_from_channel( + buf, + msg, + &mut total_read_len, + )? { + return Ok(total_read_len); + } + } + Err(_) => return Err(Error::from(ErrorKind::ConnectionReset)), + } + } + } + } + } + Err(TryRecvError::Disconnected) => { + return Err(Error::from(ErrorKind::ConnectionReset)); + } + } + } + } +} + +impl Write for VsockInnerStream { + fn write(&mut self, buf: &[u8]) -> Result { + // We need to carefully distinguish between the timing of the trigger + // eventfd and the writing of data to the channel, because the streams + // on both ends may be working in different threads, and these two + // operations are not atomic! + let peer_nonblocking = self.peer_nonblocking.load(Ordering::SeqCst); + + // In blocking mode, the other end will simulate blocking io by blocking + // on the recv() method of the channel, at which point, if data is + // written to the channel, the other end will immediately return and + // perform the operation of fetching data, during this, one important + // things is to confirm that all the data sent has been read in this + // time, which is done by reading eventfd. + // + // However, if the other side executes faster and we haven't finished + // the trigger eventfd by the time it reads the eventfd, then it will + // return a failure. Therefore, in blocking mode, the eventfd should be + // triggered before writing data to the channel. 
+ if !peer_nonblocking { + self.trigger_peer_event()?; + } + + if let Err(_e) = self.writer.send(buf.to_vec()) { + return Err(Error::from(ErrorKind::ConnectionReset)); + } + + // On the contrary, in nonblocking mode, the peer does not block in the + // recv() method of the channel, but generally adds eventfd to the epoll + // event loop, at this point, if we trigger eventfd, the peer will + // return immediately and perform the fetch operation, but if we do not + // send the data to the channel, then the fetching may fail. Therefore, + // in nonblocking mode, we need to trigger eventfd after writing data + // to the channel. + if peer_nonblocking { + self.trigger_peer_event()?; + } + Ok(buf.len()) + } + + fn flush(&mut self) -> Result<()> { + Ok(()) + } +} + +impl Drop for VsockInnerStream { + fn drop(&mut self) { + // we need to notify peer stream when dropping, peer stream will sense + // that this side of read channel has been disconnected and return an + // error for the upper layer to drop it + if let Err(e) = self.trigger_peer_event() { + error!( + "VsockInnerStream {:?}: can't notify peer inner stream that should be drop: {}", + self.role, e + ); + } + } +} + +impl VsockStream for VsockInnerStream { + fn backend_type(&self) -> VsockBackendType { + VsockBackendType::Inner + } + + fn set_nonblocking(&mut self, nonblocking: bool) -> Result<()> { + self.stream_nonblocking.store(nonblocking, Ordering::SeqCst); + Ok(()) + } + + fn set_read_timeout(&mut self, dur: Option) -> Result<()> { + self.read_timeout = dur; + Ok(()) + } + + fn set_write_timeout(&mut self, _dur: Option) -> Result<()> { + // here's a infinite channel for write, no need to consider about write + // timeout. + Ok(()) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +/// Vsock inner connector is used to connect to vsock inner backend. 
+#[derive(Clone)] +pub struct VsockInnerConnector { + backend_event: Arc, + conn_sender: Sender, +} + +impl std::fmt::Debug for VsockInnerConnector { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("VsockInnerConnector") + } +} + +impl VsockInnerConnector { + /// Connect to vsock inner backend and get a new inner stream. + pub fn connect(&self) -> Result> { + self.connect_() + .map(|stream| Box::new(stream) as Box) + } + + fn connect_(&self) -> Result { + let (internal_sender, external_receiver) = channel(); + let (external_sender, internal_receiver) = channel(); + // use `EFD_SEMAPHORE` mode to make EventFd as a write counter for + // channel. + let internal_event = Arc::new(EventFd::new(EFD_NONBLOCK | EFD_SEMAPHORE)?); + let external_event = Arc::new(EventFd::new(EFD_NONBLOCK | EFD_SEMAPHORE)?); + let internal_nonblocking = Arc::new(AtomicBool::new(false)); + let external_nonblocking = Arc::new(AtomicBool::new(false)); + + let mut internal_stream = VsockInnerStream::new( + internal_event.clone(), + external_event.clone(), + internal_sender, + internal_receiver, + internal_nonblocking.clone(), + external_nonblocking.clone(), + InnerStreamRole::Internal, + ); + // internal stream is vsock internal used, we need non-blocking mode + internal_stream.set_nonblocking(true)?; + + // external stream is used for others, the mode can be set by them. + let external_stream = VsockInnerStream::new( + external_event, + internal_event, + external_sender, + external_receiver, + external_nonblocking, + internal_nonblocking, + InnerStreamRole::External, + ); + + // send the inner stream to connection pending list for later accept. 
+ self.conn_sender.send(internal_stream).map_err(|e| { + Error::new( + ErrorKind::ConnectionRefused, + format!("vsock inner stream sender err: {e}"), + ) + })?; + self.backend_event.write(1)?; + + Ok(external_stream) + } +} + +/// The backend implemenation that can be used in-process, no need to forward +/// data by the OS. +pub struct VsockInnerBackend { + /// The eventfd used for notify the connection requests. + backend_event: Arc, + /// The pending connections waiting to be accepted. + pending_conns: Receiver, + /// A sender can Send pending connections to inner backend. + conn_sender: Sender, +} + +impl VsockInnerBackend { + pub fn new() -> Result { + let (conn_sender, pending_conns) = channel(); + // use `EFD_SEMAPHORE` mode to make EventFd as a write counter for + // pending_conns channel. + let backend_event = Arc::new(EventFd::new(EFD_NONBLOCK | EFD_SEMAPHORE)?); + + Ok(VsockInnerBackend { + backend_event, + pending_conns, + conn_sender, + }) + } + + /// Create a inner connector instance. + pub fn get_connector(&self) -> VsockInnerConnector { + VsockInnerConnector { + backend_event: self.backend_event.clone(), + conn_sender: self.conn_sender.clone(), + } + } + + fn accept_(&self) -> Result { + self.backend_event.read()?; + match self.pending_conns.try_recv() { + Ok(stream) => Ok(stream), + Err(_) => Err(Error::from(ErrorKind::ConnectionAborted)), + } + } +} + +impl AsRawFd for VsockInnerBackend { + /// Don't read/write this fd, just use it to get signal. 
+ fn as_raw_fd(&self) -> RawFd { + self.backend_event.as_raw_fd() + } +} + +impl VsockBackend for VsockInnerBackend { + fn accept(&mut self) -> Result> { + self.accept_() + .map(|stream| Box::new(stream) as Box) + } + + fn connect(&self, _dst_port: u32) -> Result> { + Err(Error::new( + ErrorKind::ConnectionRefused, + "vsock inner backend doesn't support incoming connection request", + )) + } + + fn r#type(&self) -> VsockBackendType { + VsockBackendType::Inner + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Condvar, Mutex}; + use std::thread; + use std::time::{Duration, Instant}; + + use super::*; + + #[test] + fn test_inner_backend_create() { + assert!(VsockInnerBackend::new().is_ok()); + } + + #[test] + fn test_inner_backend_accept() { + let mut vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = vsock_backend.get_connector(); + + // no connect request send, accept would return error + assert!(vsock_backend.accept().is_err()); + + // connect once, can accept once + connector.connect().unwrap(); + assert!(vsock_backend.accept().is_ok()); + assert!(vsock_backend.accept().is_err()); + + // connect twice, can accept twice + connector.connect().unwrap(); + connector.connect().unwrap(); + assert!(vsock_backend.accept().is_ok()); + assert!(vsock_backend.accept().is_ok()); + assert!(vsock_backend.accept().is_err()); + } + + #[test] + fn test_inner_backend_communication() { + let test_string = String::from("TEST"); + let mut buffer = [0; 10]; + + let mut vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = vsock_backend.get_connector(); + let mut stream_connect = connector.connect().unwrap(); + stream_connect.set_nonblocking(true).unwrap(); + let mut stream_backend = vsock_backend.accept().unwrap(); + + assert!(stream_connect + .write(&test_string.clone().into_bytes()) + .is_ok()); + assert!(stream_backend.read(&mut buffer).is_ok()); + assert_eq!(&buffer[0..test_string.len()], 
test_string.as_bytes()); + + assert!(stream_backend + .write(&test_string.clone().into_bytes()) + .is_ok()); + assert!(stream_connect.read(&mut buffer).is_ok()); + assert_eq!(&buffer[0..test_string.len()], test_string.as_bytes()); + } + + #[test] + fn test_inner_backend_connect() { + let vsock_backend = VsockInnerBackend::new().unwrap(); + // inner backend don't support peer connection now + assert!(vsock_backend.connect(0).is_err()); + } + + #[test] + fn test_inner_backend_type() { + let vsock_backend = VsockInnerBackend::new().unwrap(); + assert_eq!(vsock_backend.r#type(), VsockBackendType::Inner); + } + + #[test] + fn test_inner_backend_vsock_stream() { + let vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = vsock_backend.get_connector(); + let mut vsock_stream = connector.connect().unwrap(); + + assert!(vsock_stream.set_nonblocking(true).is_ok()); + assert!(vsock_stream + .set_read_timeout(Some(Duration::from_secs(1))) + .is_ok()); + assert!(vsock_stream.set_read_timeout(None).is_ok()); + assert!(vsock_stream + .set_write_timeout(Some(Duration::from_secs(2))) + .is_ok()); + } + + fn get_inner_backend_stream_pair() -> (VsockInnerStream, VsockInnerStream) { + let vsock_backend = VsockInnerBackend::new().unwrap(); + let connector = vsock_backend.get_connector(); + let outer_stream = connector.connect_().unwrap(); + let inner_stream = vsock_backend.accept_().unwrap(); + + (inner_stream, outer_stream) + } + + #[test] + #[allow(clippy::unused_io_amount)] + fn test_inner_stream_nonblocking() { + // write once, read multi times + { + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + outer_stream.set_nonblocking(true).unwrap(); + + // write data into inner stream with length of 10 + let wirter_buf = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + inner_stream.write_all(&wirter_buf).unwrap(); + + // first, read data from outer stream with length of 5 + let mut reader_buf1 = [0; 5]; + outer_stream.read(&mut reader_buf1).unwrap(); + 
assert_eq!(reader_buf1, [0, 1, 2, 3, 4]); + // test the unread data in outer stream + assert_eq!(outer_stream.read_buf, Some((Vec::from(&wirter_buf[..]), 5))); + + // second, read more data in outer stream + let mut reader_buf2 = [0; 3]; + outer_stream.read(&mut reader_buf2).unwrap(); + assert_eq!(reader_buf2, [5, 6, 7]); + // test the unread data in outer stream + assert_eq!(outer_stream.read_buf, Some((Vec::from(&wirter_buf[..]), 8))); + + // then, read the last data in outer stream + let mut reader_buf3 = [0; 2]; + outer_stream.read(&mut reader_buf3).unwrap(); + assert_eq!(reader_buf3, [8, 9]); + // there's no unread data in outer stream + assert_eq!(outer_stream.read_buf, None); + + // last, try to read again, it would return error + let mut reader_buf3 = [0; 1]; + assert_eq!( + outer_stream.read(&mut reader_buf3).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + } + + // write multi times, read all + { + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + outer_stream.set_nonblocking(true).unwrap(); + + // first, write some data into inner stream + let writer_buf1 = [0, 1, 2, 3]; + inner_stream.write_all(&writer_buf1).unwrap(); + + // second, write more data into inner stream + let writer_buf2 = [4, 5, 6]; + inner_stream.write_all(&writer_buf2).unwrap(); + + // then, read all data from outer stream + let mut reader_buf1 = [0; 7]; + outer_stream.read(&mut reader_buf1).unwrap(); + assert_eq!(reader_buf1, [0, 1, 2, 3, 4, 5, 6]); + // there's no unread data in outer stream + assert_eq!(outer_stream.read_buf, None); + + // last, try to read again, it would return error + let mut reader_buf2 = [0; 1]; + assert_eq!( + outer_stream.read(&mut reader_buf2).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + } + + // write multi times, then read multi times + { + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + outer_stream.set_nonblocking(true).unwrap(); + + // first, write some data into inner stream + let 
writer_buf1 = [0, 1, 2, 3]; + inner_stream.write_all(&writer_buf1).unwrap(); + + // second, write more data into inner stream + let writer_buf2 = [4, 5]; + inner_stream.write_all(&writer_buf2).unwrap(); + + // third, write more data into inner stream + let writer_buf3 = [6, 7, 8]; + inner_stream.write_all(&writer_buf3).unwrap(); + + // forth, write more data into inner stream + let writer_buf4 = [9, 10]; + inner_stream.write_all(&writer_buf4).unwrap(); + + // fifth, read some data from outer stream + let mut reader_buf1 = [0; 2]; + outer_stream.read(&mut reader_buf1).unwrap(); + assert_eq!(reader_buf1, [0, 1]); + // now, the content in read buf is writer buf1 + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf1[..]), 2)) + ); + + // sixth, continue read some data from outer steam + let mut reader_buf2 = [0; 3]; + outer_stream.read(&mut reader_buf2).unwrap(); + assert_eq!(reader_buf2, [2, 3, 4]); + // now, the content in read buf is writer buf2 + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf2[..]), 1)) + ); + + // seventh, continue read some data from outer steam + let mut reader_buf3 = [0; 5]; + outer_stream.read(&mut reader_buf3).unwrap(); + assert_eq!(reader_buf3, [5, 6, 7, 8, 9]); + // now, the content in read buf is writer buf4 + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf4[..]), 1)) + ); + + // then, read the rest data from outer stream + let mut reader_buf4 = [0; 3]; + outer_stream.read(&mut reader_buf4).unwrap(); + assert_eq!(reader_buf4, [10, 0, 0]); + // now, there's no unread data in outer stream + assert_eq!(outer_stream.read_buf, None); + + // last, try to read again, it would return error + let mut reader_buf5 = [0; 5]; + assert_eq!( + outer_stream.read(&mut reader_buf5).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + } + + // write and read multi times + { + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + outer_stream.set_nonblocking(true).unwrap(); + + // 
first, try to read data, it would return error + let mut reader_buf1 = [0; 5]; + assert_eq!( + outer_stream.read(&mut reader_buf1).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + + // second, write some data into inner stream + let writer_buf1 = [0, 1, 2, 3]; + inner_stream.write_all(&writer_buf1).unwrap(); + + // third, read some data from outer stream + let mut reader_buf2 = [0; 2]; + outer_stream.read(&mut reader_buf2).unwrap(); + assert_eq!(reader_buf2, [0, 1]); + // the content in read buf is writer buf1 + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf1[..]), 2)) + ); + + // forth, write some data into inner stream + let writer_buf2 = [4, 5]; + inner_stream.write_all(&writer_buf2).unwrap(); + + // fifth, read some data from outer stream + let mut reader_buf3 = [0; 3]; + outer_stream.read(&mut reader_buf3).unwrap(); + assert_eq!(reader_buf3, [2, 3, 4]); + // the content in read buf is writer buf2 + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf2[..]), 1)) + ); + + // sixth, write some data twice into inner steam + let writer_buf3 = [6]; + inner_stream.write_all(&writer_buf3).unwrap(); + let writer_buf4 = [7, 8, 9]; + inner_stream.write_all(&writer_buf4).unwrap(); + + // seventh, read all data from outer stream + let mut reader_buf4 = [0; 10]; + outer_stream.read(&mut reader_buf4).unwrap(); + assert_eq!(reader_buf4, [5, 6, 7, 8, 9, 0, 0, 0, 0, 0]); + // there's no unread data in outer stream + assert_eq!(outer_stream.read_buf, None); + + // eighth, write some data again into inner stream + let writer_buf5 = [10, 11, 12]; + inner_stream.write_all(&writer_buf5).unwrap(); + + // ninth, read some data from outer stream + let mut reader_buf5 = [0; 1]; + outer_stream.read(&mut reader_buf5).unwrap(); + assert_eq!(reader_buf5, [10]); + // the content in read buf is writer buf5 + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf5[..]), 1)) + ); + + // then, read all data from outer stream + let mut reader_buf6 
= [0; 4]; + outer_stream.read(&mut reader_buf6).unwrap(); + assert_eq!(reader_buf6, [11, 12, 0, 0]); + // there's no unread data in outer stream + assert_eq!(outer_stream.read_buf, None); + + // last, try to read again, it would return error + let mut reader_buf7 = [0; 1]; + assert_eq!( + outer_stream.read(&mut reader_buf7).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + } + + // write and read duplex multi times + { + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + outer_stream.set_nonblocking(true).unwrap(); + + // first, try to read data from outer and inner stream, they would + // return error + let mut reader_buf1 = [0; 1]; + assert_eq!( + outer_stream.read(&mut reader_buf1).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + let mut reader_buf2 = [0; 1]; + assert_eq!( + inner_stream.read(&mut reader_buf2).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + + // second, write some data into inner and outer stream + let writer_buf1 = [0, 1, 2]; + inner_stream.write_all(&writer_buf1).unwrap(); + let writer_buf2 = [0, 1]; + outer_stream.write_all(&writer_buf2).unwrap(); + + // third, read all data from outer and inner stream + let mut reader_buf3 = [0; 5]; + outer_stream.read(&mut reader_buf3).unwrap(); + assert_eq!(reader_buf3, [0, 1, 2, 0, 0]); + assert_eq!(outer_stream.read_buf, None); + let mut reader_buf4 = [0; 5]; + inner_stream.read(&mut reader_buf4).unwrap(); + assert_eq!(reader_buf4, [0, 1, 0, 0, 0]); + assert_eq!(inner_stream.read_buf, None); + + // forth, write data twicd into inner and outer stream + let writer_buf3 = [3, 4, 5, 6]; + inner_stream.write_all(&writer_buf3).unwrap(); + let writer_buf4 = [2, 3, 4]; + outer_stream.write_all(&writer_buf4).unwrap(); + let writer_buf5 = [7, 8]; + inner_stream.write_all(&writer_buf5).unwrap(); + let writer_buf6 = [5, 6, 7]; + outer_stream.write_all(&writer_buf6).unwrap(); + + // fifth, read some data from outer and inner stream + let mut reader_buf5 = [0; 5]; + 
outer_stream.read(&mut reader_buf5).unwrap(); + assert_eq!(reader_buf5, [3, 4, 5, 6, 7]); + assert_eq!( + outer_stream.read_buf, + Some((Vec::from(&writer_buf5[..]), 1)) + ); + let mut reader_buf6 = [0; 5]; + inner_stream.read(&mut reader_buf6).unwrap(); + assert_eq!(reader_buf6, [2, 3, 4, 5, 6]); + assert_eq!( + inner_stream.read_buf, + Some((Vec::from(&writer_buf6[..]), 2)) + ); + + // then, read all data from inner and outer stream + let mut reader_buf7 = [0; 5]; + inner_stream.read(&mut reader_buf7).unwrap(); + assert_eq!(reader_buf7, [7, 0, 0, 0, 0]); + assert_eq!(inner_stream.read_buf, None); + let mut reader_buf8 = [0; 5]; + outer_stream.read(&mut reader_buf8).unwrap(); + assert_eq!(reader_buf8, [8, 0, 0, 0, 0]); + assert_eq!(outer_stream.read_buf, None); + + // last, read data from outer and inner stream again, they would + // return error + let mut reader_buf9 = [0; 1]; + assert_eq!( + outer_stream.read(&mut reader_buf9).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + let mut reader_buf10 = [0; 1]; + assert_eq!( + inner_stream.read(&mut reader_buf10).unwrap_err().kind(), + ErrorKind::WouldBlock + ); + } + } + + #[test] + fn test_inner_stream_block() { + // outer stream is in block mode + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + + let start_time = Instant::now(); + let handler = thread::spawn(move || { + let mut reader_buf = [0; 5]; + assert!(outer_stream.read_exact(&mut reader_buf).is_ok()); + assert_eq!(reader_buf, [1, 2, 3, 4, 5]); + assert!(Instant::now().duration_since(start_time).as_millis() >= 500); + }); + + // sleep 500ms + thread::sleep(Duration::from_millis(500)); + let writer_buf = [1, 2, 3, 4, 5]; + inner_stream.write_all(&writer_buf).unwrap(); + + handler.join().unwrap(); + } + + #[test] + #[allow(clippy::mutex_atomic)] + fn test_inner_stream_timeout() { + // outer stream is in block mode + let (mut inner_stream, mut outer_stream) = get_inner_backend_stream_pair(); + // set write timeout always return 
Ok, and no effect + assert!(outer_stream + .set_write_timeout(Some(Duration::from_secs(10))) + .is_ok()); + // set read timeout always return ok, can take effect + assert!(outer_stream + .set_read_timeout(Some(Duration::from_millis(150))) + .is_ok()); + + let cond_pair = Arc::new((Mutex::new(false), Condvar::new())); + let cond_pair_2 = Arc::clone(&cond_pair); + let handler = thread::Builder::new() + .spawn(move || { + // notify handler thread start + let (lock, cvar) = &*cond_pair_2; + let mut started = lock.lock().unwrap(); + *started = true; + cvar.notify_one(); + drop(started); + + let start_time1 = Instant::now(); + let mut reader_buf = [0; 5]; + // first read would timed out + assert_eq!( + outer_stream.read_exact(&mut reader_buf).unwrap_err().kind(), + ErrorKind::TimedOut + ); + let end_time1 = Instant::now().duration_since(start_time1).as_millis(); + assert!((150..250).contains(&end_time1)); + + // second read would ok + assert!(outer_stream.read_exact(&mut reader_buf).is_ok()); + assert_eq!(reader_buf, [1, 2, 3, 4, 5]); + + // cancel the read timeout + let start_time2 = Instant::now(); + outer_stream.set_read_timeout(None).unwrap(); + assert!(outer_stream.read_exact(&mut reader_buf).is_ok()); + let end_time2 = Instant::now().duration_since(start_time2).as_millis(); + assert!(end_time2 >= 500); + }) + .unwrap(); + + // wait handler thread started + let (lock, cvar) = &*cond_pair; + let mut started = lock.lock().unwrap(); + while !*started { + started = cvar.wait(started).unwrap(); + } + + // sleep 300ms, test timeout + thread::sleep(Duration::from_millis(300)); + let writer_buf = [1, 2, 3, 4, 5]; + inner_stream.write_all(&writer_buf).unwrap(); + + // sleep 500ms again, test cancel timeout + thread::sleep(Duration::from_millis(500)); + let writer_buf = [1, 2, 3, 4, 5]; + inner_stream.write_all(&writer_buf).unwrap(); + + handler.join().unwrap(); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/mod.rs 
b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/mod.rs new file mode 100644 index 000000000..4f555c77d --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/mod.rs @@ -0,0 +1,73 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// This module implements backends for vsock - the host side vsock endpoint, +/// which can translate vsock stream into host's protocol, eg. AF_UNIX, AF_INET +/// or even the protocol created by us. +use std::any::Any; +use std::io::{Read, Write}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::time::Duration; + +mod inner; +mod tcp; +mod unix_stream; + +pub use self::inner::{VsockInnerBackend, VsockInnerConnector, VsockInnerStream}; +pub use self::tcp::VsockTcpBackend; +pub use self::unix_stream::HybridUnixStreamBackend; +pub use self::unix_stream::VsockUnixStreamBackend; + +/// The type of vsock backend. +#[derive(PartialEq, Eq, Hash, Debug, Clone)] +pub enum VsockBackendType { + /// Unix stream + UnixStream, + /// Tcp socket + Tcp, + /// Inner backend + Inner, + /// For test purpose + #[cfg(test)] + Test, +} + +/// The generic abstract of Vsock Backend, looks like socket's API. +pub trait VsockBackend: AsRawFd + Send { + /// Accept a host-initiated connection. + fn accept(&mut self) -> std::io::Result>; + /// Connect by a guest-initiated connection. + fn connect(&self, dst_port: u32) -> std::io::Result>; + /// The type of backend. + fn r#type(&self) -> VsockBackendType; + /// Used to downcast to the specific type. + fn as_any(&self) -> &dyn Any; +} + +/// The generic abstract of Vsock Stream. +pub trait VsockStream: Read + Write + AsRawFd + Send { + /// The type of backend which created the stream. 
+ fn backend_type(&self) -> VsockBackendType; + /// Moves VsockStream into or out of nonblocking mode + fn set_nonblocking(&mut self, _nonblocking: bool) -> std::io::Result<()> { + Err(std::io::Error::from(std::io::ErrorKind::WouldBlock)) + } + /// Set the read timeout to the time duration specified. + fn set_read_timeout(&mut self, _dur: Option) -> std::io::Result<()> { + Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)) + } + /// Set the write timeout to the time duration specified. + fn set_write_timeout(&mut self, _dur: Option) -> std::io::Result<()> { + Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)) + } + /// Receive the port and fd from the peer. + fn recv_data_fd( + &self, + _bytes: &mut [u8], + _fds: &mut [RawFd], + ) -> std::io::Result<(usize, usize)> { + Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)) + } + /// Used to downcast to the specific type. + fn as_any(&self) -> &dyn Any; +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/tcp.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/tcp.rs new file mode 100644 index 000000000..f35931483 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/tcp.rs @@ -0,0 +1,170 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::any::Any; +use std::net::{TcpListener, TcpStream}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::time::Duration; + +use log::info; + +use super::super::{Result, VsockError}; +use super::{VsockBackend, VsockBackendType, VsockStream}; + +impl VsockStream for TcpStream { + fn backend_type(&self) -> VsockBackendType { + VsockBackendType::Tcp + } + + fn set_nonblocking(&mut self, nonblocking: bool) -> std::io::Result<()> { + TcpStream::set_nonblocking(self, nonblocking) + } + + fn set_read_timeout(&mut self, dur: Option) -> std::io::Result<()> { + TcpStream::set_read_timeout(self, dur) + } + + fn set_write_timeout(&mut self, dur: Option) -> std::io::Result<()> { + TcpStream::set_write_timeout(self, dur) + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +/// The backend implementation that using TCP Socket. +#[allow(dead_code)] +pub struct VsockTcpBackend { + /// The TCP socket, through which host-initiated connections are accepted. + tcp_sock: TcpListener, + /// The address of TCP socket. + tcp_sock_addr: String, +} + +impl VsockTcpBackend { + pub fn new(tcp_sock_addr: String) -> Result { + info!("open vsock tcp: {}", tcp_sock_addr); + // Open/bind/listen on the host Unix socket, so we can accept + // host-initiated connections. + let tcp_sock = TcpListener::bind(&tcp_sock_addr) + .and_then(|sock| sock.set_nonblocking(true).map(|_| sock)) + .map_err(VsockError::Backend)?; + info!("vsock tcp opened"); + + Ok(VsockTcpBackend { + tcp_sock, + tcp_sock_addr, + }) + } +} + +impl AsRawFd for VsockTcpBackend { + fn as_raw_fd(&self) -> RawFd { + self.tcp_sock.as_raw_fd() + } +} + +impl VsockBackend for VsockTcpBackend { + fn accept(&mut self) -> std::io::Result> { + let (stream, _) = self.tcp_sock.accept()?; + stream.set_nonblocking(true)?; + + Ok(Box::new(stream)) + } + + // Peer connection doesn't supported by tcp backend yet. 
+ fn connect(&self, _dst_port: u32) -> std::io::Result> { + Err(std::io::Error::new( + std::io::ErrorKind::ConnectionRefused, + "vsock net backend doesn't support incoming connection request", + )) + } + + fn r#type(&self) -> VsockBackendType { + VsockBackendType::Tcp + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +#[cfg(test)] +mod tests { + use std::io::{Read, Write}; + use std::net::TcpStream; + + use super::*; + + #[test] + fn test_tcp_backend_bind() { + let tcp_sock_addr = String::from("127.0.0.2:9000"); + assert!(VsockTcpBackend::new(tcp_sock_addr).is_ok()); + } + + #[test] + fn test_tcp_backend_accept() { + let tcp_sock_addr = String::from("127.0.0.2:9001"); + + let mut vsock_backend = VsockTcpBackend::new(tcp_sock_addr.clone()).unwrap(); + let _stream = TcpStream::connect(&tcp_sock_addr).unwrap(); + + assert!(vsock_backend.accept().is_ok()); + } + + #[test] + fn test_tcp_backend_communication() { + let tcp_sock_addr = String::from("127.0.0.2:9002"); + let test_string = String::from("TEST"); + let mut buffer = [0; 10]; + + let mut vsock_backend = VsockTcpBackend::new(tcp_sock_addr.clone()).unwrap(); + let mut stream_connect = TcpStream::connect(&tcp_sock_addr).unwrap(); + stream_connect.set_nonblocking(true).unwrap(); + let mut stream_backend = vsock_backend.accept().unwrap(); + + assert!(stream_connect + .write(&test_string.clone().into_bytes()) + .is_ok()); + assert!(stream_backend.read(&mut buffer).is_ok()); + assert_eq!(&buffer[0..test_string.len()], test_string.as_bytes()); + + assert!(stream_backend + .write(&test_string.clone().into_bytes()) + .is_ok()); + assert!(stream_connect.read(&mut buffer).is_ok()); + assert_eq!(&buffer[0..test_string.len()], test_string.as_bytes()); + } + + #[test] + fn test_tcp_backend_connect() { + let tcp_sock_addr = String::from("127.0.0.2:9003"); + let vsock_backend = VsockTcpBackend::new(tcp_sock_addr).unwrap(); + // tcp backend don't support peer connection + assert!(vsock_backend.connect(0).is_err()); + } + + 
#[test] + fn test_tcp_backend_type() { + let tcp_sock_addr = String::from("127.0.0.2:9004"); + let vsock_backend = VsockTcpBackend::new(tcp_sock_addr).unwrap(); + assert_eq!(vsock_backend.r#type(), VsockBackendType::Tcp); + } + + #[test] + fn test_tcp_backend_vsock_stream() { + let tcp_sock_addr = String::from("127.0.0.2:9005"); + let _vsock_backend = VsockTcpBackend::new(tcp_sock_addr.clone()).unwrap(); + let vsock_stream = TcpStream::connect(&tcp_sock_addr).unwrap(); + + assert!(vsock_stream.set_nonblocking(true).is_ok()); + assert!(vsock_stream + .set_read_timeout(Some(Duration::from_secs(1))) + .is_ok()); + assert!(vsock_stream.set_read_timeout(None).is_ok()); + assert!(vsock_stream + .set_write_timeout(Some(Duration::from_secs(2))) + .is_ok()); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/unix_stream.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/unix_stream.rs new file mode 100644 index 000000000..7b268b4f4 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/backend/unix_stream.rs @@ -0,0 +1,267 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::any::Any; +use std::io::{Read, Write}; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::{UnixListener, UnixStream}; +use std::time::Duration; + +use log::info; +use sendfd::RecvWithFd; + +use super::super::{Result, VsockError}; +use super::{VsockBackend, VsockBackendType, VsockStream}; + +pub struct HybridUnixStreamBackend { + pub unix_stream: Box, + pub slave_stream: Option>, +} + +impl VsockStream for HybridUnixStreamBackend { + fn backend_type(&self) -> VsockBackendType { + self.unix_stream.backend_type() + } + + fn set_nonblocking(&mut self, nonblocking: bool) -> std::io::Result<()> { + self.unix_stream.set_nonblocking(nonblocking) + } + + fn set_read_timeout(&mut self, dur: Option) -> std::io::Result<()> { + self.unix_stream.set_read_timeout(dur) + } + + fn set_write_timeout(&mut self, dur: Option) -> std::io::Result<()> { + self.unix_stream.set_write_timeout(dur) + } + + fn as_any(&self) -> &dyn Any { + self.unix_stream.as_any() + } + + fn recv_data_fd(&self, bytes: &mut [u8], fds: &mut [RawFd]) -> std::io::Result<(usize, usize)> { + self.unix_stream.recv_data_fd(bytes, fds) + } +} + +impl AsRawFd for HybridUnixStreamBackend { + fn as_raw_fd(&self) -> RawFd { + self.unix_stream.as_raw_fd() + } +} + +impl Read for HybridUnixStreamBackend { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + self.unix_stream.read(buf) + } +} + +impl Write for HybridUnixStreamBackend { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + // The slave stream was only used to reply the connect result "ok ", + // thus it was only used once here, and the data would be replied by the + // main stream. 
+ if let Some(mut stream) = self.slave_stream.take() { + stream.write(buf) + } else { + self.unix_stream.write(buf) + } + } + + fn flush(&mut self) -> std::io::Result<()> { + self.unix_stream.flush() + } +} + +impl VsockStream for UnixStream { + fn backend_type(&self) -> VsockBackendType { + VsockBackendType::UnixStream + } + + fn set_nonblocking(&mut self, nonblocking: bool) -> std::io::Result<()> { + UnixStream::set_nonblocking(self, nonblocking) + } + + fn set_read_timeout(&mut self, dur: Option) -> std::io::Result<()> { + UnixStream::set_read_timeout(self, dur) + } + + fn set_write_timeout(&mut self, dur: Option) -> std::io::Result<()> { + UnixStream::set_write_timeout(self, dur) + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn recv_data_fd(&self, bytes: &mut [u8], fds: &mut [RawFd]) -> std::io::Result<(usize, usize)> { + self.recv_with_fd(bytes, fds) + } +} + +/// The backend implementation that using Unix Stream. +pub struct VsockUnixStreamBackend { + /// The Unix socket, through which host-initiated connections are accepted. + pub(crate) host_sock: UnixListener, + /// The file system path of the host-side Unix socket. + pub(crate) host_sock_path: String, +} + +impl VsockUnixStreamBackend { + pub fn new(host_sock_path: String) -> Result { + info!("Open vsock uds: {}", host_sock_path); + // Open/bind/listen on the host Unix socket, so we can accept + // host-initiated connections. 
+ let host_sock = UnixListener::bind(&host_sock_path) + .and_then(|sock| sock.set_nonblocking(true).map(|_| sock)) + .map_err(VsockError::Backend)?; + info!("vsock uds opened"); + + Ok(VsockUnixStreamBackend { + host_sock, + host_sock_path, + }) + } +} + +impl AsRawFd for VsockUnixStreamBackend { + fn as_raw_fd(&self) -> RawFd { + self.host_sock.as_raw_fd() + } +} + +impl VsockBackend for VsockUnixStreamBackend { + fn accept(&mut self) -> std::io::Result> { + let (stream, _) = self.host_sock.accept()?; + stream.set_nonblocking(true)?; + + Ok(Box::new(stream)) + } + + fn connect(&self, dst_port: u32) -> std::io::Result> { + // We can figure out the path to Unix sockets listening on specific + // ports using `host_sock_path` field. I.e. "_". + let port_path = format!("{}_{}", self.host_sock_path, dst_port); + let stream = UnixStream::connect(port_path)?; + stream.set_nonblocking(true)?; + + Ok(Box::new(stream)) + } + + fn r#type(&self) -> VsockBackendType { + VsockBackendType::UnixStream + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +impl Drop for VsockUnixStreamBackend { + fn drop(&mut self) { + std::fs::remove_file(&self.host_sock_path).ok(); + } +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::io::{Read, Write}; + use std::os::unix::net::UnixStream; + use std::path::Path; + + use super::*; + + #[test] + fn test_unix_backend_bind() { + let host_sock_path = String::from("/tmp/host_sock_path_1"); + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + + assert!(VsockUnixStreamBackend::new(host_sock_path.clone()).is_ok()); + + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + } + + #[test] + fn test_unix_backend_accept() { + let host_sock_path = String::from("/tmp/host_sock_path_2"); + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + + let mut vsock_backend = VsockUnixStreamBackend::new(host_sock_path.clone()).unwrap(); + let _stream = UnixStream::connect(&host_sock_path).unwrap(); + + 
assert!(vsock_backend.accept().is_ok()); + + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + } + + #[test] + fn test_unix_backend_communication() { + let host_sock_path = String::from("/tmp/host_sock_path_3"); + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + let test_string = String::from("TEST"); + let mut buffer = [0; 10]; + + let mut vsock_backend = VsockUnixStreamBackend::new(host_sock_path.clone()).unwrap(); + let mut stream_connect = UnixStream::connect(&host_sock_path).unwrap(); + stream_connect.set_nonblocking(true).unwrap(); + let mut stream_backend = vsock_backend.accept().unwrap(); + + assert!(stream_connect + .write(&test_string.clone().into_bytes()) + .is_ok()); + assert!(stream_backend.read(&mut buffer).is_ok()); + assert_eq!(&buffer[0..test_string.len()], test_string.as_bytes()); + + assert!(stream_backend + .write(&test_string.clone().into_bytes()) + .is_ok()); + assert!(stream_connect.read(&mut buffer).is_ok()); + assert_eq!(&buffer[0..test_string.len()], test_string.as_bytes()); + + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + } + + #[test] + fn test_unix_backend_connect() { + let host_sock_path = String::from("/tmp/host_sock_path_4"); + let local_server_port = 1; + let local_server_path = format!("{host_sock_path}_{local_server_port}"); + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + fs::remove_file(Path::new(&local_server_path)).unwrap_or_default(); + + let _local_listener = UnixListener::bind(&local_server_path).unwrap(); + let vsock_backend = VsockUnixStreamBackend::new(host_sock_path.clone()).unwrap(); + + assert!(vsock_backend.connect(local_server_port).is_ok()); + + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + fs::remove_file(Path::new(&local_server_path)).unwrap_or_default(); + } + + #[test] + fn test_unix_backend_type() { + let host_sock_path = String::from("/tmp/host_sock_path_5"); + 
fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + + let vsock_backend = VsockUnixStreamBackend::new(host_sock_path.clone()).unwrap(); + assert_eq!(vsock_backend.r#type(), VsockBackendType::UnixStream); + + fs::remove_file(Path::new(&host_sock_path)).unwrap_or_default(); + } + + #[test] + fn test_unix_backend_vsock_stream() { + let (sock1, _sock2) = UnixStream::pair().unwrap(); + let mut vsock_stream: Box = Box::new(sock1); + + assert!(vsock_stream.set_nonblocking(true).is_ok()); + assert!(vsock_stream + .set_read_timeout(Some(Duration::from_secs(1))) + .is_ok()); + assert!(vsock_stream.set_read_timeout(None).is_ok()); + assert!(vsock_stream + .set_write_timeout(Some(Duration::from_secs(2))) + .is_ok()); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/connection.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/connection.rs new file mode 100644 index 000000000..e2ca7e333 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/connection.rs @@ -0,0 +1,1282 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +/// The main job of `VsockConnection` is to forward data traffic, back and +/// forth, between a guest-side AF_VSOCK socket and a host-side generic `Read + +/// Write + AsRawFd` stream, while also managing its internal state. To that +/// end, `VsockConnection` implements: +/// - `VsockChannel` for: +/// - moving data from the host stream to a guest-provided RX buffer, via +/// `recv_pkt()`; and +/// - moving data from a guest-provided TX buffer to the host stream, via +/// `send_pkt()`; and +/// - updating its internal state, by absorbing control packets (anything +/// other than VSOCK_OP_RW). +/// - `VsockEpollListener` for getting notified about the availability of data +/// or free buffer space at the host stream. 
+/// +/// Note: there is a certain asymmetry to the RX and TX data flows: +/// - RX transfers do not need any data buffering, since data is read straight +/// from the host stream and into the guest-provided RX buffer; +/// - TX transfers may require some data to be buffered by `VsockConnection`, if +/// the host peer can't keep up with reading the data that we're writing. +/// This is because, once the guest driver provides some data in a virtio +/// TX buffer, the vsock device must consume it. If that data can't be +/// forwarded straight to the host stream, we'll have to store it in a +/// buffer (and flush it at a later time). Vsock flow control ensures that +/// our TX buffer doesn't overflow. +// The code in this file is best read with a fresh memory of the vsock protocol +// inner-workings. To help with that, here is a +// +// Short primer on the vsock protocol +// ---------------------------------- +// +// 1. Establishing a connection A vsock connection is considered established +// after a two-way handshake: +// - the initiating peer sends a connection request packet (`hdr.op` == +// VSOCK_OP_REQUEST); then +// - the listening peer sends back a connection response packet (`hdr.op` == +// VSOCK_OP_RESPONSE). +// +// 2. Terminating a connection When a peer wants to shut down an established +// connection, it sends a VSOCK_OP_SHUTDOWN packet. Two header flags are used +// with VSOCK_OP_SHUTDOWN, indicating the sender's intention: +// - VSOCK_FLAGS_SHUTDOWN_RCV: the sender will receive no more data for this +// connection; and +// - VSOCK_FLAGS_SHUTDOWN_SEND: the sender will send no more data for this +// connection. After a shutdown packet, the receiving peer will have some +// protocol-undefined time to flush its buffers, and then forcefully +// terminate the connection by sending back an RST packet. If the +// shutdown-initiating peer doesn't receive this RST packet during a +// timeout period, it will send one itself, thus terminating the +// connection. 
Note: a peer can send more than one VSOCK_OP_SHUTDOWN +// packets. However, read/write indications cannot be undone. E.g. once a +// "no-more-sending" promise was made, it cannot be taken back. That is, +// `hdr.flags` will be ORed between subsequent VSOCK_OP_SHUTDOWN packets. +// +// 3. Flow control Before sending a data packet (VSOCK_OP_RW), the sender must +// make sure that the receiver has enough free buffer space to store that +// data. If this condition is not respected, the receiving peer's behaviour +// is undefined. In this implementation, we forcefully terminate the +// connection by sending back a VSOCK_OP_RST packet. Note: all buffer space +// information is computed and stored on a per-connection basis. Peers keep +// each other informed about the free buffer space they have by filling in +// two packet header members with each packet they send: +// - `hdr.buf_alloc`: the total buffer space the peer has allocated for +// receiving data; and +// - `hdr.fwd_cnt`: the total number of bytes the peer has successfully +// flushed out of its buffer. One can figure out how much space its peer +// has available in its buffer by inspecting the difference between how +// much it has sent to the peer and how much the peer has flushed out +// (i.e. "forwarded", in the vsock spec terminology): `peer_free = +// peer_buf_alloc - (total_bytes_sent_to_peer - peer_fwd_cnt)`. +// +// Note: the above requires that peers constantly keep each other informed on +// their buffer space situation. However, since there are no receipt +// acknowledgement packets defined for the vsock protocol, packet flow can +// often be unidirectional (just one peer sending data to another), so the +// sender's information about the receiver's buffer space can get quickly +// outdated. The vsock protocol defines two solutions to this problem: 1. +// The sender can explicitly ask for a buffer space (i.e. 
"credit") update +// from its peer, via a VSOCK_OP_CREDIT_REQUEST packet, to which it will +// get a VSOCK_OP_CREDIT_UPDATE response (or any response will do, really, +// since credit information must be included in any packet); 2. The +// receiver can be proactive, and send VSOCK_OP_CREDIT_UPDATE packet, +// whenever it thinks its peer's information is out of date. Our +// implementation uses the proactive approach. +use std::io::{ErrorKind, Read, Write}; +use std::num::Wrapping; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::time::{Duration, Instant}; + +use log::{debug, error, info, warn}; + +use super::super::backend::VsockStream; +use super::super::defs::uapi; +use super::super::packet::VsockPacket; +use super::super::{Result as VsockResult, VsockChannel, VsockEpollListener, VsockError}; +use super::defs; +use super::txbuf::TxBuf; +use super::{ConnState, Error, PendingRx, PendingRxSet, Result}; + +/// A self-managing connection object, that handles communication between a +/// guest-side AF_VSOCK socket and a host-side `Read + Write + AsRawFd` stream. +pub struct VsockConnection { + /// The current connection state. + state: ConnState, + /// The local CID. Most of the time this will be the constant `2` (the vsock + /// host CID). + pub(crate) local_cid: u64, + /// The peer (guest) CID. + pub(crate) peer_cid: u64, + /// The local (host) port. + pub(crate) local_port: u32, + /// The peer (guest) port. + pub(crate) peer_port: u32, + /// The (connected) host-side stream. + pub(crate) stream: Box, + /// The TX buffer for this connection. + tx_buf: TxBuf, + /// Total number of bytes that have been successfully written to + /// `self.stream`, either directly, or flushed from `self.tx_buf`. + fwd_cnt: Wrapping, + /// The amount of buffer space that the peer (guest) has allocated for this + /// connection. + peer_buf_alloc: u32, + /// The total number of bytes that the peer has forwarded away. 
+ peer_fwd_cnt: Wrapping, + /// The total number of bytes sent to the peer (guest vsock driver) + rx_cnt: Wrapping, + /// Our `self.fwd_cnt`, as last sent to the peer. This is used to provide + /// proactive credit updates, and let the peer know it's OK to send more + /// data. + last_fwd_cnt_to_peer: Wrapping, + /// The set of pending RX packet indications that `recv_pkt()` will use to + /// fill in a packet for the peer (guest). + pending_rx: PendingRxSet, + /// Instant when this connection should be scheduled for immediate + /// termination, due to some timeout condition having been fulfilled. + expiry: Option, +} + +impl VsockChannel for VsockConnection { + /// Fill in a vsock packet, to be delivered to our peer (the guest driver). + /// + /// As per the `VsockChannel` trait, this should only be called when there + /// is data to be fetched from the channel (i.e. `has_pending_rx()` is + /// true). Otherwise, it will error out with `VsockError::NoData`. Pending + /// RX indications are set by other mutable actions performed on the + /// channel. For instance, `send_pkt()` could set an Rst indication, if + /// called with a VSOCK_OP_SHUTDOWN packet, or `notify()` could set a Rw + /// indication (a data packet can be fetched from the channel), if data was + /// ready to be read from the host stream. + /// + /// Returns: + /// - `Ok(())`: the packet has been successfully filled in and is ready for + /// delivery; + /// - `Err(VsockError::NoData)`: there was no data available with which to + /// fill in the packet; + /// - `Err(VsockError::PktBufMissing)`: the packet would've been filled in + /// with data, but it is missing the data buffer. + fn recv_pkt(&mut self, pkt: &mut VsockPacket) -> VsockResult<()> { + // Perform some generic initialization that is the same for any packet + // operation (e.g. source, destination, credit, etc). + self.init_pkt(pkt); + + // If forceful termination is pending, there's no point in checking for + // anything else. 
It's dead, Jim. + if self.pending_rx.remove(PendingRx::Rst) { + pkt.set_op(uapi::VSOCK_OP_RST); + return Ok(()); + } + + // Next up: if we're due a connection confirmation, that's all we need + // to know to fill in this packet. + if self.pending_rx.remove(PendingRx::Response) { + self.state = ConnState::Established; + pkt.set_op(uapi::VSOCK_OP_RESPONSE); + return Ok(()); + } + + // Same thing goes for locally-initiated connections that need to yield + // a connection request. + if self.pending_rx.remove(PendingRx::Request) { + self.expiry = + Some(Instant::now() + Duration::from_millis(defs::CONN_REQUEST_TIMEOUT_MS)); + pkt.set_op(uapi::VSOCK_OP_REQUEST); + return Ok(()); + } + + if self.pending_rx.remove(PendingRx::Rw) { + // We're due to produce a data packet, by reading the data from the + // host-side backend. + + match self.state { + // A data packet is only valid for established connections, and + // connections for which our peer has initiated a graceful + // shutdown, but can still receive data. + ConnState::Established | ConnState::PeerClosed(false, _) => (), + _ => { + // Any other connection state is invalid at this point, and + // we need to kill it with fire. + pkt.set_op(uapi::VSOCK_OP_RST); + return Ok(()); + } + } + + // Oh wait, before we start bringing in the big data, can our peer + // handle receiving so much bytey goodness? + if self.need_credit_update_from_peer() { + self.last_fwd_cnt_to_peer = self.fwd_cnt; + pkt.set_op(uapi::VSOCK_OP_CREDIT_REQUEST); + return Ok(()); + } + + let buf = pkt.buf_mut().ok_or(VsockError::PktBufMissing)?; + + // The maximum amount of data we can read in is limited by both the + // RX buffer size and the peer available buffer space. + let max_len = std::cmp::min(buf.len(), self.peer_avail_credit()); + + // Read data from the stream straight to the RX buffer, for maximum throughput. 
+ match self.stream.read(&mut buf[..max_len]) { + Ok(read_cnt) => { + if read_cnt == 0 { + // A 0-length read means the host stream was closed + // down. In that case, we'll ask our peer to shut down + // the connection. We can neither send nor receive any + // more data. + self.state = ConnState::LocalClosed; + self.expiry = Some( + Instant::now() + Duration::from_millis(defs::CONN_SHUTDOWN_TIMEOUT_MS), + ); + pkt.set_op(uapi::VSOCK_OP_SHUTDOWN) + .set_flag(uapi::VSOCK_FLAGS_SHUTDOWN_RCV) + .set_flag(uapi::VSOCK_FLAGS_SHUTDOWN_SEND); + } else { + // On a successful data read, we fill in the packet with + // the RW op, and length of the read data. + pkt.set_op(uapi::VSOCK_OP_RW).set_len(read_cnt as u32); + } + self.rx_cnt += Wrapping(pkt.len()); + self.last_fwd_cnt_to_peer = self.fwd_cnt; + return Ok(()); + } + Err(err) if err.kind() == ErrorKind::WouldBlock => { + // This shouldn't actually happen (receiving EWOULDBLOCK + // after EPOLLIN), but apparently it does, so we need to + // handle it greacefully. + warn!( + "vsock: unexpected EWOULDBLOCK while reading from backing stream: \ + lp={}, pp={}, err={:?}", + self.local_port, self.peer_port, err + ); + } + Err(err) => { + // We are not expecting any other errors when reading from + // the underlying stream. If any show up, we'll immediately + // kill this connection. + error!( + "vsock: error reading from backing stream: lp={}, pp={}, err={:?}", + self.local_port, self.peer_port, err + ); + pkt.set_op(uapi::VSOCK_OP_RST); + self.last_fwd_cnt_to_peer = self.fwd_cnt; + return Ok(()); + } + }; + } + + // A credit update is basically a no-op, so we should only waste a + // perfectly fine RX buffer on it if we really have nothing else to say, + // hence we check for this RX indication last. 
+ if self.pending_rx.remove(PendingRx::CreditUpdate) && !self.has_pending_rx() { + pkt.set_op(uapi::VSOCK_OP_CREDIT_UPDATE); + self.last_fwd_cnt_to_peer = self.fwd_cnt; + return Ok(()); + } + + // We've already checked for all conditions that would have produced a + // packet, so if we got to here, we don't know how to yield one. + Err(VsockError::NoData) + } + + /// Deliver a guest-generated packet to this connection. + /// + /// This forwards the data in RW packets to the host stream, and absorbs + /// control packets, using them to manage the internal connection state. + /// + /// Returns: always `Ok(())`: the packet has been consumed; + fn send_pkt(&mut self, pkt: &VsockPacket) -> VsockResult<()> { + // Update the peer credit information. + self.peer_buf_alloc = pkt.buf_alloc(); + self.peer_fwd_cnt = Wrapping(pkt.fwd_cnt()); + + match self.state { + // Most frequent case: this is an established connection that needs + // to forward some data to the host stream. Also works for a + // connection that has begun shutting down, but the peer still has + // some data to send. + ConnState::Established | ConnState::PeerClosed(_, false) + if pkt.op() == uapi::VSOCK_OP_RW => + { + if pkt.buf().is_none() { + info!( + "vsock: dropping empty data packet from guest (lp={}, pp={}", + self.local_port, self.peer_port + ); + return Ok(()); + } + + // Unwrapping here is safe, since we just checked `pkt.buf()` + // above. + let buf_slice = &pkt.buf().unwrap()[..(pkt.len() as usize)]; + if let Err(err) = self.send_bytes(buf_slice) { + // If we can't write to the host stream, that's an + // unrecoverable error, so we'll terminate this connection. + warn!( + "vsock: error writing to local stream (lp={}, pp={}): {:?}", + self.local_port, self.peer_port, err + ); + self.kill(); + return Ok(()); + } + + // We might've just consumed some data. If that's the case, we + // might need to update the peer on our buffer space situation, + // so that it can keep sending data packets our way. 
+ if self.peer_needs_credit_update() { + self.pending_rx.insert(PendingRx::CreditUpdate); + } + } + + // Next up: receiving a response / confirmation for a host-initiated + // connection. We'll move to an Established state, and pass on the + // good news through the host stream. + ConnState::LocalInit if pkt.op() == uapi::VSOCK_OP_RESPONSE => { + self.expiry = None; + self.state = ConnState::Established; + } + + // The peer wants to shut down an established connection. If they + // have nothing more to send nor receive, and we don't have to wait + // to drain our TX buffer, we can schedule an RST packet (to + // terminate the connection on the next recv call). Otherwise, we'll + // arm the kill timer. + ConnState::Established if pkt.op() == uapi::VSOCK_OP_SHUTDOWN => { + let recv_off = pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_RCV != 0; + let send_off = pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_SEND != 0; + self.state = ConnState::PeerClosed(recv_off, send_off); + if recv_off && send_off { + if self.tx_buf.is_empty() { + self.pending_rx.insert(PendingRx::Rst); + } else { + self.expiry = Some( + Instant::now() + Duration::from_millis(defs::CONN_SHUTDOWN_TIMEOUT_MS), + ); + } + } + } + + // The peer wants to update a shutdown request, with more + // receive/send indications. The same logic as above applies. + ConnState::PeerClosed(ref mut recv_off, ref mut send_off) + if pkt.op() == uapi::VSOCK_OP_SHUTDOWN => + { + *recv_off = *recv_off || (pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_RCV != 0); + *send_off = *send_off || (pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_SEND != 0); + if *recv_off && *send_off && self.tx_buf.is_empty() { + self.pending_rx.insert(PendingRx::Rst); + } + } + + // A credit update from our peer is valid only in a state which + // allows data transfer towards the peer. 
+ ConnState::Established | ConnState::PeerInit | ConnState::PeerClosed(false, _) + if pkt.op() == uapi::VSOCK_OP_CREDIT_UPDATE => + { + // Nothing to do here; we've already updated peer credit. + } + + // A credit request from our peer is valid only in a state which + // allows data transfer from the peer. We'll respond with a credit + // update packet. + ConnState::Established | ConnState::PeerInit | ConnState::PeerClosed(_, false) + if pkt.op() == uapi::VSOCK_OP_CREDIT_REQUEST => + { + self.pending_rx.insert(PendingRx::CreditUpdate); + } + + _ => { + debug!( + "vsock: dropping invalid TX pkt for connection: state={:?}, pkt.hdr={:?}", + self.state, + pkt.hdr() + ); + } + }; + + Ok(()) + } + + /// Check if the connection has any pending packet addressed to the peer. + fn has_pending_rx(&self) -> bool { + !self.pending_rx.is_empty() + } +} + +impl AsRawFd for VsockConnection { + /// Get the file descriptor that this connection wants polled. + /// + /// The connection is interested in being notified about EPOLLIN / EPOLLOUT + /// events on the host stream. + fn as_raw_fd(&self) -> RawFd { + self.stream.as_raw_fd() + } +} + +impl VsockEpollListener for VsockConnection { + /// Get the event set that this connection is interested in. + /// + /// A connection will want to be notified when: + /// - data is available to be read from the host stream, so that it can + /// store an RW pending RX indication; and + /// - data can be written to the host stream, and the TX buffer needs to be + /// flushed. + fn get_polled_evset(&self) -> epoll::Events { + let mut evset = epoll::Events::empty(); + if !self.tx_buf.is_empty() { + // There's data waiting in the TX buffer, so we are interested in + // being notified when writing to the host stream wouldn't block. + evset.insert(epoll::Events::EPOLLOUT); + } + // We're generally interested in being notified when data can be read + // from the host stream, unless we're in a state which doesn't allow + // moving data from host to guest. 
+ match self.state { + ConnState::Killed | ConnState::LocalClosed | ConnState::PeerClosed(true, _) => (), + _ if self.need_credit_update_from_peer() => (), + _ => evset.insert(epoll::Events::EPOLLIN), + } + evset + } + + /// Notify the connection about an event (or set of events) that it was + /// interested in. + fn notify(&mut self, evset: epoll::Events) { + if evset.contains(epoll::Events::EPOLLIN) { + // Data can be read from the host stream. Setting a Rw pending + // indication, so that the muxer will know to call `recv_pkt()` + // later. + self.pending_rx.insert(PendingRx::Rw); + } + + if evset.contains(epoll::Events::EPOLLOUT) { + // Data can be written to the host stream. Time to flush out the TX + // buffer. + if self.tx_buf.is_empty() { + info!("vsock: connection received unexpected EPOLLOUT event"); + return; + } + let flushed = self + .tx_buf + .flush_to(&mut self.stream) + .unwrap_or_else(|err| { + warn!( + "vsock: error flushing TX buf for (lp={}, pp={}): {:?}", + self.local_port, self.peer_port, err + ); + match err { + Error::TxBufFlush(inner) if inner.kind() == ErrorKind::WouldBlock => { + // This should never happen (EWOULDBLOCK after + // EPOLLOUT), but it does, so let's absorb it. + } + _ => self.kill(), + }; + 0 + }); + self.fwd_cnt += Wrapping(flushed as u32); + + // If this connection was shutting down, but is waiting to drain the + // TX buffer before forceful termination, the wait might be over. + if self.state == ConnState::PeerClosed(true, true) && self.tx_buf.is_empty() { + self.pending_rx.insert(PendingRx::Rst); + } else if self.peer_needs_credit_update() { + // If we've freed up some more buffer space, we may need to let + // the peer know it can safely send more data our way. + self.pending_rx.insert(PendingRx::CreditUpdate); + } + } + } +} + +impl VsockConnection { + /// Create a new guest-initiated connection object. 
+ pub fn new_peer_init( + stream: Box, + local_cid: u64, + peer_cid: u64, + local_port: u32, + peer_port: u32, + peer_buf_alloc: u32, + ) -> Self { + Self { + local_cid, + peer_cid, + local_port, + peer_port, + stream, + state: ConnState::PeerInit, + tx_buf: TxBuf::default(), + fwd_cnt: Wrapping(0), + peer_buf_alloc, + peer_fwd_cnt: Wrapping(0), + rx_cnt: Wrapping(0), + last_fwd_cnt_to_peer: Wrapping(0), + pending_rx: PendingRxSet::from(PendingRx::Response), + expiry: None, + } + } + + /// Create a new host-initiated connection object. + pub fn new_local_init( + stream: Box, + local_cid: u64, + peer_cid: u64, + local_port: u32, + peer_port: u32, + ) -> Self { + Self { + local_cid, + peer_cid, + local_port, + peer_port, + stream, + state: ConnState::LocalInit, + tx_buf: TxBuf::default(), + fwd_cnt: Wrapping(0), + peer_buf_alloc: 0, + peer_fwd_cnt: Wrapping(0), + rx_cnt: Wrapping(0), + last_fwd_cnt_to_peer: Wrapping(0), + pending_rx: PendingRxSet::from(PendingRx::Request), + expiry: None, + } + } + + /// Check if there is an expiry (kill) timer set for this connection, + /// sometime in the future. + pub fn will_expire(&self) -> bool { + match self.expiry { + None => false, + Some(t) => t > Instant::now(), + } + } + + /// Check if this connection needs to be scheduled for forceful termination, + /// due to its kill timer having expired. + pub fn has_expired(&self) -> bool { + match self.expiry { + None => false, + Some(t) => t <= Instant::now(), + } + } + + /// Get the kill timer value, if one is set. + pub fn expiry(&self) -> Option { + self.expiry + } + + /// Schedule the connection to be forcefully terminated ASAP (i.e. the next + /// time the connection is asked to yield a packet, via `recv_pkt()`). + pub fn kill(&mut self) { + self.state = ConnState::Killed; + self.pending_rx.insert(PendingRx::Rst); + } + + /// Return the connections state. 
+ pub fn state(&self) -> ConnState { + self.state + } + + /// Send some raw, untracked, data straight to the underlying connected + /// stream. Returns: number of bytes written, or the error describing the + /// write failure. + /// + /// Warning: this will bypass the connection state machine and write + /// directly to the underlying stream. No account of this write is kept, + /// which includes bypassing vsock flow control. + pub fn send_bytes_raw(&mut self, buf: &[u8]) -> Result { + self.stream.write(buf).map_err(Error::StreamWrite) + } + + /// Send some raw data (a byte-slice) to the host stream. + /// + /// Raw data can either be sent straight to the host stream, or to our TX + /// buffer, if the former fails. + fn send_bytes(&mut self, buf: &[u8]) -> Result<()> { + // If there is data in the TX buffer, that means we're already + // registered for EPOLLOUT events on the underlying stream. Therefore, + // there's no point in attempting a write at this point. `self.notify()` + // will get called when EPOLLOUT arrives, and it will attempt to drain + // the TX buffer then. + if !self.tx_buf.is_empty() { + return self.tx_buf.push(buf); + } + + // The TX buffer is empty, so we can try to write straight to the host + // stream. + let written = match self.stream.write(buf) { + Ok(cnt) => cnt, + Err(e) => { + // Absorb any would-block errors, since we can always try again + // later. + if e.kind() == ErrorKind::WouldBlock { + 0 + } else { + // We don't know how to handle any other write error, so + // we'll send it up the call chain. + return Err(Error::StreamWrite(e)); + } + } + }; + // Move the "forwarded bytes" counter ahead by how much we were able to + // send out. + self.fwd_cnt += Wrapping(written as u32); + + // If we couldn't write the whole slice, we'll need to push the + // remaining data to our buffer. 
+ if written < buf.len() { + self.tx_buf.push(&buf[written..])?; + } + + Ok(()) + } + + /// Check if the credit information the peer has last received from us is + /// outdated. + fn peer_needs_credit_update(&self) -> bool { + let peer_seen_free_buf = + Wrapping(defs::CONN_TX_BUF_SIZE) - (self.fwd_cnt - self.last_fwd_cnt_to_peer); + peer_seen_free_buf < Wrapping(defs::CONN_CREDIT_UPDATE_THRESHOLD) + } + + /// Check if we need to ask the peer for a credit update before sending any + /// more data its way. + fn need_credit_update_from_peer(&self) -> bool { + self.peer_avail_credit() == 0 + } + + /// Get the maximum number of bytes that we can send to our peer, without + /// overflowing its buffer. + fn peer_avail_credit(&self) -> usize { + (Wrapping(self.peer_buf_alloc) - (self.rx_cnt - self.peer_fwd_cnt)).0 as usize + } + + /// Prepare a packet header for transmission to our peer. + fn init_pkt<'a>(&self, pkt: &'a mut VsockPacket) -> &'a mut VsockPacket { + // Make sure the header is zeroed-out first. This looks sub-optimal, but + // it is actually optimized-out in the compiled code to be faster than a + // memset(). 
+ for b in pkt.hdr_mut() { + *b = 0; + } + + pkt.set_src_cid(self.local_cid) + .set_dst_cid(self.peer_cid) + .set_src_port(self.local_port) + .set_dst_port(self.peer_port) + .set_type(uapi::VSOCK_TYPE_STREAM) + .set_buf_alloc(defs::CONN_TX_BUF_SIZE) + .set_fwd_cnt(self.fwd_cnt.0) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::any::Any; + use std::io::{Error as IoError, ErrorKind, Read, Result as IoResult, Write}; + use std::os::unix::io::RawFd; + use std::time::{Duration, Instant}; + + use virtio_queue::QueueT; + use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK}; + + use super::super::super::backend::VsockBackendType; + use super::super::super::defs::{uapi, RXQ_EVENT}; + use super::super::super::tests::TestContext; + use super::super::defs as csm_defs; + use super::*; + + const LOCAL_CID: u64 = 2; + const PEER_CID: u64 = 3; + const LOCAL_PORT: u32 = 1002; + const PEER_PORT: u32 = 1003; + const PEER_BUF_ALLOC: u32 = 64 * 1024; + + enum StreamState { + Closed, + Error(ErrorKind), + Ready, + WouldBlock, + } + + pub struct TestStream { + fd: EventFd, + read_buf: Vec, + read_state: StreamState, + write_buf: Vec, + write_state: StreamState, + } + impl TestStream { + pub fn new() -> Self { + Self { + fd: EventFd::new(EFD_NONBLOCK).unwrap(), + read_state: StreamState::Ready, + write_state: StreamState::Ready, + read_buf: Vec::new(), + write_buf: Vec::new(), + } + } + fn new_with_read_buf(buf: &[u8]) -> Self { + let mut stream = Self::new(); + stream.read_buf = buf.to_vec(); + stream + } + } + + impl AsRawFd for TestStream { + fn as_raw_fd(&self) -> RawFd { + self.fd.as_raw_fd() + } + } + + impl Read for TestStream { + fn read(&mut self, data: &mut [u8]) -> IoResult { + match self.read_state { + StreamState::Closed => Ok(0), + StreamState::Error(kind) => Err(IoError::new(kind, "whatevs")), + StreamState::Ready => { + if self.read_buf.is_empty() { + return Err(IoError::new(ErrorKind::WouldBlock, "EAGAIN")); + } + let len = std::cmp::min(data.len(), 
self.read_buf.len()); + assert_ne!(len, 0); + data[..len].copy_from_slice(&self.read_buf[..len]); + self.read_buf = self.read_buf.split_off(len); + Ok(len) + } + StreamState::WouldBlock => Err(IoError::new(ErrorKind::WouldBlock, "EAGAIN")), + } + } + } + + impl Write for TestStream { + fn write(&mut self, data: &[u8]) -> IoResult { + match self.write_state { + StreamState::Closed => Err(IoError::new(ErrorKind::BrokenPipe, "EPIPE")), + StreamState::Error(kind) => Err(IoError::new(kind, "whatevs")), + StreamState::Ready => { + self.write_buf.extend_from_slice(data); + Ok(data.len()) + } + StreamState::WouldBlock => Err(IoError::new(ErrorKind::WouldBlock, "EAGAIN")), + } + } + fn flush(&mut self) -> IoResult<()> { + Ok(()) + } + } + + impl VsockStream for TestStream { + fn backend_type(&self) -> VsockBackendType { + VsockBackendType::Test + } + + fn as_any(&self) -> &dyn Any { + self + } + } + + impl VsockConnection { + /// Get the fwd_cnt value from the connection. + pub(crate) fn fwd_cnt(&self) -> Wrapping { + self.fwd_cnt + } + + /// Forcefully insert a credit update flag. + pub(crate) fn insert_credit_update(&mut self) { + self.pending_rx.insert(PendingRx::CreditUpdate); + } + } + + fn init_pkt(pkt: &mut VsockPacket, op: u16, len: u32) -> &mut VsockPacket { + for b in pkt.hdr_mut() { + *b = 0; + } + pkt.set_src_cid(PEER_CID) + .set_dst_cid(LOCAL_CID) + .set_src_port(PEER_PORT) + .set_dst_port(LOCAL_PORT) + .set_type(uapi::VSOCK_TYPE_STREAM) + .set_buf_alloc(PEER_BUF_ALLOC) + .set_op(op) + .set_len(len) + } + + // This is the connection state machine test context: a helper struct to provide CSM testing + // primitives. A single `VsockPacket` object will be enough for our testing needs. We'll be + // using it for simulating both packet sends and packet receives. We need to keep the vsock + // testing context alive, since `VsockPacket` is just a pointer-wrapper over some data that + // resides in guest memory. 
The vsock test context owns the `GuestMemoryMmap` object, so we'll make + // it a member here, in order to make sure that guest memory outlives our testing packet. A + // single `VsockConnection` object will also suffice for our testing needs. We'll be using a + // specially crafted `Read + Write + AsRawFd` object as a backing stream, so that we can + // control the various error conditions that might arise. + struct CsmTestContext { + _vsock_test_ctx: TestContext, + pkt: VsockPacket, + conn: VsockConnection, + } + + impl CsmTestContext { + fn new_established() -> Self { + Self::new(ConnState::Established) + } + + fn new(conn_state: ConnState) -> Self { + let vsock_test_ctx = TestContext::new(); + let mut handler_ctx = vsock_test_ctx.create_event_handler_context(); + let stream = TestStream::new(); + let mut pkt = VsockPacket::from_rx_virtq_head( + &mut handler_ctx.queues[RXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&vsock_test_ctx.mem) + .unwrap(), + ) + .unwrap(); + let conn = match conn_state { + ConnState::PeerInit => VsockConnection::new_peer_init( + Box::new(stream), + LOCAL_CID, + PEER_CID, + LOCAL_PORT, + PEER_PORT, + PEER_BUF_ALLOC, + ), + ConnState::LocalInit => VsockConnection::new_local_init( + Box::new(stream), + LOCAL_CID, + PEER_CID, + LOCAL_PORT, + PEER_PORT, + ), + ConnState::Established => { + let mut conn = VsockConnection::new_peer_init( + Box::new(stream), + LOCAL_CID, + PEER_CID, + LOCAL_PORT, + PEER_PORT, + PEER_BUF_ALLOC, + ); + assert!(conn.has_pending_rx()); + conn.recv_pkt(&mut pkt).unwrap(); + assert_eq!(pkt.op(), uapi::VSOCK_OP_RESPONSE); + conn + } + other => panic!("invalid ctx state: {:?}", other), + }; + assert_eq!(conn.state, conn_state); + Self { + _vsock_test_ctx: vsock_test_ctx, + pkt, + conn, + } + } + + fn set_stream(&mut self, stream: TestStream) { + self.conn.stream = Box::new(stream); + } + + fn set_peer_credit(&mut self, credit: u32) { + assert!(credit < self.conn.peer_buf_alloc); + self.conn.peer_fwd_cnt = 
Wrapping(0); + self.conn.rx_cnt = Wrapping(self.conn.peer_buf_alloc - credit); + assert_eq!(self.conn.peer_avail_credit(), credit as usize); + } + + fn send(&mut self) { + self.conn.send_pkt(&self.pkt).unwrap(); + } + + fn recv(&mut self) { + self.conn.recv_pkt(&mut self.pkt).unwrap(); + } + + fn notify_epollin(&mut self) { + self.conn.notify(epoll::Events::EPOLLIN); + assert!(self.conn.has_pending_rx()); + } + + fn notify_epollout(&mut self) { + self.conn.notify(epoll::Events::EPOLLOUT); + } + + fn init_pkt(&mut self, op: u16, len: u32) -> &mut VsockPacket { + init_pkt(&mut self.pkt, op, len) + } + + fn init_data_pkt(&mut self, data: &[u8]) -> &VsockPacket { + assert!(data.len() <= self.pkt.buf().unwrap().len()); + self.init_pkt(uapi::VSOCK_OP_RW, data.len() as u32); + self.pkt.buf_mut().unwrap()[..data.len()].copy_from_slice(data); + &self.pkt + } + } + + #[test] + fn test_peer_request() { + let mut ctx = CsmTestContext::new(ConnState::PeerInit); + assert!(ctx.conn.has_pending_rx()); + ctx.recv(); + // For peer-initiated requests, our connection should always yield a vsock reponse packet, + // in order to establish the connection. + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + assert_eq!(ctx.pkt.src_cid(), LOCAL_CID); + assert_eq!(ctx.pkt.dst_cid(), PEER_CID); + assert_eq!(ctx.pkt.src_port(), LOCAL_PORT); + assert_eq!(ctx.pkt.dst_port(), PEER_PORT); + assert_eq!(ctx.pkt.type_(), uapi::VSOCK_TYPE_STREAM); + assert_eq!(ctx.pkt.len(), 0); + // After yielding the response packet, the connection should have transitioned to the + // established state. + assert_eq!(ctx.conn.state, ConnState::Established); + } + + #[test] + fn test_local_request() { + let mut ctx = CsmTestContext::new(ConnState::LocalInit); + // Host-initiated connections should first yield a connection request packet. + assert!(ctx.conn.has_pending_rx()); + // Before yielding the connection request packet, the timeout kill timer shouldn't be + // armed. 
+ assert!(!ctx.conn.will_expire()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_REQUEST); + // Since the request might time-out, the kill timer should now be armed. + assert!(ctx.conn.will_expire()); + assert!(!ctx.conn.has_expired()); + ctx.init_pkt(uapi::VSOCK_OP_RESPONSE, 0); + ctx.send(); + // Upon receiving a connection response, the connection should have transitioned to the + // established state, and the kill timer should've been disarmed. + assert_eq!(ctx.conn.state, ConnState::Established); + assert!(!ctx.conn.will_expire()); + } + + #[test] + fn test_local_request_timeout() { + let mut ctx = CsmTestContext::new(ConnState::LocalInit); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_REQUEST); + assert!(ctx.conn.will_expire()); + assert!(!ctx.conn.has_expired()); + std::thread::sleep(std::time::Duration::from_millis( + defs::CONN_REQUEST_TIMEOUT_MS, + )); + assert!(ctx.conn.has_expired()); + } + + #[test] + fn test_rx_data() { + let mut ctx = CsmTestContext::new_established(); + let data = &[1, 2, 3, 4]; + ctx.set_stream(TestStream::new_with_read_buf(data)); + assert_eq!(ctx.conn.as_raw_fd(), ctx.conn.stream.as_raw_fd()); + ctx.notify_epollin(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RW); + assert_eq!(ctx.pkt.len() as usize, data.len()); + assert_eq!(ctx.pkt.buf().unwrap()[..ctx.pkt.len() as usize], *data); + + // There's no more data in the stream, so `recv_pkt` should yield `VsockError::NoData`. + match ctx.conn.recv_pkt(&mut ctx.pkt) { + Err(VsockError::NoData) => (), + other => panic!("{:?}", other), + } + + // A recv attempt in an invalid state should yield an instant reset packet. 
+ ctx.conn.state = ConnState::LocalClosed; + ctx.notify_epollin(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + } + + #[test] + fn test_local_close() { + let mut ctx = CsmTestContext::new_established(); + let mut stream = TestStream::new(); + stream.read_state = StreamState::Closed; + ctx.set_stream(stream); + ctx.notify_epollin(); + ctx.recv(); + // When the host-side stream is closed, we can neither send not receive any more data. + // Therefore, the vsock shutdown packet that we'll deliver to the guest must contain both + // the no-more-send and the no-more-recv indications. + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_SHUTDOWN); + assert_ne!(ctx.pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_SEND, 0); + assert_ne!(ctx.pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_RCV, 0); + + // The kill timer should now be armed. + assert!(ctx.conn.will_expire()); + assert!( + ctx.conn.expiry().unwrap() + < Instant::now() + Duration::from_millis(defs::CONN_SHUTDOWN_TIMEOUT_MS) + ); + } + + #[test] + fn test_peer_close() { + // Test that send/recv shutdown indications are handled correctly. + // I.e. once set, an indication cannot be reset. + { + let mut ctx = CsmTestContext::new_established(); + + ctx.init_pkt(uapi::VSOCK_OP_SHUTDOWN, 0) + .set_flags(uapi::VSOCK_FLAGS_SHUTDOWN_RCV); + ctx.send(); + assert_eq!(ctx.conn.state, ConnState::PeerClosed(true, false)); + + // Attempting to reset the no-more-recv indication should not work + // (we are only setting the no-more-send indication here). + ctx.pkt.set_flags(uapi::VSOCK_FLAGS_SHUTDOWN_SEND); + ctx.send(); + assert_eq!(ctx.conn.state, ConnState::PeerClosed(true, true)); + } + + // Test case: + // - reading data from a no-more-send connection should work; and + // - writing data should have no effect. 
+ { + let data = &[1, 2, 3, 4]; + let mut ctx = CsmTestContext::new_established(); + ctx.set_stream(TestStream::new_with_read_buf(data)); + ctx.init_pkt(uapi::VSOCK_OP_SHUTDOWN, 0) + .set_flags(uapi::VSOCK_FLAGS_SHUTDOWN_SEND); + ctx.send(); + ctx.notify_epollin(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RW); + assert_eq!(&ctx.pkt.buf().unwrap()[..ctx.pkt.len() as usize], data); + + ctx.init_data_pkt(data); + ctx.send(); + let test_stream = ctx + .conn + .stream + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(test_stream.write_buf.len(), 0); + assert!(ctx.conn.tx_buf.is_empty()); + } + + // Test case: + // - writing data to a no-more-recv connection should work; and + // - attempting to read data from it should yield an RST packet. + { + let mut ctx = CsmTestContext::new_established(); + ctx.init_pkt(uapi::VSOCK_OP_SHUTDOWN, 0) + .set_flags(uapi::VSOCK_FLAGS_SHUTDOWN_RCV); + ctx.send(); + let data = &[1, 2, 3, 4]; + ctx.init_data_pkt(data); + ctx.send(); + let test_stream = ctx + .conn + .stream + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(test_stream.write_buf, data.to_vec()); + + ctx.notify_epollin(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + } + + // Test case: setting both no-more-send and no-more-recv indications should have the + // connection confirm termination (i.e. yield an RST). 
+ { + let mut ctx = CsmTestContext::new_established(); + ctx.init_pkt(uapi::VSOCK_OP_SHUTDOWN, 0) + .set_flags(uapi::VSOCK_FLAGS_SHUTDOWN_RCV | uapi::VSOCK_FLAGS_SHUTDOWN_SEND); + ctx.send(); + assert!(ctx.conn.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + } + } + + #[test] + fn test_local_read_error() { + let mut ctx = CsmTestContext::new_established(); + let mut stream = TestStream::new(); + stream.read_state = StreamState::Error(ErrorKind::PermissionDenied); + ctx.set_stream(stream); + ctx.notify_epollin(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + } + + #[test] + fn test_credit_request_to_peer() { + let mut ctx = CsmTestContext::new_established(); + ctx.set_peer_credit(0); + ctx.notify_epollin(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_CREDIT_REQUEST); + } + + #[test] + fn test_credit_request_from_peer() { + let mut ctx = CsmTestContext::new_established(); + ctx.init_pkt(uapi::VSOCK_OP_CREDIT_REQUEST, 0); + ctx.send(); + assert!(ctx.conn.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_CREDIT_UPDATE); + assert_eq!(ctx.pkt.buf_alloc(), csm_defs::CONN_TX_BUF_SIZE); + assert_eq!(ctx.pkt.fwd_cnt(), ctx.conn.fwd_cnt.0); + } + + #[test] + fn test_credit_update_to_peer() { + let mut ctx = CsmTestContext::new_established(); + + // Force a stale state, where the peer hasn't been updated on our credit situation. + ctx.conn.last_fwd_cnt_to_peer = Wrapping(0); + + // Since a credit update token is sent when the fwd_cnt value exceeds + // CONN_TX_BUF_SIZE - CONN_CREDIT_UPDATE_THRESHOLD, we initialize + // fwd_cnt at 6 bytes below the threshold. + let initial_fwd_cnt = + csm_defs::CONN_TX_BUF_SIZE - csm_defs::CONN_CREDIT_UPDATE_THRESHOLD - 6; + ctx.conn.fwd_cnt = Wrapping(initial_fwd_cnt); + + // Use a 4-byte packet for triggering the credit update threshold. + let data = &[1, 2, 3, 4]; + + // Check that there is no pending RX. 
+ ctx.init_data_pkt(data); + ctx.send(); + assert!(!ctx.conn.has_pending_rx()); + + // Send a packet again. + ctx.init_data_pkt(data); + ctx.send(); + + // The CSM should now have a credit update available for the peer. + assert!(ctx.conn.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_CREDIT_UPDATE); + assert_eq!(ctx.pkt.fwd_cnt(), initial_fwd_cnt + data.len() as u32 * 2); + assert_eq!(ctx.conn.fwd_cnt, ctx.conn.last_fwd_cnt_to_peer); + } + + #[test] + fn test_tx_buffering() { + // Test case: + // - when writing to the backing stream would block, TX data should end up in the TX buf + // - when the CSM is notified that it can write to the backing stream, it should flush + // the TX buf. + { + let mut ctx = CsmTestContext::new_established(); + + let mut stream = TestStream::new(); + stream.write_state = StreamState::WouldBlock; + ctx.set_stream(stream); + + // Send some data through the connection. The backing stream is set to reject writes, + // so the data should end up in the TX buffer. + let data = &[1, 2, 3, 4]; + ctx.init_data_pkt(data); + ctx.send(); + + // When there's data in the TX buffer, the connection should ask to be notified when it + // can write to its backing stream. + assert!(ctx + .conn + .get_polled_evset() + .contains(epoll::Events::EPOLLOUT)); + assert_eq!(ctx.conn.tx_buf.len(), data.len()); + + // Unlock the write stream and notify the connection it can now write its bufferred + // data. + ctx.set_stream(TestStream::new()); + ctx.conn.notify(epoll::Events::EPOLLOUT); + assert!(ctx.conn.tx_buf.is_empty()); + let test_stream = ctx + .conn + .stream + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(test_stream.write_buf, data); + } + } + + #[test] + fn test_stream_write_error() { + // Test case: sending a data packet to a broken / closed backing stream should kill it. 
+ { + let mut ctx = CsmTestContext::new_established(); + let mut stream = TestStream::new(); + stream.write_state = StreamState::Closed; + ctx.set_stream(stream); + + let data = &[1, 2, 3, 4]; + ctx.init_data_pkt(data); + ctx.send(); + + assert_eq!(ctx.conn.state, ConnState::Killed); + assert!(ctx.conn.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + } + + // Test case: notifying a connection that it can flush its TX buffer to a broken stream + // should kill the connection. + { + let mut ctx = CsmTestContext::new_established(); + + let mut stream = TestStream::new(); + stream.write_state = StreamState::WouldBlock; + ctx.set_stream(stream); + + // Send some data through the connection. The backing stream is set to reject writes, + // so the data should end up in the TX buffer. + let data = &[1, 2, 3, 4]; + ctx.init_data_pkt(data); + ctx.send(); + + // Set the backing stream to error out on write. + let mut stream = TestStream::new(); + stream.write_state = StreamState::Closed; + ctx.set_stream(stream); + + assert!(ctx + .conn + .get_polled_evset() + .contains(epoll::Events::EPOLLOUT)); + ctx.notify_epollout(); + assert_eq!(ctx.conn.state, ConnState::Killed); + } + } + + #[test] + fn test_peer_credit_misbehavior() { + let mut ctx = CsmTestContext::new_established(); + + let mut stream = TestStream::new(); + stream.write_state = StreamState::WouldBlock; + ctx.set_stream(stream); + + // Fill up the TX buffer. + let data = vec![0u8; ctx.pkt.buf().unwrap().len()]; + ctx.init_data_pkt(data.as_slice()); + for _i in 0..(csm_defs::CONN_TX_BUF_SIZE / data.len() as u32) { + ctx.send(); + } + + // Then try to send more data. + ctx.send(); + + // The connection should've committed suicide. 
+ assert_eq!(ctx.conn.state, ConnState::Killed); + assert!(ctx.conn.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/mod.rs new file mode 100644 index 000000000..964f4ea9f --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/mod.rs @@ -0,0 +1,131 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +/// This module implements our vsock connection state machine. The heavy lifting +/// is done by `connection::VsockConnection`, while this file only defines some +/// constants and helper structs. +pub(crate) mod connection; +pub mod txbuf; + +pub use self::connection::VsockConnection; + +pub mod defs { + /// Vsock connection TX buffer capacity. + pub const CONN_TX_BUF_SIZE: u32 = 64 * 1024; + + /// When the guest thinks we have less than this amount of free buffer + /// space, we will send them a credit update packet. + pub const CONN_CREDIT_UPDATE_THRESHOLD: u32 = 4 * 1024; + + /// Connection request timeout, in millis. + pub const CONN_REQUEST_TIMEOUT_MS: u64 = 2000; + + /// Connection graceful shutdown timeout, in millis. + pub const CONN_SHUTDOWN_TIMEOUT_MS: u64 = 2000; +} + +#[derive(Debug)] +pub enum Error { + /// Attempted to push data to a full TX buffer. + TxBufFull, + /// An I/O error occurred, when attempting to flush the connection TX + /// buffer. + TxBufFlush(std::io::Error), + /// An I/O error occurred, when attempting to write data to the host-side + /// stream. + StreamWrite(std::io::Error), +} + +type Result = std::result::Result; + +/// A vsock connection state. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum ConnState { + /// The connection has been initiated by the host end, but is yet to be + /// confirmed by the guest. 
+ LocalInit, + /// The connection has been initiated by the guest, but we are yet to + /// confirm it, by sending a response packet (VSOCK_OP_RESPONSE). + PeerInit, + /// The connection handshake has been performed successfully, and data can + /// now be exchanged. + Established, + /// The host (AF_UNIX) socket was closed. + LocalClosed, + /// A VSOCK_OP_SHUTDOWN packet was received from the guest. The tuple + /// represents the guest R/W indication: (will_not_recv_anymore_data, + /// will_not_send_anymore_data). + PeerClosed(bool, bool), + /// The connection is scheduled to be forcefully terminated as soon as + /// possible. + Killed, +} + +/// An RX indication, used by `VsockConnection` to schedule future `recv_pkt()` +/// responses. +/// +/// For instance, after being notified that there is available data to be read +/// from the host stream (via `notify()`), the connection will store a +/// `PendingRx::Rw` to be later inspected by `recv_pkt()`. +#[derive(Clone, Copy, PartialEq, Eq)] +enum PendingRx { + /// We need to yield a connection request packet (VSOCK_OP_REQUEST). + Request = 0, + /// We need to yield a connection response packet (VSOCK_OP_RESPONSE). + Response = 1, + /// We need to yield a forceful connection termination packet (VSOCK_OP_RST). + Rst = 2, + /// We need to yield a data packet (VSOCK_OP_RW), by reading from the + /// AF_UNIX socket. + Rw = 3, + /// We need to yield a credit update packet (VSOCK_OP_CREDIT_UPDATE). + CreditUpdate = 4, +} +impl PendingRx { + /// Transform the enum value into a bitmask, that can be used for set + /// operations. + fn into_mask(self) -> u16 { + 1u16 << (self as u16) + } +} + +/// A set of RX indications (`PendingRx` items). +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PendingRxSet { + data: u16, +} + +impl PendingRxSet { + /// Insert an item into the set. 
+ fn insert(&mut self, it: PendingRx) { + self.data |= it.into_mask(); + } + + /// Remove an item from the set and return: + /// - true, if the item was in the set; or + /// - false, if the item wasn't in the set. + fn remove(&mut self, it: PendingRx) -> bool { + let ret = self.contains(it); + self.data &= !it.into_mask(); + ret + } + + /// Check if an item is present in this set. + fn contains(&self, it: PendingRx) -> bool { + self.data & it.into_mask() != 0 + } + + /// Check if the set is empty. + fn is_empty(&self) -> bool { + self.data == 0 + } +} + +/// Create a set containing only one item. +impl From for PendingRxSet { + fn from(it: PendingRx) -> Self { + Self { + data: it.into_mask(), + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/txbuf.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/txbuf.rs new file mode 100644 index 000000000..ab1241649 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/csm/txbuf.rs @@ -0,0 +1,274 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::io::Write; +use std::num::Wrapping; + +use super::defs; +use super::{Error, Result}; + +/// A simple ring-buffer implementation, used by vsock connections to buffer TX +/// (guest -> host) data. Memory for this buffer is allocated lazily, since +/// buffering will only be needed when the host can't read fast enough. +#[derive(Eq, PartialEq)] +pub struct TxBuf { + /// The actual u8 buffer - only allocated after the first push. + pub data: Option>, + /// Ring-buffer head offset - where new data is pushed to. + pub head: Wrapping, + /// Ring-buffer tail offset - where data is flushed from. + pub tail: Wrapping, +} + +impl TxBuf { + /// Total buffer size, in bytes. + const SIZE: usize = defs::CONN_TX_BUF_SIZE as usize; + + /// Get the used length of this buffer - number of bytes that have been + /// pushed in, but not yet flushed out. 
+ pub fn len(&self) -> usize { + (self.head - self.tail).0 as usize + } + + /// Push a byte slice onto the ring-buffer. + /// + /// Either the entire source slice will be pushed to the ring-buffer, or + /// none of it, if there isn't enough room, in which case + /// `Err(Error::TxBufFull)` is returned. + pub fn push(&mut self, src: &[u8]) -> Result<()> { + // Error out if there's no room to push the entire slice. + if self.len() + src.len() > Self::SIZE { + return Err(Error::TxBufFull); + } + + let data = self + .data + .get_or_insert_with(|| vec![0u8; Self::SIZE].into_boxed_slice()); + + // Buffer head, as an offset into the data slice. + let head_ofs = self.head.0 as usize % Self::SIZE; + + // Pushing a slice to this buffer can take either one or two slice + // copies: - one copy, if the slice fits between `head_ofs` and + // `Self::SIZE`; or - two copies, if the ring-buffer head wraps around. + + // First copy length: we can only go from the head offset up to the + // total buffer size. + let len = std::cmp::min(Self::SIZE - head_ofs, src.len()); + data[head_ofs..(head_ofs + len)].copy_from_slice(&src[..len]); + + // If the slice didn't fit, the buffer head will wrap around, and + // pushing continues from the start of the buffer (`&self.data[0]`). + if len < src.len() { + data[..(src.len() - len)].copy_from_slice(&src[len..]); + } + + // Either way, we've just pushed exactly `src.len()` bytes, so that's + // the amount by which the (wrapping) buffer head needs to move forward. + self.head += Wrapping(src.len() as u32); + + Ok(()) + } + + /// Flush the contents of the ring-buffer to a writable stream. + /// + /// Return the number of bytes that have been transferred out of the + /// ring-buffer and into the writable stream. + pub fn flush_to(&mut self, sink: &mut W) -> Result + where + W: Write, + { + // Nothing to do, if this buffer holds no data. + if self.is_empty() { + return Ok(0); + } + + // Buffer tail, as an offset into the buffer data slice. 
+ let tail_ofs = self.tail.0 as usize % Self::SIZE; + + // Flushing the buffer can take either one or two writes: + // - one write, if the tail doesn't need to wrap around to reach the + // head; or + // - two writes, if the tail would wrap around: tail to slice end, then + // slice end to head. + + // First write length: the lesser of tail to slice end, or tail to head. + let len_to_write = std::cmp::min(Self::SIZE - tail_ofs, self.len()); + + // It's safe to unwrap here, since we've already checked if the buffer + // was empty. + let data = self.data.as_ref().unwrap(); + + // Issue the first write and absorb any `WouldBlock` error (we can just + // try again later). + let written = sink + .write(&data[tail_ofs..(tail_ofs + len_to_write)]) + .map_err(Error::TxBufFlush)?; + + // Move the buffer tail ahead by the amount (of bytes) we were able to + // flush out. + self.tail += Wrapping(written as u32); + + // If we weren't able to flush out as much as we tried, there's no point + // in attempting our second write. + if written < len_to_write { + return Ok(written); + } + + // Attempt our second write. This will return immediately if a second + // write isn't needed, since checking for an empty buffer is the first + // thing we do in this function. + // + // Interesting corner case: if we've already written some data in the + // first pass, and then the second write fails, we will consider the + // flush action a success and return the number of bytes written in the + // first pass. + Ok(written + self.flush_to(sink).unwrap_or(0)) + } + + /// Check if the buffer holds any data that hasn't yet been flushed out. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl Default for TxBuf { + /// Ring-buffer constructor. 
+ fn default() -> Self { + Self { + data: None, + head: Wrapping(0), + tail: Wrapping(0), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Error as IoError; + use std::io::Result as IoResult; + use std::io::{ErrorKind, Write}; + + struct TestSink { + data: Vec, + err: Option, + capacity: usize, + } + + impl TestSink { + const DEFAULT_CAPACITY: usize = 2 * TxBuf::SIZE; + fn new() -> Self { + Self { + data: Vec::with_capacity(Self::DEFAULT_CAPACITY), + err: None, + capacity: Self::DEFAULT_CAPACITY, + } + } + } + + impl Write for TestSink { + fn write(&mut self, src: &[u8]) -> IoResult { + if self.err.is_some() { + return Err(self.err.take().unwrap()); + } + let len_to_push = std::cmp::min(self.capacity - self.data.len(), src.len()); + self.data.extend_from_slice(&src[..len_to_push]); + Ok(len_to_push) + } + fn flush(&mut self) -> IoResult<()> { + Ok(()) + } + } + + impl TestSink { + fn clear(&mut self) { + self.data = Vec::with_capacity(self.capacity); + self.err = None; + } + fn set_err(&mut self, err: IoError) { + self.err = Some(err); + } + fn set_capacity(&mut self, capacity: usize) { + self.capacity = capacity; + if self.data.len() > self.capacity { + self.data.resize(self.capacity, 0); + } + } + } + + #[test] + fn test_push_nowrap() { + let mut txbuf = TxBuf::default(); + let mut sink = TestSink::new(); + assert!(txbuf.is_empty()); + + assert!(txbuf.data.is_none()); + txbuf.push(&[1, 2, 3, 4]).unwrap(); + txbuf.push(&[5, 6, 7, 8]).unwrap(); + txbuf.flush_to(&mut sink).unwrap(); + assert_eq!(sink.data, [1, 2, 3, 4, 5, 6, 7, 8]); + } + + #[test] + fn test_push_wrap() { + let mut txbuf = TxBuf::default(); + let mut sink = TestSink::new(); + let mut tmp: Vec = Vec::new(); + + tmp.resize(TxBuf::SIZE - 2, 0); + txbuf.push(tmp.as_slice()).unwrap(); + txbuf.flush_to(&mut sink).unwrap(); + sink.clear(); + + txbuf.push(&[1, 2, 3, 4]).unwrap(); + assert_eq!(txbuf.flush_to(&mut sink).unwrap(), 4); + assert_eq!(sink.data, [1, 2, 3, 4]); + } + + #[test] 
+ fn test_push_error() { + let mut txbuf = TxBuf::default(); + let mut tmp = Vec::with_capacity(TxBuf::SIZE); + + tmp.resize(TxBuf::SIZE - 1, 0); + txbuf.push(tmp.as_slice()).unwrap(); + match txbuf.push(&[1, 2]) { + Err(Error::TxBufFull) => (), + other => panic!("Unexpected result: {:?}", other), + } + } + + #[test] + fn test_incomplete_flush() { + let mut txbuf = TxBuf::default(); + let mut sink = TestSink::new(); + + sink.set_capacity(2); + txbuf.push(&[1, 2, 3, 4]).unwrap(); + assert_eq!(txbuf.flush_to(&mut sink).unwrap(), 2); + assert_eq!(txbuf.len(), 2); + assert_eq!(sink.data, [1, 2]); + + sink.set_capacity(4); + assert_eq!(txbuf.flush_to(&mut sink).unwrap(), 2); + assert!(txbuf.is_empty()); + assert_eq!(sink.data, [1, 2, 3, 4]); + } + + #[test] + fn test_flush_error() { + const EACCESS: i32 = 13; + + let mut txbuf = TxBuf::default(); + let mut sink = TestSink::new(); + + txbuf.push(&[1, 2, 3, 4]).unwrap(); + let io_err = IoError::from_raw_os_error(EACCESS); + sink.set_err(io_err); + match txbuf.flush_to(&mut sink) { + Err(Error::TxBufFlush(ref err)) if err.kind() == ErrorKind::PermissionDenied => (), + other => panic!("Unexpected result: {:?}", other), + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/device.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/device.rs new file mode 100644 index 000000000..66f5b8bf5 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/device.rs @@ -0,0 +1,369 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+use std::any::Any; +use std::marker::PhantomData; +use std::sync::Arc; + +use dbs_device::resources::ResourceConstraint; +use dbs_utils::epoll_manager::{EpollManager, SubscriberId}; +use log::debug; +use log::trace; +use log::warn; +use virtio_queue::QueueT; +use vm_memory::GuestAddressSpace; +use vm_memory::GuestMemoryRegion; + +use super::backend::VsockBackend; +use super::defs::uapi; +use super::epoll_handler::VsockEpollHandler; +use super::muxer::{Error as MuxerError, VsockGenericMuxer, VsockMuxer}; +use super::{Result, VsockError}; +use crate::device::{VirtioDeviceConfig, VirtioDeviceInfo}; +use crate::{ActivateResult, ConfigResult, DbsGuestAddressSpace, VirtioDevice}; + +const VSOCK_DRIVER_NAME: &str = "virtio-vsock"; +const VSOCK_CONFIG_SPACE_SIZE: usize = 8; +const VSOCK_AVAIL_FEATURES: u64 = + 1u64 << uapi::VIRTIO_F_VERSION_1 | 1u64 << uapi::VIRTIO_F_IN_ORDER; + +/// This is the `VirtioDevice` implementation for our vsock device. It handles +/// the virtio-level device logic: feature negociation, device configuration, +/// and device activation. The run-time device logic (i.e. event-driven data +/// handling) is implemented by `super::epoll_handler::EpollHandler`. +/// +/// The vsock device has two input parameters: a CID to identify the device, and +/// a `VsockBackend` to use for offloading vsock traffic. +/// +/// Upon its activation, the vsock device creates its `EpollHandler`, passes it +/// the event-interested file descriptors, and registers these descriptors with +/// the VMM `EpollContext`. Going forward, the `EpollHandler` will get notified +/// whenever an event occurs on the just-registered FDs: +/// - an RX queue FD; +/// - a TX queue FD; +/// - an event queue FD; and +/// - a backend FD. 
+pub struct Vsock { + cid: u64, + queue_sizes: Arc>, + device_info: VirtioDeviceInfo, + subscriber_id: Option, + muxer: Option, + phantom: PhantomData, +} + +// Default muxer implementation of Vsock +impl Vsock { + /// Create a new virtio-vsock device with the given VM CID and vsock + /// backend. + pub fn new(cid: u64, queue_sizes: Arc>, epoll_mgr: EpollManager) -> Result { + let muxer = VsockMuxer::new(cid).map_err(VsockError::Muxer)?; + Self::new_with_muxer(cid, queue_sizes, epoll_mgr, muxer) + } +} + +impl Vsock { + pub(crate) fn new_with_muxer( + cid: u64, + queue_sizes: Arc>, + epoll_mgr: EpollManager, + muxer: M, + ) -> Result { + let mut config_space = Vec::with_capacity(VSOCK_CONFIG_SPACE_SIZE); + for i in 0..VSOCK_CONFIG_SPACE_SIZE { + config_space.push((cid >> (8 * i as u64)) as u8); + } + + Ok(Vsock { + cid, + queue_sizes: queue_sizes.clone(), + device_info: VirtioDeviceInfo::new( + VSOCK_DRIVER_NAME.to_string(), + VSOCK_AVAIL_FEATURES, + queue_sizes, + config_space, + epoll_mgr, + ), + subscriber_id: None, + muxer: Some(muxer), + phantom: PhantomData, + }) + } + + fn id(&self) -> &str { + &self.device_info.driver_name + } + + /// add backend for vsock muxer + // NOTE: Backend is not allowed to add when vsock device is activated. 
+ pub fn add_backend(&mut self, backend: Box, is_default: bool) -> Result<()> { + if let Some(muxer) = self.muxer.as_mut() { + muxer + .add_backend(backend, is_default) + .map_err(VsockError::Muxer) + } else { + Err(VsockError::Muxer(MuxerError::BackendAddAfterActivated)) + } + } +} + +impl VirtioDevice for Vsock +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send + 'static, + R: GuestMemoryRegion + Sync + Send + 'static, + M: VsockGenericMuxer + 'static, +{ + fn device_type(&self) -> u32 { + uapi::VIRTIO_ID_VSOCK + } + + fn queue_max_sizes(&self) -> &[u16] { + &self.queue_sizes + } + + fn get_avail_features(&self, page: u32) -> u32 { + self.device_info.get_avail_features(page) + } + + fn set_acked_features(&mut self, page: u32, value: u32) { + trace!(target: "virtio-vsock", "{}: VirtioDevice::set_acked_features({}, 0x{:x})", + self.id(), page, value + ); + self.device_info.set_acked_features(page, value) + } + + fn read_config(&mut self, offset: u64, data: &mut [u8]) -> ConfigResult { + trace!(target: "virtio-vsock", "{}: VirtioDevice::read_config(0x{:x}, {:?})", + self.id(), offset, data); + self.device_info.read_config(offset, data) + } + + fn write_config(&mut self, offset: u64, data: &[u8]) -> ConfigResult { + trace!(target: "virtio-vsock", "{}: VirtioDevice::write_config(0x{:x}, {:?})", + self.id(), offset, data); + self.device_info.write_config(offset, data) + } + + fn activate(&mut self, config: VirtioDeviceConfig) -> ActivateResult { + trace!(target: "virtio-vsock", "{}: VirtioDevice::activate()", self.id()); + + self.device_info.check_queue_sizes(&config.queues[..])?; + let handler: VsockEpollHandler = VsockEpollHandler::new( + config, + self.id().to_owned(), + self.cid, + // safe to unwrap, because we create muxer using New() + self.muxer.take().unwrap(), + ); + + self.subscriber_id = Some(self.device_info.register_event_handler(Box::new(handler))); + + Ok(()) + } + + fn get_resource_requirements( + &self, + requests: &mut Vec, + use_generic_irq: bool, 
+ ) { + trace!(target: "virtio-vsock", "{}: VirtioDevice::get_resource_requirements()", self.id()); + + requests.push(ResourceConstraint::LegacyIrq { irq: None }); + if use_generic_irq { + requests.push(ResourceConstraint::GenericIrq { + size: (self.queue_sizes.len() + 1) as u32, + }); + } + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn remove(&mut self) { + let subscriber_id = self.subscriber_id.take(); + if let Some(subscriber_id) = subscriber_id { + match self.device_info.remove_event_handler(subscriber_id) { + Ok(_) => debug!("virtio-vsock: removed subscriber_id {:?}", subscriber_id), + Err(err) => warn!("virtio-vsock: failed to remove event handler: {:?}", err), + }; + } else { + self.muxer.take(); + } + } +} + +#[cfg(test)] +mod tests { + use dbs_device::resources::DeviceResources; + use dbs_interrupt::NoopNotifier; + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::{GuestAddress, GuestMemoryMmap, GuestRegionMmap}; + + use super::super::defs::uapi; + use super::super::tests::{test_bytes, TestContext}; + use super::*; + use crate::device::VirtioDeviceConfig; + use crate::VirtioQueueConfig; + + impl Vsock { + pub fn mock_activate( + &mut self, + config: VirtioDeviceConfig, + ) -> Result> { + trace!(target: "virtio-vsock", "{}: VirtioDevice::activate_re()", self.id()); + + self.device_info + .check_queue_sizes(&config.queues[..]) + .unwrap(); + let handler: VsockEpollHandler = + VsockEpollHandler::new( + config, + self.id().to_owned(), + self.cid, + // safe to unwrap, because we create muxer using New() + self.muxer.take().unwrap(), + ); + + Ok(handler) + } + } + + #[test] + fn test_virtio_device() { + let mut ctx = TestContext::new(); + let device_features = VSOCK_AVAIL_FEATURES; + let driver_features: u64 = VSOCK_AVAIL_FEATURES | 1 | (1 << 32); + let device_pages = [ + (device_features & 0xffff_ffff) as u32, + (device_features >> 32) as u32, + ]; + let driver_pages = [ + 
(driver_features & 0xffff_ffff) as u32, + (driver_features >> 32) as u32, + ]; + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::device_type( + &ctx.device + ), + uapi::VIRTIO_ID_VSOCK + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features( + &ctx.device, 0 + ), + device_pages[0] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features( + &ctx.device, 1 + ), + device_pages[1] + ); + assert_eq!( + VirtioDevice::>, QueueSync, GuestRegionMmap>::get_avail_features( + &ctx.device, 2 + ), + 0 + ); + + // Ack device features, page 0. + ctx.device + .device_info + .set_acked_features(0, driver_pages[0]); + // Ack device features, page 1. + ctx.device + .device_info + .set_acked_features(1, driver_pages[1]); + // Ack some bogus page (i.e. 2). This should have no side effect. + ctx.device.device_info.set_acked_features(2, 0); + // Attempt to un-ack the first feature page. This should have no side effect. + ctx.device + .device_info + .set_acked_features(0, !driver_pages[0]); + // Check that no side effect are present, and that the acked features are exactly the same + // as the device features. + assert_eq!( + ctx.device.device_info.acked_features(), + device_features & driver_features + ); + + // Test reading 32-bit chunks. + let mut data = [0u8; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut ctx.device, + 0, + &mut data[..4], + ) + .unwrap(); + test_bytes(&data[..], &(ctx.cid & 0xffff_ffff).to_le_bytes()); + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut ctx.device, + 4, + &mut data[4..], + ) + .unwrap(); + test_bytes(&data[4..], &((ctx.cid >> 32) & 0xffff_ffff).to_le_bytes()); + + // Test reading 64-bit. + let mut data = [0u8; 8]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut ctx.device, + 0, + &mut data, + ) + .unwrap(); + test_bytes(&data, &ctx.cid.to_le_bytes()); + + // Check out-of-bounds reading. 
+ let mut data = [0u8, 1, 2, 3, 4, 5, 6, 7]; + VirtioDevice::>, QueueSync, GuestRegionMmap>::read_config( + &mut ctx.device, + 2, + &mut data, + ) + .unwrap(); + assert_eq!(data, [0u8, 0, 0, 0, 0, 0, 6, 7]); + + // Just covering lines here, since the vsock device has no writable config. + // A warning is, however, logged, if the guest driver attempts to write any config data. + VirtioDevice::>, QueueSync, GuestRegionMmap>::write_config( + &mut ctx.device, + 0, + &data[..4], + ) + .unwrap(); + + let mem = GuestMemoryMmap::<()>::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); + let queues = vec![ + VirtioQueueConfig::::create(2, 0).unwrap(), + VirtioQueueConfig::::create(2, 0).unwrap(), + VirtioQueueConfig::::create(2, 0).unwrap(), + ]; + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>>::new( + Arc::new(mem), + vm_fd, + resources, + queues, + None, + Arc::new(NoopNotifier::new()), + ); + + // Test activation. + ctx.device.activate(config).unwrap(); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/epoll_handler.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/epoll_handler.rs new file mode 100644 index 000000000..5ffcd23c5 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/epoll_handler.rs @@ -0,0 +1,629 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +use std::ops::Deref; + +use dbs_utils::epoll_manager::{EventOps, EventSet, Events, MutEventSubscriber}; +use log::{error, trace, warn}; +use virtio_queue::{QueueOwnedT, QueueSync, QueueT}; +use vm_memory::{GuestMemoryRegion, GuestRegionMmap}; + +use super::defs; +use super::muxer::{VsockGenericMuxer, VsockMuxer}; +use super::packet::VsockPacket; +use crate::device::VirtioDeviceConfig; +use crate::{DbsGuestAddressSpace, Result as VirtIoResult}; + +const QUEUE_RX: usize = 0; +const QUEUE_TX: usize = 1; +const QUEUE_CFG: usize = 2; + +// TODO: Detect / handle queue deadlock: +// 1. If `self.backend.send_pkt()` errors out, TX queue processing will halt. +// Try to process any pending backend RX, then try TX again. If it fails +// again, we have a deadlock. +// 2. If the driver halts RX queue processing, we'll need to notify +// `self.backend`, so that it can unregister any EPOLLIN listeners, since +// otherwise it will keep spinning, unable to consume its EPOLLIN events. + +/// The vsock `EpollHandler` implements the runtime logic of our vsock device: +/// 1. Respond to TX queue events by wrapping virtio buffers into +/// `VsockPacket`s, then sending those packets to the `VsockBackend`; +/// 2. Forward backend FD event notifications to the `VsockBackend`; +/// 3. Fetch incoming packets from the `VsockBackend` and place them into the +/// virtio RX queue; +/// 4. Whenever we have processed some virtio buffers (either TX or RX), let the +/// driver know by raising our assigned IRQ. +/// +/// In a nutshell, the `EpollHandler` logic looks like this: +/// - on TX queue event: +/// - fetch all packets from the TX queue and send them to the backend; then +/// - if the backend has queued up any incoming packets, fetch them into any +/// available RX buffers. +/// - on RX queue event: +/// - fetch any incoming packets, queued up by the backend, into newly +/// available RX buffers. 
+/// - on backend event: +/// - forward the event to the backend; then +/// - again, attempt to fetch any incoming packets queued by the backend into +/// virtio RX buffers. +pub struct VsockEpollHandler< + AS: DbsGuestAddressSpace, + Q: QueueT + Send = QueueSync, + R: GuestMemoryRegion = GuestRegionMmap, + M: VsockGenericMuxer = VsockMuxer, +> { + pub(crate) config: VirtioDeviceConfig, + id: String, + pub(crate) muxer: M, + _cid: u64, +} + +impl VsockEpollHandler +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send, + R: GuestMemoryRegion, + M: VsockGenericMuxer, +{ + pub fn new(config: VirtioDeviceConfig, id: String, cid: u64, muxer: M) -> Self { + VsockEpollHandler { + config, + id, + _cid: cid, + muxer, + } + } + + /// Signal the guest driver that we've used some virtio buffers that it had + /// previously made available. + pub(crate) fn signal_used_queue(&self, idx: usize) -> VirtIoResult<()> { + trace!("{}: raising IRQ", self.id); + self.config.queues[idx].notify().map_err(|e| { + error!("{}: failed to signal used queue {}, {:?}", self.id, idx, e); + e + }) + } + + /// Walk the driver-provided RX queue buffers and attempt to fill them up + /// with any data that we have pending. + fn process_rx(&mut self, mem: &AS::M) { + trace!("{}: epoll_handler::process_rx()", self.id); + let mut raise_irq = false; + { + let rxvq = &mut self.config.queues[QUEUE_RX].queue_mut().lock(); + loop { + let mut iter = match rxvq.iter(mem) { + Err(e) => { + error!("{}: failed to process rx queue. {}", self.id, e); + return; + } + Ok(iter) => iter, + }; + + if let Some(mut desc_chain) = iter.next() { + let used_len = match VsockPacket::from_rx_virtq_head(&mut desc_chain) { + Ok(mut pkt) => { + if self.muxer.recv_pkt(&mut pkt).is_ok() { + pkt.hdr().len() as u32 + pkt.len() + } else { + // We are using a consuming iterator over the virtio buffers, so, if we + // can't fill in this buffer, we'll need to undo the last iterator step. 
+ iter.go_to_previous_position(); + break; + } + } + Err(e) => { + warn!("{}: RX queue error: {:?}", self.id, e); + 0 + } + }; + + raise_irq = true; + let _ = rxvq.add_used(mem, desc_chain.head_index(), used_len); + } else { + break; + } + } + } + if raise_irq { + if let Err(e) = self.signal_used_queue(QUEUE_RX) { + error!("{}: failed to notify guest for RX queue, {:?}", self.id, e); + } + } + } + + /// Walk the dirver-provided TX queue buffers, package them up as vsock + /// packets, and send them to the backend for processing. + fn process_tx(&mut self, mem: &AS::M) { + trace!("{}: epoll_handler::process_tx()", self.id); + let mut have_used = false; + + { + let txvq = &mut self.config.queues[QUEUE_TX].queue_mut().lock(); + + loop { + let mut iter = match txvq.iter(mem) { + Err(e) => { + error!("{}: failed to process tx queue. {}", self.id, e); + return; + } + Ok(iter) => iter, + }; + + if let Some(mut desc_chain) = iter.next() { + let pkt = match VsockPacket::from_tx_virtq_head(&mut desc_chain) { + Ok(pkt) => pkt, + Err(e) => { + error!("{}: error reading TX packet: {:?}", self.id, e); + have_used = true; + let _ = txvq.add_used(mem, desc_chain.head_index(), 0); + continue; + } + }; + + if self.muxer.send_pkt(&pkt).is_err() { + iter.go_to_previous_position(); + break; + } + + have_used = true; + let _ = txvq.add_used(mem, desc_chain.head_index(), 0); + } else { + break; + } + } + } + if have_used { + if let Err(e) = self.signal_used_queue(QUEUE_TX) { + error!("{}: failed to notify guest for TX queue, {:?}", self.id, e); + } + } + } + + pub(crate) fn handle_rxq_event(&mut self, mem: &AS::M) { + trace!("{}: handle RX queue event", self.id); + if let Err(e) = self.config.queues[QUEUE_RX].consume_event() { + error!("{}: failed to consume rx queue event, {:?}", self.id, e); + } else if self.muxer.has_pending_rx() { + self.process_rx(mem); + } + } + + pub(crate) fn handle_txq_event(&mut self, mem: &AS::M) { + trace!("{}: handle TX queue event", self.id); + if let 
Err(e) = self.config.queues[QUEUE_TX].consume_event() { + error!("{}: failed to consume tx queue event, {:?}", self.id, e); + } else { + self.process_tx(mem); + // The backend may have queued up responses to the packets + // we sent during TX queue processing. If that happened, we + // need to fetch those responses and place them into RX + // buffers. + if self.muxer.has_pending_rx() { + self.process_rx(mem); + } + } + } + + fn handle_evq_event(&mut self, _mem: &AS::M) { + trace!("{}: handle event queue event", self.id); + if let Err(e) = self.config.queues[QUEUE_CFG].consume_event() { + error!("{}: failed to consume config queue event, {:?}", self.id, e); + } + } + + pub(crate) fn notify_backend_event(&mut self, events: &Events, mem: &AS::M) { + trace!("{}: backend event", self.id); + let events = epoll::Events::from_bits(events.event_set().bits()).unwrap(); + self.muxer.notify(events); + // After the backend has been kicked, it might've freed up some + // resources, so we can attempt to send it more data to process. In + // particular, if `self.backend.send_pkt()` halted the TX queue + // processing (by reurning an error) at some point in the past, now is + // the time to try walking the TX queue again. + self.process_tx(mem); + // This event may have caused some packets to be queued up by the + // backend. Make sure they are processed. 
+ if self.muxer.has_pending_rx() { + self.process_rx(mem); + } + } +} + +impl MutEventSubscriber for VsockEpollHandler +where + AS: DbsGuestAddressSpace, + Q: QueueT + Send, + R: GuestMemoryRegion, + M: VsockGenericMuxer + 'static, +{ + fn process(&mut self, events: Events, _ops: &mut EventOps) { + let guard = self.config.lock_guest_memory(); + let mem = guard.deref(); + + match events.data() { + defs::RXQ_EVENT => self.handle_rxq_event(mem), + defs::TXQ_EVENT => self.handle_txq_event(mem), + defs::EVQ_EVENT => self.handle_evq_event(mem), + defs::BACKEND_EVENT => self.notify_backend_event(&events, mem), + _ => error!("{}: unknown epoll event slot {}", self.id, events.data()), + } + } + + fn init(&mut self, ops: &mut EventOps) { + trace!("{}: VsockEpollHandler::init()", self.id); + + let events = Events::with_data( + self.config.queues[QUEUE_RX].eventfd.as_ref(), + defs::RXQ_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register epoll event for RX queue, {:?}.", + self.id, e + ); + } + + let events = Events::with_data( + self.config.queues[QUEUE_TX].eventfd.as_ref(), + defs::TXQ_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register epoll event for TX queue, {:?}.", + self.id, e + ); + } + + let events = Events::with_data( + self.config.queues[QUEUE_CFG].eventfd.as_ref(), + defs::EVQ_EVENT, + EventSet::IN, + ); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register epoll event for config queue, {:?}.", + self.id, e + ); + } + + let be_fd = self.muxer.as_raw_fd(); + let be_evset = EventSet::from_bits(self.muxer.get_polled_evset().bits()).unwrap(); + let events = Events::with_data_raw(be_fd, defs::BACKEND_EVENT, be_evset); + if let Err(e) = ops.add(events) { + error!( + "{}: failed to register epoll event for backend fd: {:?}, {:?}.", + self.id, be_fd, e + ); + } + } +} + +#[cfg(test)] +mod tests { + use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap}; + use 
vmm_sys_util::epoll::EventSet; + + use super::super::packet::VSOCK_PKT_HDR_SIZE; + use super::super::tests::TestContext; + use super::super::VsockError; + use super::*; + + #[test] + fn test_irq() { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + assert!(ctx.signal_used_queue(0).is_ok()); + } + + #[test] + fn test_txq_event() { + // Test case: + // - the driver has something to send (there's data in the TX queue); + // and + // - the backend has no pending RX data. + { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(false); + } + ctx.signal_txq_event(); + + // The available TX descriptor should have been used. + assert_eq!(ctx.guest_txvq.used.idx().load(), 1); + // The available RX descriptor should be untouched. + assert_eq!(ctx.guest_rxvq.used.idx().load(), 0); + } + + // Test case: + // - the driver has something to send (there's data in the TX queue); + // and + // - the backend also has some pending RX data. + { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(true); + } + ctx.signal_txq_event(); + + // Both available RX and TX descriptors should have been used. + assert_eq!(ctx.guest_txvq.used.idx().load(), 1); + assert_eq!(ctx.guest_rxvq.used.idx().load(), 1); + } + + // Test case: + // - the driver has something to send (there's data in the TX queue); + // and + // - the backend errors out and cannot process the TX queue. 
+ { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(false); + epoll_handler.muxer.set_tx_err(Some(VsockError::NoData)); + } + ctx.signal_txq_event(); + + // Both RX and TX queues should be untouched. + assert_eq!(ctx.guest_txvq.used.idx().load(), 0); + assert_eq!(ctx.guest_rxvq.used.idx().load(), 0); + } + + // Test case: + // - the driver supplied a malformed TX buffer. + { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + // Invalidate the packet header descriptor, by setting its length to + // 0. + ctx.guest_txvq.dtable(0).len().store(0); + ctx.signal_txq_event(); + + // The available descriptor should have been consumed, but no packet + // should have reached the backend. + assert_eq!(ctx.guest_txvq.used.idx().load(), 1); + if let Some(epoll_handler) = &mut ctx.epoll_handler { + assert_eq!(epoll_handler.muxer.tx_ok_cnt, 0); + } + } + } + + #[test] + fn test_rxq_event() { + // Test case: + // - there is pending RX data in the backend; and + // - the driver makes RX buffers available; and + // - the backend successfully places its RX data into the queue. + { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(true); + epoll_handler.muxer.set_rx_err(Some(VsockError::NoData)); + } + ctx.signal_rxq_event(); + + // The available RX buffer should've been left untouched. + assert_eq!(ctx.guest_rxvq.used.idx().load(), 0); + } + + // Test case: + // - there is pending RX data in the backend; and + // - the driver makes RX buffers available; and + // - the backend errors out, when attempting to receive data. 
+ { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(true); + } + ctx.signal_rxq_event(); + + // The available RX buffer should have been used. + assert_eq!(ctx.guest_rxvq.used.idx().load(), 1); + } + + // Test case: the driver provided a malformed RX descriptor chain. + { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + // Invalidate the packet header descriptor, by setting its length to 0. + ctx.guest_rxvq.dtable(0).len().store(0); + + // The chain should've been processed, without employing the backend. + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.process_rx(&test_ctx.mem); + assert_eq!(ctx.guest_rxvq.used.idx().load(), 1); + assert_eq!(epoll_handler.muxer.rx_ok_cnt, 0); + } + } + } + + #[test] + fn test_backend_event() { + // Test case: + // - a backend event is received; and + // - the backend has pending RX data. + { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(true); + epoll_handler + .notify_backend_event(&Events::new_raw(0, EventSet::IN), &test_ctx.mem); + + // The backend should've received this event + assert_eq!(epoll_handler.muxer.evset, Some(epoll::Events::EPOLLIN)); + } + + // TX queue processing should've been triggered. + assert_eq!(ctx.guest_txvq.used.idx().load(), 1); + // RX queue processing should've been triggered. + assert_eq!(ctx.guest_rxvq.used.idx().load(), 1); + } + + // Test case: + // - a backend event is received; and + // - the backend doesn't have any pending RX data. 
+ { + let test_ctx = TestContext::new(); + let mut ctx = test_ctx.create_event_handler_context(); + ctx.arti_activate(&test_ctx.mem); + + if let Some(epoll_handler) = &mut ctx.epoll_handler { + epoll_handler.muxer.set_pending_rx(false); + epoll_handler + .notify_backend_event(&Events::new_raw(0, EventSet::IN), &test_ctx.mem); + + // The backend should've received this event. + assert_eq!(epoll_handler.muxer.evset, Some(epoll::Events::EPOLLIN)); + } + // TX queue processing should've been triggered. + assert_eq!(ctx.guest_txvq.used.idx().load(), 1); + // The RX queue should've been left untouched. + assert_eq!(ctx.guest_rxvq.used.idx().load(), 0); + } + } + + // Creates an epoll handler context and attempts to assemble a VsockPkt from + // the descriptor chains available on the rx and tx virtqueues, but first it + // will set the addr and len of the descriptor specified by desc_idx to the + // provided values. We are only using this function for testing error cases, + // so the asserts always expect is_err() to be true. When desc_idx = 0 we + // are altering the header (first descriptor in the chain), and when + // desc_idx = 1 we are altering the packet buffer. + fn vsock_bof_helper(test_ctx: &mut TestContext, desc_idx: usize, addr: u64, len: u32) { + assert!(desc_idx <= 1); + + { + // should error here, but it works + // let mut ctx = test_ctx.create_event_handler_context(); + // ctx.guest_rxvq.dtable(desc_idx as u16).addr().store(addr); + // ctx.guest_rxvq.dtable(desc_idx as u16).len().store(len); + // // If the descriptor chain is already declared invalid, there's no + // // reason to assemble a packet. 
+ // if let Some(mut rx_desc) = ctx.queues[defs::RXQ_EVENT as usize] + // .iter(&mut test_ctx.mem) + // .next() + // { + // assert!(VsockPacket::from_rx_virtq_head(&mut rx_desc).is_err()); + // } + } + + { + let mut ctx = test_ctx.create_event_handler_context(); + + // When modifiyng the buffer descriptor, make sure the len field is altered in the + // vsock packet header descriptor as well. + if desc_idx == 1 { + // The vsock packet len field has offset 24 in the header. + let hdr_len_addr = GuestAddress(ctx.guest_txvq.dtable(0).addr().load() + 24); + test_ctx + .mem + .write_obj(len.to_le_bytes(), hdr_len_addr) + .unwrap(); + } + + ctx.guest_txvq.dtable(desc_idx as u16).addr().store(addr); + ctx.guest_txvq.dtable(desc_idx as u16).len().store(len); + + if let Some(mut tx_desc) = ctx.queues[defs::TXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + { + assert!(VsockPacket::from_tx_virtq_head(&mut tx_desc).is_err()); + } + } + } + + #[test] + fn test_vsock_bof() { + const GAP_SIZE: usize = 768 << 20; + const FIRST_AFTER_GAP: usize = 1 << 32; + const GAP_START_ADDR: usize = FIRST_AFTER_GAP - GAP_SIZE; + const MIB: usize = 1 << 20; + + let mut test_ctx = TestContext::new(); + test_ctx.mem = GuestMemoryMmap::from_ranges(&[ + (GuestAddress(0), 8 * MIB), + (GuestAddress((GAP_START_ADDR - MIB) as u64), MIB), + (GuestAddress(FIRST_AFTER_GAP as u64), MIB), + ]) + .unwrap(); + + // The default configured descriptor chains are valid. 
+ { + let mut ctx = test_ctx.create_event_handler_context(); + let mut rx_desc = ctx.queues[defs::RXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(); + assert!(VsockPacket::from_rx_virtq_head(&mut rx_desc).is_ok()); + } + + { + let mut ctx = test_ctx.create_event_handler_context(); + let mut tx_desc = ctx.queues[defs::TXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(); + assert!(VsockPacket::from_tx_virtq_head(&mut tx_desc).is_ok()); + } + + // Let's check what happens when the header descriptor is right before + // the gap. + vsock_bof_helper( + &mut test_ctx, + 0, + GAP_START_ADDR as u64 - 1, + VSOCK_PKT_HDR_SIZE as u32, + ); + + // Let's check what happens when the buffer descriptor crosses into the + // gap, but does not go past its right edge. + vsock_bof_helper( + &mut test_ctx, + 1, + GAP_START_ADDR as u64 - 4, + GAP_SIZE as u32 + 4, + ); + + // Let's modify the buffer descriptor addr and len such that it crosses + // over the MMIO gap, and check we cannot assemble the VsockPkts. + vsock_bof_helper( + &mut test_ctx, + 1, + GAP_START_ADDR as u64 - 4, + GAP_SIZE as u32 + 100, + ); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/mod.rs new file mode 100644 index 000000000..de99cc6be --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/mod.rs @@ -0,0 +1,494 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the THIRD-PARTY file. 
+ +pub mod backend; +pub mod csm; +mod device; +mod epoll_handler; +pub mod muxer; +mod packet; + +use std::os::unix::io::AsRawFd; + +use vm_memory::GuestMemoryError; + +pub use self::defs::{NUM_QUEUES, QUEUE_SIZES}; +pub use self::device::Vsock; +use self::muxer::Error as MuxerError; +pub use self::muxer::VsockMuxer; +use self::packet::VsockPacket; + +mod defs { + /// RX queue event: the driver added available buffers to the RX queue. + pub const RXQ_EVENT: u32 = 0; + /// TX queue event: the driver added available buffers to the RX queue. + pub const TXQ_EVENT: u32 = 1; + /// Event queue event: the driver added available buffers to the event + /// queue. + pub const EVQ_EVENT: u32 = 2; + /// Backend event: the backend needs a kick. + pub const BACKEND_EVENT: u32 = 3; + + /// Number of virtio queues. + pub const NUM_QUEUES: usize = 3; + /// Virtio queue sizes, in number of descriptor chain heads. + /// + /// There are 3 queues for a virtio device (in this order): RX, TX, Event + pub const QUEUE_SIZES: &[u16] = &[256; NUM_QUEUES]; + + /// Max vsock packet data/buffer size. + pub const MAX_PKT_BUF_SIZE: usize = 64 * 1024; + + pub mod uapi { + /// Virtio feature flags. + /// + /// Defined in `/include/uapi/linux/virtio_config.h`. + /// + /// The device processes available buffers in the same order in which + /// the device offers them. + pub const VIRTIO_F_IN_ORDER: usize = 35; + /// The device conforms to the virtio spec version 1.0. + pub const VIRTIO_F_VERSION_1: u32 = 32; + + /// Virtio vsock device ID. + /// + /// Defined in `include/uapi/linux/virtio_ids.h`. + pub const VIRTIO_ID_VSOCK: u32 = 19; + + /// Vsock packet operation IDs. + /// + /// Defined in `/include/uapi/linux/virtio_vsock.h`. + /// + /// Connection request. + pub const VSOCK_OP_REQUEST: u16 = 1; + /// Connection response. + pub const VSOCK_OP_RESPONSE: u16 = 2; + /// Connection reset. + pub const VSOCK_OP_RST: u16 = 3; + /// Connection clean shutdown. 
+ pub const VSOCK_OP_SHUTDOWN: u16 = 4; + /// Connection data (read/write). + pub const VSOCK_OP_RW: u16 = 5; + /// Flow control credit update. + pub const VSOCK_OP_CREDIT_UPDATE: u16 = 6; + /// Flow control credit update request. + pub const VSOCK_OP_CREDIT_REQUEST: u16 = 7; + + /// Vsock packet flags. Defined in `/include/uapi/linux/virtio_vsock.h`. + /// + /// Valid with a VSOCK_OP_SHUTDOWN packet: the packet sender will + /// receive no more data. + pub const VSOCK_FLAGS_SHUTDOWN_RCV: u32 = 1; + /// Valid with a VSOCK_OP_SHUTDOWN packet: the packet sender will send + /// no more data. + pub const VSOCK_FLAGS_SHUTDOWN_SEND: u32 = 2; + + /// Vsock packet type. + /// Defined in `/include/uapi/linux/virtio_vsock.h`. + /// + /// Stream / connection-oriented packet (the only currently valid type). + pub const VSOCK_TYPE_STREAM: u16 = 1; + + /// Well known vsock CID for host system. + pub const VSOCK_HOST_CID: u64 = 2; + } +} + +#[derive(Debug, thiserror::Error)] +pub enum VsockError { + /// vsock backend error + #[error("Vsock backend error: {0}")] + Backend(#[source] std::io::Error), + /// The vsock data/buffer virtio descriptor is expected, but missing. + #[error("The vsock data/buffer virtio descriptor is expected, but missing")] + BufDescMissing, + /// The vsock data/buffer virtio descriptor length is smaller than expected. + #[error("The vsock data/buffer virtio descriptor length is smaller than expected")] + BufDescTooSmall, + /// Chained GuestMemory error. + #[error("Chained GuestMemory error: {0}")] + GuestMemory(#[source] GuestMemoryError), + /// Bounds check failed on guest memory pointer. + #[error("Bounds check failed on guest memory pointer, addr: {0}, size: {1}")] + GuestMemoryBounds(u64, usize), + /// The vsock header descriptor length is too small. + #[error("The vsock header descriptor length {0} is too small")] + HdrDescTooSmall(u32), + /// The vsock header `len` field holds an invalid value. 
+ #[error("The vsock header `len` field holds an invalid value {0}")] + InvalidPktLen(u32), + /// vsock muxer error + #[error("Vsock muxer error: {0}")] + Muxer(#[source] MuxerError), + /// A data fetch was attempted when no data was available. + #[error("A data fetch was attempted when no data was available")] + NoData, + /// A data buffer was expected for the provided packet, but it is missing. + #[error("A data buffer was expected for the provided packet, but it is missing")] + PktBufMissing, + /// Encountered an unexpected write-only virtio descriptor. + #[error("Encountered an unexpected write-only virtio descriptor")] + UnreadableDescriptor, + /// Encountered an unexpected read-only virtio descriptor. + #[error("Encountered an unexpected read-only virtio descriptor")] + UnwritableDescriptor, +} + +type Result = std::result::Result; + +/// A passive, event-driven object, that needs to be notified whenever an +/// epoll-able event occurs. An event-polling control loop will use +/// `get_polled_fd()` and `get_polled_evset()` to query the listener for the +/// file descriptor and the set of events it's interested in. When such an event +/// occurs, the control loop will route the event to the listener via +/// `notify()`. +pub trait VsockEpollListener: AsRawFd { + /// Get the set of events for which the listener wants to be notified. + fn get_polled_evset(&self) -> epoll::Events; + + /// Notify the listener that one ore more events have occured. + fn notify(&mut self, evset: epoll::Events); +} + +/// Any channel that handles vsock packet traffic: sending and receiving +/// packets. Since we're implementing the device model here, our responsibility +/// is to always process the sending of packets (i.e. the TX queue). So, any +/// locally generated data, addressed to the driver (e.g. a connection response +/// or RST), will have to be queued, until we get to processing the RX queue. 
+/// +/// Note: `recv_pkt()` and `send_pkt()` are named analogous to `Read::read()` +/// and `Write::write()`, respectively. I.e. - `recv_pkt()` will read data +/// from the channel, and place it into a packet; and - `send_pkt()` will +/// fetch data from a packet, and place it into the channel. +pub trait VsockChannel { + /// Read/receive an incoming packet from the channel. + fn recv_pkt(&mut self, pkt: &mut VsockPacket) -> Result<()>; + + /// Write/send a packet through the channel. + fn send_pkt(&mut self, pkt: &VsockPacket) -> Result<()>; + + /// Checks weather there is pending incoming data inside the channel, + /// meaning that a subsequent call to `recv_pkt()` won't fail. + fn has_pending_rx(&self) -> bool; +} + +#[cfg(test)] +mod tests { + use std::ops::Deref; + use std::os::unix::io::{AsRawFd, RawFd}; + use std::sync::Arc; + + use dbs_device::resources::DeviceResources; + use dbs_interrupt::NoopNotifier; + use dbs_utils::epoll_manager::EpollManager; + use kvm_ioctls::Kvm; + use virtio_queue::QueueSync; + use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemoryMmap, GuestRegionMmap}; + use vmm_sys_util::eventfd::{EventFd, EFD_NONBLOCK}; + + use super::backend::VsockBackend; + use super::defs::{EVQ_EVENT, RXQ_EVENT, TXQ_EVENT}; + use super::epoll_handler::VsockEpollHandler; + use super::muxer::{Result as MuxerResult, VsockGenericMuxer}; + use super::packet::{VsockPacket, VSOCK_PKT_HDR_SIZE}; + use super::*; + use crate::device::VirtioDeviceConfig; + use crate::tests::{VirtQueue as GuestQ, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; + use crate::Result as VirtioResult; + use crate::VirtioQueueConfig; + + pub fn test_bytes(src: &[u8], dst: &[u8]) { + let min_len = std::cmp::min(src.len(), dst.len()); + assert_eq!(src[0..min_len], dst[0..min_len]) + } + + type Result = std::result::Result; + + pub struct TestMuxer { + pub evfd: EventFd, + pub rx_err: Option, + pub tx_err: Option, + pub pending_rx: bool, + pub rx_ok_cnt: usize, + pub tx_ok_cnt: usize, + pub 
evset: Option, + } + + impl TestMuxer { + pub fn new() -> Self { + Self { + evfd: EventFd::new(EFD_NONBLOCK).unwrap(), + rx_err: None, + tx_err: None, + pending_rx: false, + rx_ok_cnt: 0, + tx_ok_cnt: 0, + evset: None, + } + } + + pub fn set_rx_err(&mut self, err: Option) { + self.rx_err = err; + } + pub fn set_tx_err(&mut self, err: Option) { + self.tx_err = err; + } + pub fn set_pending_rx(&mut self, prx: bool) { + self.pending_rx = prx; + } + } + + impl Default for TestMuxer { + fn default() -> Self { + Self::new() + } + } + + impl VsockChannel for TestMuxer { + fn recv_pkt(&mut self, _pkt: &mut VsockPacket) -> Result<()> { + let cool_buf = [0xDu8, 0xE, 0xA, 0xD, 0xB, 0xE, 0xE, 0xF]; + match self.rx_err.take() { + None => { + if let Some(buf) = _pkt.buf_mut() { + for i in 0..buf.len() { + buf[i] = cool_buf[i % cool_buf.len()]; + } + } + self.rx_ok_cnt += 1; + Ok(()) + } + Some(e) => Err(e), + } + } + + fn send_pkt(&mut self, _pkt: &VsockPacket) -> Result<()> { + match self.tx_err.take() { + None => { + self.tx_ok_cnt += 1; + Ok(()) + } + Some(e) => Err(e), + } + } + + fn has_pending_rx(&self) -> bool { + self.pending_rx + } + } + + impl AsRawFd for TestMuxer { + fn as_raw_fd(&self) -> RawFd { + self.evfd.as_raw_fd() + } + } + + impl VsockEpollListener for TestMuxer { + fn get_polled_evset(&self) -> epoll::Events { + epoll::Events::EPOLLIN + } + fn notify(&mut self, evset: epoll::Events) { + self.evset = Some(evset); + } + } + + impl VsockGenericMuxer for TestMuxer { + fn add_backend( + &mut self, + _backend: Box, + _is_peer_backend: bool, + ) -> MuxerResult<()> { + Ok(()) + } + } + + pub struct TestContext { + pub cid: u64, + pub mem: GuestMemoryMmap, + pub mem_size: usize, + pub epoll_manager: EpollManager, + pub device: Vsock, TestMuxer>, + } + + impl TestContext { + pub fn new() -> Self { + const CID: u64 = 52; + const MEM_SIZE: usize = 1024 * 1024 * 128; + let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), MEM_SIZE)]).unwrap(); + let epoll_manager = 
EpollManager::default(); + Self { + cid: CID, + mem, + mem_size: MEM_SIZE, + epoll_manager: epoll_manager.clone(), + device: Vsock::new_with_muxer( + CID, + Arc::new(defs::QUEUE_SIZES.to_vec()), + epoll_manager, + TestMuxer::new(), + ) + .unwrap(), + } + } + + pub fn create_event_handler_context(&self) -> EventHandlerContext { + const QSIZE: u16 = 256; + + let guest_rxvq = GuestQ::new(GuestAddress(0x0010_0000), &self.mem, QSIZE); + let guest_txvq = GuestQ::new(GuestAddress(0x0020_0000), &self.mem, QSIZE); + let guest_evvq = GuestQ::new(GuestAddress(0x0030_0000), &self.mem, QSIZE); + let rxvq = guest_rxvq.create_queue(); + let txvq = guest_txvq.create_queue(); + let evvq = guest_evvq.create_queue(); + + let rxvq_config = VirtioQueueConfig::new( + rxvq, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + RXQ_EVENT as u16, + ); + let txvq_config = VirtioQueueConfig::new( + txvq, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + TXQ_EVENT as u16, + ); + let evvq_config = VirtioQueueConfig::new( + evvq, + Arc::new(EventFd::new(0).unwrap()), + Arc::new(NoopNotifier::new()), + EVQ_EVENT as u16, + ); + + // Set up one available descriptor in the RX queue. + guest_rxvq.dtable(0).set( + 0x0040_0000, + VSOCK_PKT_HDR_SIZE as u32, + VIRTQ_DESC_F_WRITE | VIRTQ_DESC_F_NEXT, + 1, + ); + guest_rxvq + .dtable(1) + .set(0x0040_1000, 4096, VIRTQ_DESC_F_WRITE, 0); + + guest_rxvq.avail.ring(0).store(0); + guest_rxvq.avail.idx().store(1); + + // Set up one available descriptor in the TX queue. 
+ guest_txvq + .dtable(0) + .set(0x0050_0000, VSOCK_PKT_HDR_SIZE as u32, VIRTQ_DESC_F_NEXT, 1); + guest_txvq.dtable(1).set(0x0050_1000, 4096, 0, 0); + guest_txvq.avail.ring(0).store(0); + guest_txvq.avail.idx().store(1); + + let queues = vec![rxvq_config, txvq_config, evvq_config]; + EventHandlerContext { + guest_rxvq, + guest_txvq, + guest_evvq, + queues, + epoll_handler: None, + device: Vsock::new_with_muxer( + self.cid, + Arc::new(defs::QUEUE_SIZES.to_vec()), + EpollManager::default(), + TestMuxer::new(), + ) + .unwrap(), + mem: Arc::new(self.mem.clone()), + } + } + } + + impl Default for TestContext { + fn default() -> Self { + Self::new() + } + } + + pub struct EventHandlerContext<'a> { + pub device: Vsock, TestMuxer>, + pub epoll_handler: + Option, QueueSync, GuestRegionMmap, TestMuxer>>, + pub queues: Vec>, + pub guest_rxvq: GuestQ<'a>, + pub guest_txvq: GuestQ<'a>, + pub guest_evvq: GuestQ<'a>, + pub mem: Arc, + } + + impl<'a> EventHandlerContext<'a> { + // Artificially activate the device. 
+ pub fn arti_activate(&mut self, mem: &GuestMemoryMmap) { + let kvm = Kvm::new().unwrap(); + let vm_fd = Arc::new(kvm.create_vm().unwrap()); + let resources = DeviceResources::new(); + let config = VirtioDeviceConfig::>, QueueSync>::new( + Arc::new(mem.clone()), + vm_fd, + resources, + self.queues.drain(..).collect(), + None, + Arc::new(NoopNotifier::new()), + ); + + let epoll_handler = self.device.mock_activate(config).unwrap(); + self.epoll_handler = Some(epoll_handler); + } + + pub fn handle_txq_event(&mut self, mem: &GuestMemoryMmap) { + if let Some(epoll_handler) = &mut self.epoll_handler { + epoll_handler.config.queues[TXQ_EVENT as usize] + .generate_event() + .unwrap(); + epoll_handler.handle_txq_event(mem); + } + } + + pub fn handle_rxq_event(&mut self, mem: &GuestMemoryMmap) { + if let Some(epoll_handler) = &mut self.epoll_handler { + epoll_handler.config.queues[TXQ_EVENT as usize] + .generate_event() + .unwrap(); + epoll_handler.handle_rxq_event(mem); + } + } + + pub fn signal_txq_event(&mut self) { + if let Some(epoll_handler) = &mut self.epoll_handler { + epoll_handler.config.queues[TXQ_EVENT as usize] + .generate_event() + .unwrap(); + } + let mem_guard = self.mem.memory(); + let mem = mem_guard.deref(); + self.handle_txq_event(mem); + } + + pub fn signal_rxq_event(&mut self) { + if let Some(epoll_handler) = &mut self.epoll_handler { + epoll_handler.config.queues[RXQ_EVENT as usize] + .generate_event() + .unwrap(); + } + let mem_guard = self.mem.memory(); + let mem = mem_guard.deref(); + self.handle_rxq_event(mem); + } + + pub fn signal_used_queue(&mut self, idx: usize) -> VirtioResult<()> { + if let Some(epoll_handler) = &mut self.epoll_handler { + epoll_handler.config.queues[RXQ_EVENT as usize] + .generate_event() + .unwrap(); + epoll_handler.signal_used_queue(idx).unwrap(); + } + + Ok(()) + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/mod.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/mod.rs new file mode 
100644 index 000000000..2e71adefa --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/mod.rs @@ -0,0 +1,82 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// This module implements a muxer for vsock - a mediator between guest-side +/// AF_VSOCK sockets and host-side backends. The heavy lifting is performed by +/// `muxer::VsockMuxer`, a connection multiplexer that uses +/// `super::csm::VsockConnection` for handling vsock connection states. Check +/// out `muxer.rs` for a more detailed explanation of the inner workings of this +/// backend. +pub mod muxer_impl; +pub mod muxer_killq; +pub mod muxer_rxq; + +use super::backend::{VsockBackend, VsockBackendType}; +use super::{VsockChannel, VsockEpollListener}; +pub use muxer_impl::VsockMuxer; + +mod defs { + /// Maximum number of established connections that we can handle. + pub const MAX_CONNECTIONS: usize = 1023; + + /// Size of the muxer RX packet queue. + pub const MUXER_RXQ_SIZE: usize = 256; + + /// Size of the muxer connection kill queue. + pub const MUXER_KILLQ_SIZE: usize = 128; +} + +pub type Result = std::result::Result; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Error registering a new epoll-listening FD. + #[error("error when registering a new epoll-listening FD: {0}")] + EpollAdd(#[source] std::io::Error), + + /// Error creating an epoll FD. + #[error("error when creating an epoll: {0}")] + EpollFdCreate(#[source] std::io::Error), + + /// The host made an invalid vsock port connection request. + #[error("invalid vsock prot connection request")] + InvalidPortRequest, + + /// Cannot add muxer backend when vsock device is activated. + #[error("cannot add muxer backend when vsock device is activated")] + BackendAddAfterActivated, + + /// Error accepting a new connection from backend. 
+ #[error("error accepting a new connection from backend: {0}")] + BackendAccept(#[source] std::io::Error), + + /// Error binding to the backend. + #[error("error binding to the backend: {0}")] + BackendBind(#[source] std::io::Error), + + /// Error connecting to a backend. + #[error("error connecting to a backend: {0}")] + BackendConnect(#[source] std::io::Error), + + /// Error reading from backend. + #[error("error reading from backend: {0}")] + BackendRead(#[source] std::io::Error), + + /// Muxer connection limit reached. + #[error("muxer reaches connection limit")] + TooManyConnections, + + /// Backend type has been registered. + #[error("backend type has been registered: {0:?}")] + BackendRegistered(VsockBackendType), +} + +/// The vsock generic muxer, which is basically an epoll-event-driven vsock +/// channel. Currently, the only implementation we have is +/// `vsock::muxer::muxer::VsockMuxer`, which translates guest-side vsock +/// connections to host-side connections with different backends. +pub trait VsockGenericMuxer: VsockChannel + VsockEpollListener + Send { + fn add_backend(&mut self, backend: Box, is_peer_backend: bool) -> Result<()>; +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_impl.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_impl.rs new file mode 100644 index 000000000..1b5b7c4e9 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_impl.rs @@ -0,0 +1,1601 @@ +// Copyright 2022 Alibaba Cloud. All Rights Reserved. +// +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// `VsockMuxer` is the device-facing component of multiple vsock backends. You +/// can add various of backends to VsockMuxer which implements the +/// `VsockBackend` trait. VsockMuxer can abstracts away the gory details of +/// translating between AF_VSOCK and the protocol of backends which you added. 
+/// It can also presents a clean interface to the rest of the vsock device +/// model. +/// +/// The vsock muxer has two main roles: +/// 1. Vsock connection multiplexer: It's the muxer's job to create, manage, and +/// terminate `VsockConnection` objects. The muxer also routes packets to +/// their owning connections. It does so via a connection `HashMap`, keyed by +/// what is basically a (host_port, guest_port) tuple. Vsock packet traffic +/// needs to be inspected, in order to detect connection request packets +/// (leading to the creation of a new connection), and connection reset +/// packets (leading to the termination of an existing connection). All other +/// packets, though, must belong to an existing connection and, as such, the +/// muxer simply forwards them. +/// 2. Event dispatcher There are three event categories that the vsock backend +/// is interested it: +/// 1. A new host-initiated connection is ready to be accepted from the +/// backends added to muxer; +/// 2. Data is available for reading from a newly-accepted host-initiated +/// connection (i.e. the host is ready to issue a vsock connection +/// request, informing us of the destination port to which it wants to +/// connect); +/// 3. Some event was triggered for a connected backend connection, that +/// belongs to a `VsockConnection`. The muxer gets notified about all of +/// these events, because, as a `VsockEpollListener` implementor, it gets +/// to register a nested epoll FD into the main VMM epolling loop. All +/// other pollable FDs are then registered under this nested epoll FD. To +/// route all these events to their handlers, the muxer uses another +/// `HashMap` object, mapping `RawFd`s to `EpollListener`s. 
+use std::collections::{HashMap, HashSet}; +use std::io::Read; +use std::os::fd::FromRawFd; +use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::UnixStream; + +use log::{debug, error, info, trace, warn}; + +use super::super::backend::{HybridUnixStreamBackend, VsockBackend, VsockBackendType, VsockStream}; + +use super::super::csm::{ConnState, VsockConnection}; +use super::super::defs::uapi; +use super::super::packet::VsockPacket; +use super::super::{Result as VsockResult, VsockChannel, VsockEpollListener, VsockError}; +use super::muxer_killq::MuxerKillQ; +use super::muxer_rxq::MuxerRxQ; +use super::{defs, Error, Result, VsockGenericMuxer}; + +/// A unique identifier of a `VsockConnection` object. Connections are stored in +/// a hash map, keyed by a `ConnMapKey` object. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct ConnMapKey { + local_port: u32, + pub(crate) peer_port: u32, +} + +/// A muxer RX queue item. +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum MuxerRx { + /// The packet must be fetched from the connection identified by + /// `ConnMapKey`. + ConnRx(ConnMapKey), + /// The muxer must produce an RST packet. + RstPkt { local_port: u32, peer_port: u32 }, +} + +enum ReadPortResult { + PassFd, + Connect(u32), +} + +/// An epoll listener, registered under the muxer's nested epoll FD. +pub enum EpollListener { + /// The listener is a `VsockConnection`, identified by `key`, and interested + /// in the events in `evset`. Since `VsockConnection` implements + /// `VsockEpollListener`, notifications will be forwarded to the listener + /// via `VsockEpollListener::notify()`. + Connection { + key: ConnMapKey, + evset: epoll::Events, + backend: VsockBackendType, + }, + /// A listener interested in new host-initiated connections. + Backend(VsockBackendType), + /// A listener interested in reading host "connect " commands from a + /// freshly connected host socket. 
+ LocalStream(Box), + /// A listener interested in recvmsg from host to get the and a + /// socket/pipe fd. + PassFdStream(Box), +} + +/// The vsock connection multiplexer. +pub struct VsockMuxer { + /// Guest CID. + cid: u64, + /// A hash map used to store the active connections. + conn_map: HashMap, + /// A hash map used to store epoll event listeners / handlers. + listener_map: HashMap, + /// The RX queue. Items in this queue are consumed by + /// `VsockMuxer::recv_pkt()`, and produced + /// - by `VsockMuxer::send_pkt()` (e.g. RST in response to a connection + /// request packet); and + /// - in response to EPOLLIN events (e.g. data available to be read from an + /// AF_UNIX socket). + rxq: MuxerRxQ, + /// A queue used for terminating connections that are taking too long to + /// shut down. + killq: MuxerKillQ, + /// The nested epoll FD, used to register epoll listeners. + epoll_fd: RawFd, + /// A hash set used to keep track of used host-side (local) ports, in order + /// to assign local ports to host-initiated connections. + local_port_set: HashSet, + /// The last used host-side port. + local_port_last: u32, + /// backend implementations supported in muxer. + backend_map: HashMap>, + /// the backend which can accept peer-initiated connection. + peer_backend: Option, +} + +impl VsockChannel for VsockMuxer { + /// Deliver a vsock packet to the guest vsock driver. + /// + /// Retuns: + /// - `Ok(())`: `pkt` has been successfully filled in; or + /// - `Err(VsockError::NoData)`: there was no available data with which to fill in the packet. + fn recv_pkt(&mut self, pkt: &mut VsockPacket) -> VsockResult<()> { + // We'll look for instructions on how to build the RX packet in the RX + // queue. If the queue is empty, that doesn't necessarily mean we don't + // have any pending RX, since the queue might be out-of-sync. If that's + // the case, we'll attempt to sync it first, and then try to pop + // something out again. 
+ if self.rxq.is_empty() && !self.rxq.is_synced() { + self.rxq = MuxerRxQ::from_conn_map(&self.conn_map); + } + + while let Some(rx) = self.rxq.peek() { + let res = match rx { + // We need to build an RST packet, going from `local_port` to + // `peer_port`. + MuxerRx::RstPkt { + local_port, + peer_port, + } => { + pkt.set_op(uapi::VSOCK_OP_RST) + .set_src_cid(uapi::VSOCK_HOST_CID) + .set_dst_cid(self.cid) + .set_src_port(local_port) + .set_dst_port(peer_port) + .set_len(0) + .set_type(uapi::VSOCK_TYPE_STREAM) + .set_flags(0) + .set_buf_alloc(0) + .set_fwd_cnt(0); + self.rxq.pop().unwrap(); + trace!( + "vsock: muxer.recv[rxq.len={}, type={}, op={}, sp={}, sc={}, dp={}, dc={}]: {:?}", + self.rxq.len(), + pkt.type_(), + pkt.op(), + pkt.src_port(), + pkt.src_cid(), + pkt.dst_port(), + pkt.dst_cid(), + pkt.hdr() + ); + return Ok(()); + } + + // We'll defer building the packet to this connection, that has + // something to say. + MuxerRx::ConnRx(key) => { + let mut conn_res = Err(VsockError::NoData); + let mut do_pop = true; + self.apply_conn_mutation(key, |conn| { + conn_res = conn.recv_pkt(pkt); + do_pop = !conn.has_pending_rx(); + }); + if do_pop { + self.rxq.pop().unwrap(); + } + conn_res + } + }; + + if res.is_ok() { + // Inspect traffic, looking for RST packets, since that means we + // have to terminate and remove this connection from the active + // connection pool. + if pkt.op() == uapi::VSOCK_OP_RST { + self.remove_connection(ConnMapKey { + local_port: pkt.src_port(), + peer_port: pkt.dst_port(), + }); + } + + trace!( + "vsock: muxer.recv[rxq.len={}, type={}, op={}, sp={}, sc={}, dp={}, dc={}]: {:?}", + self.rxq.len(), + pkt.type_(), + pkt.op(), + pkt.src_port(), + pkt.src_cid(), + pkt.dst_port(), + pkt.dst_cid(), + pkt.hdr() + ); + return Ok(()); + } + } + + Err(VsockError::NoData) + } + + /// Deliver a guest-generated packet to its destination in the vsock + /// backend. 
+ /// + /// This absorbs unexpected packets, handles RSTs (by dropping connections), + /// and forwards all the rest to their owning `VsockConnection`. + /// + /// Returns: always `Ok(())` - the packet has been consumed, and its virtio + /// TX buffers can be returned to the guest vsock driver. + fn send_pkt(&mut self, pkt: &VsockPacket) -> VsockResult<()> { + let conn_key = ConnMapKey { + local_port: pkt.dst_port(), + peer_port: pkt.src_port(), + }; + + trace!( + "vsock: muxer.send[rxq.len={}, type={}, op={}, sp={}, sc={}, dp={}, dc={}]: {:?}", + self.rxq.len(), + pkt.type_(), + pkt.op(), + pkt.src_port(), + pkt.src_cid(), + pkt.dst_port(), + pkt.dst_cid(), + pkt.hdr() + ); + + // If this packet has an unsupported type (!=stream), we must send back + // an RST. + if pkt.type_() != uapi::VSOCK_TYPE_STREAM { + self.enq_rst(pkt.dst_port(), pkt.src_port()); + return Ok(()); + } + + // We don't know how to handle packets addressed to other CIDs. We only + // handle the host part of the guest - host communication here. + if pkt.dst_cid() != uapi::VSOCK_HOST_CID { + info!( + "vsock: dropping guest packet for unknown CID: {:?}", + pkt.hdr() + ); + return Ok(()); + } + + if !self.conn_map.contains_key(&conn_key) { + // This packet can't be routed to any active connection (based on + // its src and dst ports). The only orphan / unroutable packets we + // know how to handle are connection requests. + if pkt.op() == uapi::VSOCK_OP_REQUEST { + // Oh, this is a connection request! + self.handle_peer_request_pkt(pkt); + } else { + // Send back an RST, to let the drive know we weren't expecting + // this packet. + self.enq_rst(pkt.dst_port(), pkt.src_port()); + } + return Ok(()); + } + + // Right, we know where to send this packet, then (to `conn_key`). + // However, if this is an RST, we have to forcefully terminate the + // connection, so there's no point in forwarding it the packet. 
+ if pkt.op() == uapi::VSOCK_OP_RST { + self.remove_connection(conn_key); + return Ok(()); + } + + // Alright, everything looks in order - forward this packet to its + // owning connection. + let mut res: VsockResult<()> = Ok(()); + self.apply_conn_mutation(conn_key, |conn| { + res = conn.send_pkt(pkt); + }); + + res + } + + /// Check if the muxer has any pending RX data, with which to fill a + /// guest-provided RX buffer. + fn has_pending_rx(&self) -> bool { + !self.rxq.is_empty() || !self.rxq.is_synced() + } +} + +impl AsRawFd for VsockMuxer { + /// Get the FD to be registered for polling upstream (in the main VMM epoll + /// loop, in this case). + /// + /// This will be the muxer's nested epoll FD. + fn as_raw_fd(&self) -> RawFd { + self.epoll_fd + } +} + +impl VsockEpollListener for VsockMuxer { + /// Get the epoll events to be polled upstream. + /// + /// Since the polled FD is a nested epoll FD, we're only interested in + /// EPOLLIN events (i.e. some event occurred on one of the FDs registered + /// under our epoll FD). + fn get_polled_evset(&self) -> epoll::Events { + epoll::Events::EPOLLIN + } + + /// Notify the muxer about a pending event having occurred under its nested + /// epoll FD. + fn notify(&mut self, _: epoll::Events) { + trace!("vsock: muxer received kick"); + + let mut epoll_events = vec![epoll::Event::new(epoll::Events::empty(), 0); 32]; + match epoll::wait(self.epoll_fd, 0, epoll_events.as_mut_slice()) { + Ok(ev_cnt) => { + for ev in &epoll_events[0..ev_cnt] { + self.handle_event( + ev.data as RawFd, + epoll::Events::from_bits(ev.events).unwrap(), + ); + } + } + Err(e) => { + warn!("vsock: failed to consume muxer epoll event: {}", e); + } + } + } +} + +impl VsockGenericMuxer for VsockMuxer { + /// add a backend for Muxer. 
+ fn add_backend(&mut self, backend: Box, is_peer_backend: bool) -> Result<()> { + let backend_type = backend.r#type(); + if self.backend_map.contains_key(&backend_type) { + return Err(Error::BackendRegistered(backend_type)); + } + self.add_listener( + backend.as_raw_fd(), + EpollListener::Backend(backend_type.clone()), + )?; + self.backend_map.insert(backend_type.clone(), backend); + if is_peer_backend { + self.peer_backend = Some(backend_type); + } + Ok(()) + } +} + +impl VsockMuxer { + /// Muxer constructor. + pub fn new(cid: u64) -> Result { + Ok(Self { + cid, + epoll_fd: epoll::create(false).map_err(Error::EpollFdCreate)?, + rxq: MuxerRxQ::default(), + conn_map: HashMap::with_capacity(defs::MAX_CONNECTIONS), + listener_map: HashMap::with_capacity(defs::MAX_CONNECTIONS + 1), + killq: MuxerKillQ::default(), + local_port_last: (1u32 << 30) - 1, + local_port_set: HashSet::with_capacity(defs::MAX_CONNECTIONS), + backend_map: HashMap::new(), + peer_backend: None, + }) + } + + /// Handle/dispatch an epoll event to its listener. + fn handle_event(&mut self, fd: RawFd, event_set: epoll::Events) { + trace!( + "vsock: muxer processing event: fd={}, evset={:?}", + fd, + event_set + ); + + match self.listener_map.get_mut(&fd) { + // This event needs to be forwarded to a `VsockConnection` that is + // listening for it. + Some(EpollListener::Connection { key, evset: _, .. }) => { + let key_copy = *key; + // The handling of this event will most probably mutate the + // state of the receiving connection. We'll need to check for new + // pending RX, event set mutation, and all that, so we're + // wrapping the event delivery inside those checks. + self.apply_conn_mutation(key_copy, |conn| { + conn.notify(event_set); + }); + } + + // A new host-initiated connection is ready to be accepted. 
+ Some(EpollListener::Backend(backend_type)) => { + if let Some(backend) = self.backend_map.get_mut(backend_type) { + if self.rxq.len() == defs::MAX_CONNECTIONS { + // If we're already maxed-out on connections, we'll just + // accept and immediately discard this potentially new + // one. + warn!("vsock: connection limit reached; refusing new host connection"); + backend.accept().map(|_| 0).unwrap_or(0); + return; + } + backend + .accept() + .map_err(Error::BackendAccept) + .and_then(|stream| { + // Before forwarding this connection to a listening + // AF_VSOCK socket on the guest side, we need to + // know the destination port. We'll read that port + // from a "connect" command received on this socket, + // so the next step is to ask to be notified the + // moment we can read from it. + + self.add_listener( + stream.as_raw_fd(), + EpollListener::LocalStream(stream), + ) + }) + .unwrap_or_else(|err| { + warn!("vsock: unable to accept local connection: {:?}", err); + }); + } else { + error!("vsock: unsable to find specific backend {:?}", backend_type) + } + } + + // Data is ready to be read from a host-initiated connection. That + // would be the "connect" command that we're expecting. 
+ Some(EpollListener::LocalStream(_)) => { + if let Some(EpollListener::LocalStream(mut stream)) = self.remove_listener(fd) { + Self::read_local_stream_port(&mut stream) + .and_then(|read_port_result| match read_port_result { + ReadPortResult::Connect(peer_port) => { + let local_port = self.allocate_local_port(); + self.add_connection( + ConnMapKey { + local_port, + peer_port, + }, + VsockConnection::new_local_init( + stream, + uapi::VSOCK_HOST_CID, + self.cid, + local_port, + peer_port, + ), + ) + } + ReadPortResult::PassFd => self.add_listener( + stream.as_raw_fd(), + EpollListener::PassFdStream(stream), + ), + }) + .unwrap_or_else(|err| { + info!("vsock: error adding local-init connection: {:?}", err); + }) + } + } + + Some(EpollListener::PassFdStream(_)) => { + if let Some(EpollListener::PassFdStream(mut stream)) = self.remove_listener(fd) { + Self::passfd_read_port_and_fd(&mut stream) + .map(|(nfd, peer_port)| (nfd, self.allocate_local_port(), peer_port)) + .and_then(|(nfd, local_port, peer_port)| { + // Here we should make sure the nfd the sole owner to convert it + // into an UnixStream object, otherwise, it could cause memory unsafety. + let nstream = unsafe { UnixStream::from_raw_fd(nfd) }; + + let hybridstream = HybridUnixStreamBackend { + unix_stream: Box::new(nstream), + slave_stream: Some(stream), + }; + + self.add_connection( + ConnMapKey { + local_port, + peer_port, + }, + VsockConnection::new_local_init( + Box::new(hybridstream), + uapi::VSOCK_HOST_CID, + self.cid, + local_port, + peer_port, + ), + ) + }) + .unwrap_or_else(|err| { + info!( + "vsock: error adding local-init passthrough fd connection: {:?}", + err + ); + }) + } + } + + _ => { + info!( + "vsock: unexpected event: fd={:?}, evset={:?}", + fd, event_set + ); + } + } + } + + /// Parse a host "connect" command, and extract the destination vsock port. 
+ fn read_local_stream_port(stream: &mut Box) -> Result { + let mut buf = [0u8; 32]; + + // This is the minimum number of bytes that we should be able to read, + // when parsing a valid connection request. I.e. `b"passfd\n"`, otherwise, + // it would be `b"connect 0\n".len()`. + const MIN_READ_LEN: usize = 7; + + // Bring in the minimum number of bytes that we should be able to read. + stream + .read(&mut buf[..MIN_READ_LEN]) + .map_err(Error::BackendRead)?; + + // Now, finish reading the destination port number if it's connect command, + // by bringing in one byte at a time, until we reach an EOL terminator (or our buffer + // space runs out). Yeah, not particularly proud of this approach, but it will have to + // do for now. + let mut blen = MIN_READ_LEN; + while buf[blen - 1] != b'\n' && blen < buf.len() { + stream + .read_exact(&mut buf[blen..=blen]) + .map_err(Error::BackendRead)?; + blen += 1; + } + + let mut word_iter = std::str::from_utf8(&buf) + .map_err(|_| Error::InvalidPortRequest)? 
+ .split_whitespace(); + + word_iter + .next() + .ok_or(Error::InvalidPortRequest) + .and_then(|word| { + let key = word.to_lowercase(); + if key == "connect" { + Ok(true) + } else if key == "passfd" { + Ok(false) + } else { + Err(Error::InvalidPortRequest) + } + }) + .and_then(|connect| { + if connect { + word_iter.next().ok_or(Error::InvalidPortRequest).map(Some) + } else { + Ok(None) + } + }) + .and_then(|word| { + word.map_or_else( + || Ok(ReadPortResult::PassFd), + |word| { + word.parse::() + .map_or(Err(Error::InvalidPortRequest), |word| { + Ok(ReadPortResult::Connect(word)) + }) + }, + ) + }) + .map_err(|_| Error::InvalidPortRequest) + } + + fn passfd_read_port_and_fd(stream: &mut Box) -> Result<(RawFd, u32)> { + let mut buf = [0u8; 32]; + let mut fds = [0, 1]; + let (data_len, fd_len) = stream + .recv_data_fd(&mut buf, &mut fds) + .map_err(Error::BackendRead)?; + + if fd_len != 1 || fds[0] <= 0 { + return Err(Error::InvalidPortRequest); + } + + let mut port_iter = std::str::from_utf8(&buf[..data_len]) + .map_err(|_| Error::InvalidPortRequest)? + .split_whitespace(); + + let port = port_iter + .next() + .ok_or(Error::InvalidPortRequest) + .and_then(|word| word.parse::().map_err(|_| Error::InvalidPortRequest))?; + + Ok((fds[0], port)) + } + + /// Add a new connection to the active connection pool. + fn add_connection(&mut self, key: ConnMapKey, conn: VsockConnection) -> Result<()> { + // We might need to make room for this new connection, so let's sweep + // the kill queue first. It's fine to do this here because: + // - unless the kill queue is out of sync, this is a pretty inexpensive + // operation; and + // - we are under no pressure to respect any accurate timing for + // connection termination. 
+ self.sweep_killq(); + + if self.conn_map.len() >= defs::MAX_CONNECTIONS { + info!( + "vsock: muxer connection limit reached ({})", + defs::MAX_CONNECTIONS + ); + return Err(Error::TooManyConnections); + } + + self.add_listener( + conn.as_raw_fd(), + EpollListener::Connection { + key, + evset: conn.get_polled_evset(), + backend: conn.stream.backend_type(), + }, + ) + .map(|_| { + if conn.has_pending_rx() { + // We can safely ignore any error in adding a connection RX + // indication. Worst case scenario, the RX queue will get + // desynchronized, but we'll handle that the next time we need + // to yield an RX packet. + self.rxq.push(MuxerRx::ConnRx(key)); + } + self.conn_map.insert(key, conn); + }) + } + + /// Remove a connection from the active connection poll. + fn remove_connection(&mut self, key: ConnMapKey) { + if let Some(conn) = self.conn_map.remove(&key) { + self.remove_listener(conn.as_raw_fd()); + } + self.free_local_port(key.local_port); + } + + /// Schedule a connection for immediate termination. I.e. as soon as we can + /// also let our peer know we're dropping the connection, by sending it an + /// RST packet. + fn kill_connection(&mut self, key: ConnMapKey) { + let mut had_rx = false; + + self.conn_map.entry(key).and_modify(|conn| { + had_rx = conn.has_pending_rx(); + conn.kill(); + }); + // This connection will now have an RST packet to yield, so we need to + // add it to the RX queue. However, there's no point in doing that if it + // was already in the queue. + if !had_rx { + // We can safely ignore any error in adding a connection RX + // indication. Worst case scenario, the RX queue will get + // desynchronized, but we'll handle that the next time we need to + // yield an RX packet. + self.rxq.push(MuxerRx::ConnRx(key)); + } + } + + /// Register a new epoll listener under the muxer's nested epoll FD. 
+ pub(crate) fn add_listener(&mut self, fd: RawFd, listener: EpollListener) -> Result<()> { + let evset = match listener { + EpollListener::Connection { evset, .. } => evset, + EpollListener::LocalStream(_) => epoll::Events::EPOLLIN, + EpollListener::Backend(_) => epoll::Events::EPOLLIN, + EpollListener::PassFdStream(_) => epoll::Events::EPOLLIN, + }; + + epoll::ctl( + self.epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + fd, + epoll::Event::new(evset, fd as u64), + ) + .map(|_| { + self.listener_map.insert(fd, listener); + }) + .map_err(Error::EpollAdd)?; + + Ok(()) + } + + /// Remove (and return) a previously registered epoll listener. + fn remove_listener(&mut self, fd: RawFd) -> Option { + let maybe_listener = self.listener_map.remove(&fd); + + if maybe_listener.is_some() { + epoll::ctl( + self.epoll_fd, + epoll::ControlOptions::EPOLL_CTL_DEL, + fd, + epoll::Event::new(epoll::Events::empty(), 0), + ) + .unwrap_or_else(|err| { + warn!( + "vosck muxer: error removing epoll listener for fd {:?}: {:?}", + fd, err + ); + }); + } + + maybe_listener + } + + /// Allocate a host-side port to be assigned to a new host-initiated + /// connection. + fn allocate_local_port(&mut self) -> u32 { + // TODO: this doesn't seem very space-efficient. + // Mybe rewrite this to limit port range and use a bitmap? + + loop { + self.local_port_last = (self.local_port_last + 1) & !(1 << 31) | (1 << 30); + if self.local_port_set.insert(self.local_port_last) { + break; + } + } + self.local_port_last + } + + /// Mark a previously used host-side port as free. + fn free_local_port(&mut self, port: u32) { + self.local_port_set.remove(&port); + } + + /// Handle a new connection request comming from our peer (the guest vsock + /// driver). + /// + /// This will attempt to connect to a host-side backend. If successful, a + /// new connection object will be created and added to the connection pool. + /// On failure, a new RST packet will be scheduled for delivery to the + /// guest. 
+ fn handle_peer_request_pkt(&mut self, pkt: &VsockPacket) { + if self.peer_backend.is_none() { + error!("no usable backend for peer request"); + self.enq_rst(pkt.dst_port(), pkt.src_port()); + return; + } + + // safe to unwrap + if let Some(backend) = self.backend_map.get(self.peer_backend.as_ref().unwrap()) { + backend + .connect(pkt.dst_port()) + .map_err(Error::BackendConnect) + .and_then(|stream| { + self.add_connection( + ConnMapKey { + local_port: pkt.dst_port(), + peer_port: pkt.src_port(), + }, + VsockConnection::new_peer_init( + stream, + uapi::VSOCK_HOST_CID, + self.cid, + pkt.dst_port(), + pkt.src_port(), + pkt.buf_alloc(), + ), + ) + }) + .unwrap_or_else(|e| { + error!("peer request error: {:?}", e); + self.enq_rst(pkt.dst_port(), pkt.src_port()); + }); + } else { + error!("no usable backend selected for peer request"); + self.enq_rst(pkt.dst_port(), pkt.src_port()); + } + } + + /// Perform an action that might mutate a connection's state. + /// + /// This is used as shorthand for repetitive tasks that need to be performed + /// after a connection object mutates. E.g. + /// - update the connection's epoll listener; + /// - schedule the connection to be queried for RX data; + /// - kill the connection if an unrecoverable error occurs. + fn apply_conn_mutation(&mut self, key: ConnMapKey, mut_fn: F) + where + F: FnOnce(&mut VsockConnection), + { + if let Some(conn) = self.conn_map.get_mut(&key) { + let had_rx = conn.has_pending_rx(); + let was_expiring = conn.will_expire(); + let prev_state = conn.state(); + let backend_type = conn.stream.backend_type(); + + mut_fn(conn); + + // If this is a host-initiated connection that has just become + // established, we'll have to send an ack message to the host end. 
+ if prev_state == ConnState::LocalInit && conn.state() == ConnState::Established { + let msg = format!("OK {}\n", key.local_port); + match conn.send_bytes_raw(msg.as_bytes()) { + Ok(written) if written == msg.len() => (), + Ok(_) => { + // If we can't write a dozen bytes to a pristine + // connection something must be really wrong. Killing + // it. + conn.kill(); + warn!("vsock: unable to fully write connection ack msg."); + } + Err(err) => { + conn.kill(); + warn!("vsock: unable to ack host connection [local_cid {}, peer_cid {}, local_port {}, peer_port {}]: {:?}", conn.local_cid, conn.peer_cid, conn.local_port, conn.peer_port, err); + } + }; + } + + // If the connection wasn't previously scheduled for RX, add it to + // our RX queue. + if !had_rx && conn.has_pending_rx() { + self.rxq.push(MuxerRx::ConnRx(key)); + } + + // If the connection wasn't previously scheduled for termination, + // add it to the kill queue. + if !was_expiring && conn.will_expire() { + // It's safe to unwrap here, since `conn.will_expire()` already + // guaranteed that an `conn.expiry` is available. + self.killq.push(key, conn.expiry().unwrap()); + } + + let fd = conn.as_raw_fd(); + let new_evset = conn.get_polled_evset(); + if new_evset.is_empty() { + // If the connection no longer needs epoll notifications, remove + // its listener from our list. + self.remove_listener(fd); + return; + } + if let Some(EpollListener::Connection { evset, .. }) = self.listener_map.get_mut(&fd) { + if *evset != new_evset { + // If the set of events that the connection is interested in + // has changed, we need to update its epoll listener. + debug!( + "vsock: updating listener for (lp={}, pp={}): old={:?}, new={:?}", + key.local_port, key.peer_port, *evset, new_evset + ); + + *evset = new_evset; + epoll::ctl( + self.epoll_fd, + epoll::ControlOptions::EPOLL_CTL_MOD, + fd, + epoll::Event::new(new_evset, fd as u64), + ) + .unwrap_or_else(|err| { + // This really shouldn't happen, like, ever. 
However, + // "famous last words" and all that, so let's just kill + // it with fire, and walk away. + self.kill_connection(key); + warn!( + "vsock: error updating epoll listener for (lp={}, pp={}): {:?}", + key.local_port, key.peer_port, err + ); + }); + } + } else { + // The connection had previously asked to be removed from the + // listener map (by returning an empty event set via + // `get_polled_fd()`), but now wants back in. + self.add_listener( + fd, + EpollListener::Connection { + key, + evset: new_evset, + backend: backend_type, + }, + ) + .unwrap_or_else(|err| { + self.kill_connection(key); + warn!( + "vsock: error updating epoll listener for (lp={}, pp={}): {:?}", + key.local_port, key.peer_port, err + ); + }); + } + } + } + + /// Check if any connections have timed out, and if so, schedule them for + /// immediate termination. + fn sweep_killq(&mut self) { + while let Some(key) = self.killq.pop() { + // Connections don't get removed from the kill queue when their kill + // timer is disarmed, since that would be a costly operation. This + // means we must check if the connection has indeed expired, prior + // to killing it. + let mut kill = false; + self.conn_map + .entry(key) + .and_modify(|conn| kill = conn.has_expired()); + if kill { + self.kill_connection(key); + } + } + + if self.killq.is_empty() && !self.killq.is_synced() { + self.killq = MuxerKillQ::from_conn_map(&self.conn_map); + // If we've just re-created the kill queue, we can sweep it again; + // maybe there's more to kill. + self.sweep_killq(); + } + } + + /// Enqueue an RST packet into `self.rxq`. + /// + /// Enqueue errors aren't propagated up the call chain, since there is + /// nothing we can do to handle them. We do, however, log a warning, since + /// not being able to enqueue an RST packet means we have to drop it, which + /// is not normal operation. 
+ fn enq_rst(&mut self, local_port: u32, peer_port: u32) { + let pushed = self.rxq.push(MuxerRx::RstPkt { + local_port, + peer_port, + }); + if !pushed { + warn!( + "vsock: muxer.rxq full; dropping RST packet for lp={}, pp={}", + local_port, peer_port + ); + } + } +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::io::{Read, Write}; + use std::ops::Drop; + use std::os::unix::net::{UnixListener, UnixStream}; + use std::path::{Path, PathBuf}; + + use virtio_queue::QueueT; + use vmm_sys_util::tempfile::TempFile; + + use super::super::super::backend::VsockUnixStreamBackend; + use super::super::super::csm::defs as csm_defs; + use super::super::super::defs::RXQ_EVENT; + use super::super::super::tests::TestContext as VsockTestContext; + use super::*; + + const PEER_CID: u64 = 3; + const PEER_BUF_ALLOC: u32 = 64 * 1024; + + struct MuxerTestContext { + _vsock_test_ctx: VsockTestContext, + pkt: VsockPacket, + muxer: VsockMuxer, + host_sock_path: String, + } + + impl Drop for MuxerTestContext { + fn drop(&mut self) { + std::fs::remove_file(self.host_sock_path.as_str()).unwrap(); + } + } + + // Create a TempFile with a given prefix and return it as a nice String + fn get_file(fprefix: &str) -> String { + let listener_path = TempFile::new_with_prefix(fprefix).unwrap(); + listener_path + .as_path() + .as_os_str() + .to_str() + .unwrap() + .to_owned() + } + + impl MuxerTestContext { + fn new(name: &str) -> Self { + let vsock_test_ctx = VsockTestContext::new(); + let mut handler_ctx = vsock_test_ctx.create_event_handler_context(); + let pkt = VsockPacket::from_rx_virtq_head( + &mut handler_ctx.queues[RXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&vsock_test_ctx.mem) + .unwrap(), + ) + .unwrap(); + + let host_sock_path = get_file(name); + let mut muxer = VsockMuxer::new(PEER_CID).unwrap(); + let uds_backend = + Box::new(VsockUnixStreamBackend::new(host_sock_path.clone()).unwrap()); + muxer.add_backend(uds_backend, true).unwrap(); + Self { + _vsock_test_ctx: 
vsock_test_ctx, + pkt, + muxer, + host_sock_path, + } + } + + fn init_pkt(&mut self, local_port: u32, peer_port: u32, op: u16) -> &mut VsockPacket { + for b in self.pkt.hdr_mut() { + *b = 0; + } + self.pkt + .set_type(uapi::VSOCK_TYPE_STREAM) + .set_src_cid(PEER_CID) + .set_dst_cid(uapi::VSOCK_HOST_CID) + .set_src_port(peer_port) + .set_dst_port(local_port) + .set_op(op) + .set_buf_alloc(PEER_BUF_ALLOC) + } + + fn init_data_pkt( + &mut self, + local_port: u32, + peer_port: u32, + data: &[u8], + ) -> &mut VsockPacket { + assert!(data.len() <= self.pkt.buf().unwrap().len()); + self.init_pkt(local_port, peer_port, uapi::VSOCK_OP_RW) + .set_len(data.len() as u32); + self.pkt.buf_mut().unwrap()[..data.len()].copy_from_slice(data); + &mut self.pkt + } + + fn send(&mut self) { + self.muxer.send_pkt(&self.pkt).unwrap(); + } + + fn recv(&mut self) { + self.muxer.recv_pkt(&mut self.pkt).unwrap(); + } + + fn notify_muxer(&mut self) { + self.muxer.notify(epoll::Events::EPOLLIN); + } + + fn count_epoll_listeners(&self) -> (usize, usize) { + let mut local_lsn_count = 0usize; + let mut conn_lsn_count = 0usize; + for key in self.muxer.listener_map.values() { + match key { + EpollListener::LocalStream(_) => local_lsn_count += 1, + EpollListener::Connection { .. } => conn_lsn_count += 1, + _ => (), + }; + } + (local_lsn_count, conn_lsn_count) + } + + fn create_local_listener(&self, port: u32) -> LocalListener { + LocalListener::new(format!("{}_{}", self.host_sock_path, port)) + } + + fn local_connect(&mut self, peer_port: u32) -> (UnixStream, u32) { + let (init_local_lsn_count, init_conn_lsn_count) = self.count_epoll_listeners(); + + let mut stream = UnixStream::connect(self.host_sock_path.clone()).unwrap(); + stream.set_nonblocking(true).unwrap(); + // The muxer would now get notified of a new connection having arrived at its Unix + // socket, so it can accept it. 
+ self.notify_muxer(); + + // Just after having accepted a new local connection, the muxer should've added a new + // `LocalStream` listener to its `listener_map`. + let (local_lsn_count, _) = self.count_epoll_listeners(); + assert_eq!(local_lsn_count, init_local_lsn_count + 1); + + let buf = format!("CONNECT {peer_port}\n"); + stream.write_all(buf.as_bytes()).unwrap(); + // The muxer would now get notified that data is available for reading from the locally + // initiated connection. + self.notify_muxer(); + + // Successfully reading and parsing the connection request should have removed the + // LocalStream epoll listener and added a Connection epoll listener. + let (local_lsn_count, conn_lsn_count) = self.count_epoll_listeners(); + assert_eq!(local_lsn_count, init_local_lsn_count); + assert_eq!(conn_lsn_count, init_conn_lsn_count + 1); + + // A LocalInit connection should've been added to the muxer connection map. A new + // local port should also have been allocated for the new LocalInit connection. + let local_port = self.muxer.local_port_last; + let key = ConnMapKey { + local_port, + peer_port, + }; + assert!(self.muxer.conn_map.contains_key(&key)); + assert!(self.muxer.local_port_set.contains(&local_port)); + + // A connection request for the peer should now be available from the muxer. 
+ assert!(self.muxer.has_pending_rx()); + self.recv(); + assert_eq!(self.pkt.op(), uapi::VSOCK_OP_REQUEST); + assert_eq!(self.pkt.dst_port(), peer_port); + assert_eq!(self.pkt.src_port(), local_port); + + self.init_pkt(local_port, peer_port, uapi::VSOCK_OP_RESPONSE); + self.send(); + + let mut buf = vec![0u8; 32]; + let len = stream.read(&mut buf[..]).unwrap(); + assert_eq!(&buf[..len], format!("OK {local_port}\n").as_bytes()); + + (stream, local_port) + } + } + + struct LocalListener { + path: PathBuf, + sock: UnixListener, + } + impl LocalListener { + fn new + Clone>(path: P) -> Self { + let path_buf = path.as_ref().to_path_buf(); + let sock = UnixListener::bind(path).unwrap(); + sock.set_nonblocking(true).unwrap(); + Self { + path: path_buf, + sock, + } + } + fn accept(&mut self) -> UnixStream { + let (stream, _) = self.sock.accept().unwrap(); + stream.set_nonblocking(true).unwrap(); + stream + } + } + impl Drop for LocalListener { + fn drop(&mut self) { + std::fs::remove_file(&self.path).unwrap(); + } + } + + #[test] + fn test_muxer_epoll_listener() { + let ctx = MuxerTestContext::new("/tmp/muxer_epoll_listener"); + assert_eq!(ctx.muxer.as_raw_fd(), ctx.muxer.epoll_fd); + assert_eq!(ctx.muxer.get_polled_evset(), epoll::Events::EPOLLIN); + } + + #[test] + fn test_bad_peer_pkt() { + const LOCAL_PORT: u32 = 1026; + const PEER_PORT: u32 = 1025; + const SOCK_DGRAM: u16 = 2; + + let mut ctx = MuxerTestContext::new("/tmp/bad_peer_pkt"); + ctx.init_pkt(LOCAL_PORT, PEER_PORT, uapi::VSOCK_OP_REQUEST) + .set_type(SOCK_DGRAM); + ctx.send(); + + // The guest sent a SOCK_DGRAM packet. Per the vsock spec, we need to reply with an RST + // packet, since vsock only supports stream sockets. 
+ assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + assert_eq!(ctx.pkt.src_cid(), uapi::VSOCK_HOST_CID); + assert_eq!(ctx.pkt.dst_cid(), PEER_CID); + assert_eq!(ctx.pkt.src_port(), LOCAL_PORT); + assert_eq!(ctx.pkt.dst_port(), PEER_PORT); + + // Any orphan (i.e. without a connection), non-RST packet, should be replied to with an + // RST. + let bad_ops = [ + uapi::VSOCK_OP_RESPONSE, + uapi::VSOCK_OP_CREDIT_REQUEST, + uapi::VSOCK_OP_CREDIT_UPDATE, + uapi::VSOCK_OP_SHUTDOWN, + uapi::VSOCK_OP_RW, + ]; + for op in bad_ops.iter() { + ctx.init_pkt(LOCAL_PORT, PEER_PORT, *op); + ctx.send(); + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + assert_eq!(ctx.pkt.src_port(), LOCAL_PORT); + assert_eq!(ctx.pkt.dst_port(), PEER_PORT); + } + + // Any packet addressed to anything other than VSOCK_VHOST_CID should get dropped. + assert!(!ctx.muxer.has_pending_rx()); + ctx.init_pkt(LOCAL_PORT, PEER_PORT, uapi::VSOCK_OP_REQUEST) + .set_dst_cid(uapi::VSOCK_HOST_CID + 1); + ctx.send(); + assert!(!ctx.muxer.has_pending_rx()); + } + + #[test] + fn test_peer_connection() { + const LOCAL_PORT: u32 = 1026; + const PEER_PORT: u32 = 1025; + + let mut ctx = MuxerTestContext::new("/tmp/peer_connection"); + + // Test peer connection refused. + ctx.init_pkt(LOCAL_PORT, PEER_PORT, uapi::VSOCK_OP_REQUEST); + ctx.send(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + assert_eq!(ctx.pkt.len(), 0); + assert_eq!(ctx.pkt.src_cid(), uapi::VSOCK_HOST_CID); + assert_eq!(ctx.pkt.dst_cid(), PEER_CID); + assert_eq!(ctx.pkt.src_port(), LOCAL_PORT); + assert_eq!(ctx.pkt.dst_port(), PEER_PORT); + + // Test peer connection accepted. 
+ let mut listener = ctx.create_local_listener(LOCAL_PORT); + ctx.init_pkt(LOCAL_PORT, PEER_PORT, uapi::VSOCK_OP_REQUEST); + ctx.send(); + assert_eq!(ctx.muxer.conn_map.len(), 1); + let mut stream = listener.accept(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + assert_eq!(ctx.pkt.len(), 0); + assert_eq!(ctx.pkt.src_cid(), uapi::VSOCK_HOST_CID); + assert_eq!(ctx.pkt.dst_cid(), PEER_CID); + assert_eq!(ctx.pkt.src_port(), LOCAL_PORT); + assert_eq!(ctx.pkt.dst_port(), PEER_PORT); + let key = ConnMapKey { + local_port: LOCAL_PORT, + peer_port: PEER_PORT, + }; + assert!(ctx.muxer.conn_map.contains_key(&key)); + + // Test guest -> host data flow. + let data = [1, 2, 3, 4]; + ctx.init_data_pkt(LOCAL_PORT, PEER_PORT, &data); + ctx.send(); + let mut buf = vec![0; data.len()]; + stream.read_exact(buf.as_mut_slice()).unwrap(); + assert_eq!(buf.as_slice(), data); + + // Test host -> guest data flow. + let data = [5u8, 6, 7, 8]; + stream.write_all(&data).unwrap(); + + // When data is available on the local stream, an EPOLLIN event would normally be delivered + // to the muxer's nested epoll FD. For testing only, we can fake that event notification + // here. + ctx.notify_muxer(); + // After being notified, the muxer should've figured out that RX data was available for one + // of its connections, so it should now be reporting that it can fill in an RX packet. + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RW); + assert_eq!(ctx.pkt.buf().unwrap()[..data.len()], data); + assert_eq!(ctx.pkt.src_port(), LOCAL_PORT); + assert_eq!(ctx.pkt.dst_port(), PEER_PORT); + + assert!(!ctx.muxer.has_pending_rx()); + } + + #[test] + fn test_local_connection() { + let mut ctx = MuxerTestContext::new("/tmp/local_connection"); + let peer_port = 1025; + let (mut stream, local_port) = ctx.local_connect(peer_port); + + // Test guest -> host data flow. 
+ let data = [1, 2, 3, 4]; + ctx.init_data_pkt(local_port, peer_port, &data); + ctx.send(); + + let mut buf = vec![0u8; data.len()]; + stream.read_exact(buf.as_mut_slice()).unwrap(); + assert_eq!(buf.as_slice(), &data); + + // Test host -> guest data flow. + let data = [5, 6, 7, 8]; + stream.write_all(&data).unwrap(); + ctx.notify_muxer(); + + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RW); + assert_eq!(ctx.pkt.src_port(), local_port); + assert_eq!(ctx.pkt.dst_port(), peer_port); + assert_eq!(ctx.pkt.buf().unwrap()[..data.len()], data); + } + + #[test] + fn test_local_close() { + let peer_port = 1025; + let mut ctx = MuxerTestContext::new("/tmp/local_close"); + let local_port; + { + let (_stream, local_port_) = ctx.local_connect(peer_port); + local_port = local_port_; + } + // Local var `_stream` was now dropped, thus closing the local stream. After the muxer gets + // notified via EPOLLIN, it should attempt to gracefully shutdown the connection, issuing a + // VSOCK_OP_SHUTDOWN with both no-more-send and no-more-recv indications set. + ctx.notify_muxer(); + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_SHUTDOWN); + assert_ne!(ctx.pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_SEND, 0); + assert_ne!(ctx.pkt.flags() & uapi::VSOCK_FLAGS_SHUTDOWN_RCV, 0); + assert_eq!(ctx.pkt.src_port(), local_port); + assert_eq!(ctx.pkt.dst_port(), peer_port); + + // The connection should get removed (and its local port freed), after the peer replies + // with an RST. 
+ ctx.init_pkt(local_port, peer_port, uapi::VSOCK_OP_RST); + ctx.send(); + let key = ConnMapKey { + local_port, + peer_port, + }; + assert!(!ctx.muxer.conn_map.contains_key(&key)); + assert!(!ctx.muxer.local_port_set.contains(&local_port)); + } + + #[test] + fn test_peer_close() { + let peer_port = 1025; + let local_port = 1026; + let mut ctx = MuxerTestContext::new("/tmp/peer_close"); + + let mut sock = ctx.create_local_listener(local_port); + ctx.init_pkt(local_port, peer_port, uapi::VSOCK_OP_REQUEST); + ctx.send(); + let mut stream = sock.accept(); + + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + assert_eq!(ctx.pkt.src_port(), local_port); + assert_eq!(ctx.pkt.dst_port(), peer_port); + let key = ConnMapKey { + local_port, + peer_port, + }; + assert!(ctx.muxer.conn_map.contains_key(&key)); + + // Emulate a full shutdown from the peer (no-more-send + no-more-recv). + ctx.init_pkt(local_port, peer_port, uapi::VSOCK_OP_SHUTDOWN) + .set_flag(uapi::VSOCK_FLAGS_SHUTDOWN_SEND) + .set_flag(uapi::VSOCK_FLAGS_SHUTDOWN_RCV); + ctx.send(); + + // Now, the muxer should remove the connection from its map, and reply with an RST. + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + assert_eq!(ctx.pkt.src_port(), local_port); + assert_eq!(ctx.pkt.dst_port(), peer_port); + let key = ConnMapKey { + local_port, + peer_port, + }; + assert!(!ctx.muxer.conn_map.contains_key(&key)); + + // The muxer should also drop / close the local Unix socket for this connection. 
+ let mut buf = vec![0u8; 16]; + assert_eq!(stream.read(buf.as_mut_slice()).unwrap(), 0); + } + + #[test] + fn test_muxer_rxq() { + let mut ctx = MuxerTestContext::new("/tmp/muxer_rxq"); + let local_port = 1026; + let peer_port_first = 1025; + let mut listener = ctx.create_local_listener(local_port); + let mut streams: Vec = Vec::new(); + + for peer_port in peer_port_first..peer_port_first + defs::MUXER_RXQ_SIZE { + ctx.init_pkt(local_port, peer_port as u32, uapi::VSOCK_OP_REQUEST); + ctx.send(); + streams.push(listener.accept()); + } + + // The muxer RX queue should now be full (with connection reponses), but still + // synchronized. + assert!(ctx.muxer.rxq.is_synced()); + + // One more queued reply should desync the RX queue. + ctx.init_pkt( + local_port, + (peer_port_first + defs::MUXER_RXQ_SIZE) as u32, + uapi::VSOCK_OP_REQUEST, + ); + ctx.send(); + assert!(!ctx.muxer.rxq.is_synced()); + + // With an out-of-sync queue, an RST should evict any non-RST packet from the queue, and + // take its place. We'll check that by making sure that the last packet popped from the + // queue is an RST. + ctx.init_pkt( + local_port + 1, + peer_port_first as u32, + uapi::VSOCK_OP_REQUEST, + ); + ctx.send(); + + for peer_port in peer_port_first..peer_port_first + defs::MUXER_RXQ_SIZE - 1 { + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + // The response order should hold. The evicted response should have been the last + // enqueued. + assert_eq!(ctx.pkt.dst_port(), peer_port as u32); + } + // There should be one more packet in the queue: the RST. + assert_eq!(ctx.muxer.rxq.len(), 1); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + + // The queue should now be empty, but out-of-sync, so the muxer should report it has some + // pending RX. + assert!(ctx.muxer.rxq.is_empty()); + assert!(!ctx.muxer.rxq.is_synced()); + assert!(ctx.muxer.has_pending_rx()); + + // The next recv should sync the queue back up. 
It should also yield one of the two + // responses that are still left: + // - the one that desynchronized the queue; and + // - the one that got evicted by the RST. + ctx.recv(); + assert!(ctx.muxer.rxq.is_synced()); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + } + + #[test] + fn test_muxer_killq() { + let mut ctx = MuxerTestContext::new("/tmp/muxer_killq"); + let local_port = 1026; + let peer_port_first = 1025; + let peer_port_last = peer_port_first + defs::MUXER_KILLQ_SIZE; + let mut listener = ctx.create_local_listener(local_port); + + for peer_port in peer_port_first..=peer_port_last { + ctx.init_pkt(local_port, peer_port as u32, uapi::VSOCK_OP_REQUEST); + ctx.send(); + ctx.notify_muxer(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + assert_eq!(ctx.pkt.src_port(), local_port); + assert_eq!(ctx.pkt.dst_port(), peer_port as u32); + { + let _stream = listener.accept(); + } + ctx.notify_muxer(); + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_SHUTDOWN); + assert_eq!(ctx.pkt.src_port(), local_port); + assert_eq!(ctx.pkt.dst_port(), peer_port as u32); + // The kill queue should be synchronized, up until the `defs::MUXER_KILLQ_SIZE`th + // connection we schedule for termination. + assert_eq!( + ctx.muxer.killq.is_synced(), + peer_port < peer_port_first + defs::MUXER_KILLQ_SIZE + ); + } + + assert!(!ctx.muxer.killq.is_synced()); + assert!(!ctx.muxer.has_pending_rx()); + + // Wait for the kill timers to expire. + std::thread::sleep(std::time::Duration::from_millis( + csm_defs::CONN_SHUTDOWN_TIMEOUT_MS, + )); + + // Trigger a kill queue sweep, by requesting a new connection. 
+ ctx.init_pkt( + local_port, + peer_port_last as u32 + 1, + uapi::VSOCK_OP_REQUEST, + ); + ctx.send(); + + // After sweeping the kill queue, it should now be synced (assuming the RX queue is larger + // than the kill queue, since an RST packet will be queued for each killed connection). + assert!(ctx.muxer.killq.is_synced()); + assert!(ctx.muxer.has_pending_rx()); + // There should be `defs::MUXER_KILLQ_SIZE` RSTs in the RX queue, from terminating the + // dying connections in the recent killq sweep. + for _p in peer_port_first..peer_port_last { + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RST); + assert_eq!(ctx.pkt.src_port(), local_port); + } + + // There should be one more packet in the RX queue: the connection response our request + // that triggered the kill queue sweep. + ctx.recv(); + assert_eq!(ctx.pkt.op(), uapi::VSOCK_OP_RESPONSE); + assert_eq!(ctx.pkt.dst_port(), peer_port_last as u32 + 1); + + assert!(!ctx.muxer.has_pending_rx()); + } + + #[test] + fn test_regression_handshake() { + // Address one of the issues found while fixing the following issue: + // https://github.com/firecracker-microvm/firecracker/issues/1751 + // This test checks that the handshake message is not accounted for + let mut ctx = MuxerTestContext::new("/tmp/regression_handshake"); + let peer_port = 1025; + + // Create a local connection. + let (_, local_port) = ctx.local_connect(peer_port); + + // Get the connection from the connection map. + let key = ConnMapKey { + local_port, + peer_port, + }; + let conn = ctx.muxer.conn_map.get_mut(&key).unwrap(); + + // Check that fwd_cnt is 0 - "OK ..." was not accounted for. 
+ assert_eq!(conn.fwd_cnt().0, 0); + } + + #[test] + fn test_regression_rxq_pop() { + // Address one of the issues found while fixing the following issue: + // https://github.com/firecracker-microvm/firecracker/issues/1751 + // This test checks that a connection is not popped out of the muxer + // rxq when multiple flags are set + let mut ctx = MuxerTestContext::new("/tmp/regression_rxq_pop"); + let peer_port = 1025; + let (mut stream, local_port) = ctx.local_connect(peer_port); + + // Send some data. + let data = [5u8, 6, 7, 8]; + stream.write_all(&data).unwrap(); + ctx.notify_muxer(); + + // Get the connection from the connection map. + let key = ConnMapKey { + local_port, + peer_port, + }; + let conn = ctx.muxer.conn_map.get_mut(&key).unwrap(); + + // Forcefully insert another flag. + conn.insert_credit_update(); + + // Call recv twice in order to check that the connection is still + // in the rxq. + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + assert!(ctx.muxer.has_pending_rx()); + ctx.recv(); + + // Since initially the connection had two flags set, now there should + // not be any pending RX in the muxer. 
+ assert!(!ctx.muxer.has_pending_rx()); + } + + #[test] + fn test_add_backend_to_muxer() { + let host_sock_path_1 = String::from("/tmp/host_sock_path_muxer_1_1"); + let host_sock_path_2 = String::from("/tmp/host_sock_path_muxer_1_2"); + let host_sock_path_3 = String::from("/tmp/host_sock_path_muxer_1_3"); + fs::remove_file(Path::new(&host_sock_path_1)).unwrap_or_default(); + fs::remove_file(Path::new(&host_sock_path_2)).unwrap_or_default(); + fs::remove_file(Path::new(&host_sock_path_3)).unwrap_or_default(); + + let mut muxer_1 = VsockMuxer::new(PEER_CID).unwrap(); + let uds_backend_1 = + Box::new(VsockUnixStreamBackend::new(host_sock_path_1.clone()).unwrap()); + let uds_backend_2 = + Box::new(VsockUnixStreamBackend::new(host_sock_path_2.clone()).unwrap()); + + // add uds backend, ok + assert!(muxer_1.add_backend(uds_backend_1, false).is_ok()); + // add another uds backend, err + assert!(muxer_1.add_backend(uds_backend_2, false).is_err()); + + let mut muxer_2 = VsockMuxer::new(PEER_CID).unwrap(); + let uds_backend_3 = + Box::new(VsockUnixStreamBackend::new(host_sock_path_3.clone()).unwrap()); + assert!(muxer_2.add_backend(uds_backend_3, true).is_ok()); + // peer_backend need to be uds backend + assert!(muxer_2.peer_backend == Some(VsockBackendType::UnixStream)); + + fs::remove_file(Path::new(&host_sock_path_1)).unwrap_or_default(); + fs::remove_file(Path::new(&host_sock_path_2)).unwrap_or_default(); + fs::remove_file(Path::new(&host_sock_path_3)).unwrap_or_default(); + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_killq.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_killq.rs new file mode 100644 index 000000000..427fd42c5 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_killq.rs @@ -0,0 +1,157 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 +// + +/// `MuxerKillQ` implements a helper object that `VsockMuxer` can use for +/// scheduling forced connection termination. I.e. after one peer issues a clean +/// shutdown request (VSOCK_OP_SHUTDOWN), the concerned connection is queued for +/// termination (VSOCK_OP_RST) in the near future (herein implemented via an +/// expiring timer). +/// +/// Whenever the muxer needs to schedule a connection for termination, it pushes +/// it (or rather an identifier - the connection key) to this queue. A +/// subsequent pop() operation will succeed if and only if the first connection +/// in the queue is ready to be terminated (i.e. its kill timer expired). +/// +/// Without using this queue, the muxer would have to walk its entire connection +/// pool (hashmap), whenever it needs to check for expired kill timers. With +/// this queue, both scheduling and termination are performed in constant time. +/// However, since we don't want to waste space on a kill queue that's as big as +/// the connection hashmap itself, it is possible that this queue may become +/// full at times. We call this kill queue "synchronized" if we are certain that +/// all connections that are awaiting termination are present in the queue. This +/// means a simple constant-time pop() operation is enough to check wether any +/// connections need to be terminated. When the kill queue becomes full, though, +/// pushing fails, so connections that should be terminated are left out. The +/// queue is not synchronized anymore. When that happens, the muxer will first +/// drain the queue, and then replace it with a new queue, created by walking +/// the connection pool, looking for connections that will be expiring in the +/// future. 
+use std::collections::{HashMap, VecDeque}; +use std::time::Instant; + +use super::super::csm::VsockConnection; +use super::defs; +use super::muxer_impl::ConnMapKey; + +/// A kill queue item, holding the connection key and the scheduled time for +/// termination. +#[derive(Clone, Copy, Debug)] +pub struct MuxerKillQItem { + pub(crate) key: ConnMapKey, + pub(crate) kill_time: Instant, +} + +impl PartialEq for MuxerKillQItem { + fn eq(&self, other: &MuxerKillQItem) -> bool { + // Time error within 10ms is considered no problem + if let Some(duration) = self.kill_time.checked_duration_since(other.kill_time) { + if duration.as_millis() > 10 { + return false; + } + } else if let Some(duration) = other.kill_time.checked_duration_since(self.kill_time) { + if duration.as_millis() > 10 { + return false; + } + } else { + return false; + } + + self.key == other.key + } +} + +/// The connection kill queue: a FIFO structure, storing the connections that +/// are scheduled for termination. +#[derive(PartialEq)] +pub struct MuxerKillQ { + /// The kill queue contents. + pub(crate) q: VecDeque, + + /// The kill queue sync status: + /// - when true, all connections that are awaiting termination are + /// guaranteed to be in this queue; + /// - when false, some connections may have been left out. + pub(crate) synced: bool, +} + +impl MuxerKillQ { + const SIZE: usize = defs::MUXER_KILLQ_SIZE; + + /// Create a kill queue by walking the connection pool, looking for + /// connections that are set to expire at some point in the future. + /// + /// Note: if more than `Self::SIZE` connections are found, the queue will be + /// created in an out-of-sync state, and will be discarded after it is + /// emptied. 
+ pub fn from_conn_map(conn_map: &HashMap) -> Self { + let mut q_buf: Vec = Vec::with_capacity(Self::SIZE); + let mut synced = true; + for (key, conn) in conn_map.iter() { + if !conn.will_expire() { + continue; + } + if q_buf.len() >= Self::SIZE { + synced = false; + break; + } + q_buf.push(MuxerKillQItem { + key: *key, + kill_time: conn.expiry().unwrap(), + }); + } + q_buf.sort_unstable_by_key(|it| it.kill_time); + Self { + q: q_buf.into(), + synced, + } + } + + /// Push a connection key to the queue, scheduling it for termination at + /// `CONN_SHUTDOWN_TIMEOUT_MS` from now (the push time). + pub fn push(&mut self, key: ConnMapKey, kill_time: Instant) { + if !self.is_synced() || self.is_full() { + self.synced = false; + return; + } + self.q.push_back(MuxerKillQItem { key, kill_time }); + } + + /// Attempt to pop an expired connection from the kill queue. + /// + /// This will succeed and return a connection key, only if the connection at + /// the front of the queue has expired. Otherwise, `None` is returned. + pub fn pop(&mut self) -> Option { + if let Some(item) = self.q.front() { + if Instant::now() > item.kill_time { + return Some(self.q.pop_front().unwrap().key); + } + } + None + } + + /// Check if the kill queue is synchronized with the connection pool. + pub fn is_synced(&self) -> bool { + self.synced + } + + /// Check if the kill queue is empty, obviously. + pub fn is_empty(&self) -> bool { + self.q.len() == 0 + } + + /// Check if the kill queue is full. + pub fn is_full(&self) -> bool { + self.q.len() == Self::SIZE + } +} + +impl Default for MuxerKillQ { + /// Trivial kill queue constructor. 
+ fn default() -> Self { + Self { + q: VecDeque::with_capacity(Self::SIZE), + synced: true, + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_rxq.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_rxq.rs new file mode 100644 index 000000000..fd4723b0b --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/muxer/muxer_rxq.rs @@ -0,0 +1,146 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// + +/// `MuxerRxQ` implements a helper object that `VsockMuxer` can use for queuing +/// RX (host -> guest) packets (or rather instructions on how to build said +/// packets). +/// +/// Under ideal operation, every connection that has pending RX data will be +/// present in the muxer RX queue. However, since the RX queue is smaller than +/// the connection pool, it may, under some conditions, become full, meaning +/// that it can no longer account for all the connections that can yield RX +/// data. When that happens, we say that it is no longer "synchronized" (i.e. +/// with the connection pool). A desynchronized RX queue still holds valid data, +/// and the muxer will continue to pop packets from it. However, when a +/// desynchronized queue is drained, additional data may still be available, so +/// the muxer will have to perform a more costly walk of the entire connection +/// pool to find it. This walk is also implemented here, and it is part of the +/// resynchronization procedure: inspect all connections, and add every +/// connection that has pending RX data to the RX queue. +use std::collections::{HashMap, VecDeque}; + +use super::super::csm::VsockConnection; +use super::super::VsockChannel; +use super::defs; +use super::muxer_impl::{ConnMapKey, MuxerRx}; + +/// The muxer RX queue. +#[derive(Eq, PartialEq)] +pub struct MuxerRxQ { + /// The RX queue data. + pub(crate) q: VecDeque, + /// The RX queue sync status. 
+ pub(crate) synced: bool, +} + +impl MuxerRxQ { + const SIZE: usize = defs::MUXER_RXQ_SIZE; + + /// Attempt to build an RX queue, that is synchronized to the connection + /// pool. + /// + /// Note: the resulting queue may still be desynchronized, if there are too + /// many connections that have pending RX data. In that case, the + /// muxer will first drain this queue, and then try again to build a + /// synchronized one. + pub fn from_conn_map(conn_map: &HashMap) -> Self { + let mut q = VecDeque::new(); + let mut synced = true; + + for (key, conn) in conn_map.iter() { + if !conn.has_pending_rx() { + continue; + } + if q.len() >= Self::SIZE { + synced = false; + break; + } + q.push_back(MuxerRx::ConnRx(*key)); + } + Self { q, synced } + } + + /// Push a new RX item to the queue. + /// + /// A push will fail when: + /// - trying to push a connection key onto an out-of-sync, or full queue; or + /// - trying to push an RST onto a queue already full of RSTs. + /// + /// RSTs take precedence over connections, because connections can always be + /// queried for pending RX data later. Aside from this queue, there is no + /// other storage for RSTs, so, failing to push one means that we have to + /// drop the packet. + /// + /// Returns: + /// - `true` if the new item has been successfully queued; or + /// - `false` if there was no room left in the queue. + pub fn push(&mut self, rx: MuxerRx) -> bool { + // Pushing to a non-full, synchronized queue will always succeed. + if self.is_synced() && !self.is_full() { + self.q.push_back(rx); + return true; + } + + match rx { + MuxerRx::RstPkt { .. } => { + // If we just failed to push an RST packet, we'll look through + // the queue, trying to find a connection key that we could + // evict. This way, the queue does lose sync, but we don't drop + // any packets. 
+ for qi in self.q.iter_mut().rev() { + if let MuxerRx::ConnRx(_) = qi { + *qi = rx; + self.synced = false; + return true; + } + } + } + MuxerRx::ConnRx(_) => { + self.synced = false; + } + }; + + false + } + + /// Peek into the front of the queue. + pub fn peek(&self) -> Option { + self.q.front().copied() + } + + /// Pop an RX item from the front of the queue. + pub fn pop(&mut self) -> Option { + self.q.pop_front() + } + + /// Check if the RX queue is synchronized with the connection pool. + pub fn is_synced(&self) -> bool { + self.synced + } + + /// Get the total number of items in the queue. + pub fn len(&self) -> usize { + self.q.len() + } + + /// Check if the queue is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Check if the queue is full. + pub fn is_full(&self) -> bool { + self.len() == Self::SIZE + } +} + +/// Trivial RX queue constructor. +impl Default for MuxerRxQ { + fn default() -> Self { + Self { + q: VecDeque::with_capacity(Self::SIZE), + synced: true, + } + } +} diff --git a/src/dragonball/src/dbs_virtio_devices/src/vsock/packet.rs b/src/dragonball/src/dbs_virtio_devices/src/vsock/packet.rs new file mode 100644 index 000000000..bbdd5f382 --- /dev/null +++ b/src/dragonball/src/dbs_virtio_devices/src/vsock/packet.rs @@ -0,0 +1,763 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// `VsockPacket` provides a thin wrapper over the buffers exchanged via virtio +/// queues. There are two components to a vsock packet, each using its own +/// descriptor in a virtio queue: +/// - the packet header; and +/// - the packet data/buffer. +/// +/// There is a 1:1 relation between descriptor chains and packets: the first +/// (chain head) holds the header, and an optional second descriptor holds the +/// data. The second descriptor is only present for data packets (VSOCK_OP_RW). 
+/// +/// `VsockPacket` wraps these two buffers and provides direct access to the data +/// stored in guest memory. This is done to avoid unnecessarily copying data +/// from guest memory to temporary buffers, before passing it on to the vsock +/// backend. +use std::ops::{Deref, DerefMut}; + +use virtio_queue::{Descriptor, DescriptorChain}; +use vm_memory::GuestMemory; + +use super::defs; +use super::{Result, VsockError}; + +/// The vsock packet header. +// +// The vsock packet header is defined by the C struct: +// +// ```C +// struct virtio_vsock_hdr { +// le64 src_cid; +// le64 dst_cid; +// le32 src_port; +// le32 dst_port; +// le32 len; +// le16 type; +// le16 op; +// le32 flags; +// le32 buf_alloc; +// le32 fwd_cnt; +// } __attribute__((packed)); +// ``` +// +// NOTE: this needs to be marked as repr(C), so we can predict its exact layout +// in memory, since we'll be using guest-provided pointers for access. The Linux +// UAPI headers define this struct as packed, but, in this particular case, +// packing only eliminates 4 trailing padding bytes. Declaring this struct as +// packed would also reduce its alignment to 1, which gets the Rust compiler all +// fidgety. Little does it know, the guest driver already aligned the structure +// properly, so we don't need to worry about alignment. That said, we'll be +// going with only repr(C) (no packing), and hard-coding the struct size as +// `VSOCK_PKT_HDR_SIZE`, since, given this particular layout, the first +// `VSOCK_PKT_HDR_SIZE` bytes are the same in both the packed and unpacked +// layouts. +// +// All fields use the little-endian byte order. Since we're only thinly wrapping +// a pointer to where the guest driver stored the packet header, let's restrict +// this to little-endian targets. +#[cfg(target_endian = "little")] +#[derive(Clone, Copy, Debug, Default)] +#[repr(C)] +pub struct VsockPacketHdr { + /// Source CID. + pub src_cid: u64, + /// Destination CID. + pub dst_cid: u64, + /// Source port. 
+ pub src_port: u32, + /// Destination port. + pub dst_port: u32, + /// Data length (in bytes) - may be 0, if there is now data buffer. + pub len: u32, + /// Socket type. Currently, only connection-oriented streams are defined by + /// the vsock protocol. + pub type_: u16, + /// Operation ID - one of the VSOCK_OP_* values; e.g. + /// - VSOCK_OP_RW: a data packet; + /// - VSOCK_OP_REQUEST: connection request; + /// - VSOCK_OP_RST: forcefull connection termination; + /// etc (see `super::defs::uapi` for the full list). + pub op: u16, + /// Additional options (flags) associated with the current operation (`op`). + /// Currently, only used with shutdown requests (VSOCK_OP_SHUTDOWN). + pub flags: u32, + /// Size (in bytes) of the packet sender receive buffer (for the connection + /// to which this packet belongs). + pub buf_alloc: u32, + /// Number of bytes the sender has received and consumed (for the connection + /// to which this packet belongs). For instance, for our Unix backend, this + /// counter would be the total number of bytes we have successfully written + /// to a backing Unix socket. + pub fwd_cnt: u32, +} + +/// The size (in bytes) of the above packet header struct, as present in a +/// virtio queue buffer. See the explanation above on why we are hard-coding +/// this value here. +pub const VSOCK_PKT_HDR_SIZE: usize = 44; + +/// A thin wrapper over a `VsockPacketHdr` pointer. This is useful because +/// packet headers are provided by the guest via virtio descriptors (so, +/// basically, pointers). We never need to create header structs - only access +/// them. Access to specific members of the wrapped struct is provided via +/// `Deref` and `DerefMut` impls. +pub struct HdrWrapper { + ptr: *mut VsockPacketHdr, +} + +impl HdrWrapper { + /// Create the wrapper from a virtio queue descriptor (a pointer), performing some sanity checks + /// in the process. 
+ pub fn from_virtq_desc(desc: &Descriptor, mem: &M) -> Result { + if desc.len() < VSOCK_PKT_HDR_SIZE as u32 { + return Err(VsockError::HdrDescTooSmall(desc.len())); + } + // TODO: check buffer alignment + + mem.checked_offset(desc.addr(), VSOCK_PKT_HDR_SIZE) + .ok_or_else(|| VsockError::GuestMemoryBounds(desc.addr().0, VSOCK_PKT_HDR_SIZE))?; + + // It's safe to create the wrapper from this pointer, as: + // - the guest driver aligned the data; and + // - `GuestMemory` is page-aligned. + Ok(Self::from_ptr_unchecked( + mem.get_host_address(desc.addr()) + .map_err(VsockError::GuestMemory)?, + )) + } + + /// Create the wrapper from a raw pointer. + /// + /// Warning: the pointer needs to follow proper alignment for + /// `VsockPacketHdr`. This is not a problem for virtq buffers, since the + /// guest driver already handled alignment, and `GuestMemory` is + /// page-aligned. + fn from_ptr_unchecked(ptr: *const u8) -> Self { + #[allow(clippy::cast_ptr_alignment)] + Self { + ptr: ptr as *mut VsockPacketHdr, + } + } + + /// Provide byte-wise access to the data stored inside the header, via a + /// slice / fat-pointer. + pub fn as_slice(&self) -> &[u8] { + // This is safe, since `Self::from_virtq_head()` already performed all the bound checks. + // + unsafe { std::slice::from_raw_parts(self.ptr as *const u8, VSOCK_PKT_HDR_SIZE) } + } + + /// Provide byte-wise mutable access to the data stored inside the header, + /// via a slice / fat-pointer. + pub fn as_mut_slice(&mut self) -> &mut [u8] { + // This is safe, since `Self::from_virtq_head()` already performed all + // the bound checks. + unsafe { std::slice::from_raw_parts_mut(self.ptr as *mut u8, VSOCK_PKT_HDR_SIZE) } + } +} + +/// `Deref` implementation for `HdrWrapper`, allowing access to `VsockPacketHdr` +/// individual members. 
+impl Deref for HdrWrapper { + type Target = VsockPacketHdr; + + fn deref(&self) -> &VsockPacketHdr { + // Dereferencing this pointer is safe, because it was already validated + // by the `HdrWrapper` constructor. + unsafe { &*self.ptr } + } +} + +/// `DerefMut` implementation for `HdrWrapper`, allowing mutable access to +/// `VsockPacketHdr` individual members. +impl DerefMut for HdrWrapper { + fn deref_mut(&mut self) -> &mut VsockPacketHdr { + // Dereferencing this pointer is safe, because it was already validated + // by the `HdrWrapper` constructor. + unsafe { &mut *self.ptr } + } +} + +/// A thin wrapper over a vsock data pointer in guest memory. The wrapper is +/// meant to be constructed from a guest-provided virtq descriptor, and provides +/// byte-slice-like access. +pub struct BufWrapper { + ptr: *mut u8, + len: usize, +} + +impl BufWrapper { + /// Create the data wrapper from a virtq descriptor. + pub fn from_virtq_desc(desc: &Descriptor, mem: &M) -> Result { + // Check the guest provided pointer and data size. + mem.checked_offset(desc.addr(), desc.len() as usize) + .ok_or_else(|| VsockError::GuestMemoryBounds(desc.addr().0, desc.len() as usize))?; + + Ok(Self::from_fat_ptr_unchecked( + mem.get_host_address(desc.addr()) + .map_err(VsockError::GuestMemory)?, + desc.len() as usize, + )) + } + + /// Create the data wrapper from a pointer and size. + /// + /// Warning: Both `ptr` and `len` must be insured as valid by the caller. + fn from_fat_ptr_unchecked(ptr: *const u8, len: usize) -> Self { + Self { + ptr: ptr as *mut u8, + len, + } + } + + /// Provide access to the data buffer, as a byte slice. + pub fn as_slice(&self) -> &[u8] { + // This is safe since bound checks have already been performed when + // creating the buffer from the virtq descriptor. + unsafe { std::slice::from_raw_parts(self.ptr as *const u8, self.len) } + } + + /// Provide mutable access to the data buffer, as a byte slice. 
+ pub fn as_mut_slice(&mut self) -> &mut [u8] { + // This is safe since bound checks have already been performed when + // creating the buffer from the virtq descriptor. + unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) } + } +} + +/// The vsock packet, implemented as a wrapper over a virtq descriptor chain: +/// - the chain head, holding the packet header; and +/// - (an optional) data/buffer descriptor, only present for data packets +/// (VSOCK_OP_RW). +pub struct VsockPacket { + hdr: HdrWrapper, + buf: Option, +} + +impl VsockPacket { + /// Create the packet wrapper from a TX virtq chain head. + /// + /// The chain head is expected to hold valid packet header data. A following + /// packet buffer descriptor can optionally end the chain. Bounds and + /// pointer checks are performed when creating the wrapper. + pub fn from_tx_virtq_head( + desc_chain: &mut DescriptorChain<&M>, + ) -> Result { + let desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; + + // All buffers in the TX queue must be readable. + if desc.is_write_only() { + return Err(VsockError::UnreadableDescriptor); + } + + let hdr = HdrWrapper::from_virtq_desc(&desc, desc_chain.memory())?; + + // Reject weirdly-sized packets. + if hdr.len > defs::MAX_PKT_BUF_SIZE as u32 { + return Err(VsockError::InvalidPktLen(hdr.len)); + } + + // Don't bother to look for the data descriptor, if the header says + // there's no data. + if hdr.len == 0 { + return Ok(Self { hdr, buf: None }); + } + + let buf_desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; + + // All TX buffers must be readable. + if buf_desc.is_write_only() { + return Err(VsockError::UnreadableDescriptor); + } + + // The data descriptor should be large enough to hold the data length + // indicated by the header. 
+ if buf_desc.len() < hdr.len { + return Err(VsockError::BufDescTooSmall); + } + + Ok(Self { + hdr, + buf: Some(BufWrapper::from_virtq_desc(&buf_desc, desc_chain.memory())?), + }) + } + + /// Create the packet wrapper from an RX virtq chain head. + /// + /// There must be two descriptors in the chain, both writable: a header + /// descriptor and a data descriptor. Bounds and pointer checks are + /// performed when creating the wrapper. + pub fn from_rx_virtq_head( + desc_chain: &mut DescriptorChain<&M>, + ) -> Result { + let desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; + + // All RX buffers must be writable. + if !desc.is_write_only() { + return Err(VsockError::UnwritableDescriptor); + } + + let hdr = HdrWrapper::from_virtq_desc(&desc, desc_chain.memory())?; + + let buf_desc = desc_chain.next().ok_or(VsockError::BufDescMissing)?; + if !buf_desc.is_write_only() { + return Err(VsockError::UnwritableDescriptor); + } + + Ok(Self { + hdr, + buf: Some(BufWrapper::from_virtq_desc(&buf_desc, desc_chain.memory())?), + }) + } + + /// Provides in-place, byte-slice, access to the vsock packet header. + pub fn hdr(&self) -> &[u8] { + self.hdr.as_slice() + } + + /// Provides in-place, byte-slice, mutable access to the vsock packet + /// header. + pub fn hdr_mut(&mut self) -> &mut [u8] { + self.hdr.as_mut_slice() + } + + // Provides in-place, byte-slice access to the vsock packet data buffer. + /// + /// Note: control packets (e.g. connection request or reset) have no data + /// buffer associated. For those packets, this method will return + /// `None`. Also note: calling `len()` on the returned slice will + /// yield the buffer size, which may be (and often is) larger than the + /// length of the packet data. The packet data length is stored in the + /// packet header, and accessible via `VsockPacket::len()`. 
+ pub fn buf(&self) -> Option<&[u8]> { + self.buf.as_ref().map(|buf| buf.as_slice()) + } + + /// Provides in-place, byte-slice, mutable access to the vsock packet data + /// buffer. + /// + /// Note: control packets (e.g. connection request or reset) have no data + /// buffer associated. For those packets, this method will return + /// `None`. Also note: calling `len()` on the returned slice will + /// yield the buffer size, which may be (and often is) larger than the + /// length of the packet data. The packet data length is stored in the + /// packet header, and accessible via `VsockPacket::len()`. + pub fn buf_mut(&mut self) -> Option<&mut [u8]> { + self.buf.as_mut().map(|buf| buf.as_mut_slice()) + } + + pub fn src_cid(&self) -> u64 { + self.hdr.src_cid + } + + pub fn set_src_cid(&mut self, cid: u64) -> &mut Self { + self.hdr.src_cid = cid; + self + } + + pub fn dst_cid(&self) -> u64 { + self.hdr.dst_cid + } + + pub fn set_dst_cid(&mut self, cid: u64) -> &mut Self { + self.hdr.dst_cid = cid; + self + } + + pub fn src_port(&self) -> u32 { + self.hdr.src_port + } + + pub fn set_src_port(&mut self, port: u32) -> &mut Self { + self.hdr.src_port = port; + self + } + + pub fn dst_port(&self) -> u32 { + self.hdr.dst_port + } + + pub fn set_dst_port(&mut self, port: u32) -> &mut Self { + self.hdr.dst_port = port; + self + } + + pub fn len(&self) -> u32 { + self.hdr.len + } + + pub fn set_len(&mut self, len: u32) -> &mut Self { + self.hdr.len = len; + self + } + + pub fn type_(&self) -> u16 { + self.hdr.type_ + } + + pub fn set_type(&mut self, type_: u16) -> &mut Self { + self.hdr.type_ = type_; + self + } + + pub fn op(&self) -> u16 { + self.hdr.op + } + + pub fn set_op(&mut self, op: u16) -> &mut Self { + self.hdr.op = op; + self + } + + pub fn flags(&self) -> u32 { + self.hdr.flags + } + + pub fn set_flags(&mut self, flags: u32) -> &mut Self { + self.hdr.flags = flags; + self + } + + pub fn set_flag(&mut self, flag: u32) -> &mut Self { + self.set_flags(self.flags() | 
flag); + self + } + + pub fn buf_alloc(&self) -> u32 { + self.hdr.buf_alloc + } + + pub fn set_buf_alloc(&mut self, buf_alloc: u32) -> &mut Self { + self.hdr.buf_alloc = buf_alloc; + self + } + + pub fn fwd_cnt(&self) -> u32 { + self.hdr.fwd_cnt + } + + pub fn set_fwd_cnt(&mut self, fwd_cnt: u32) -> &mut Self { + self.hdr.fwd_cnt = fwd_cnt; + self + } +} + +#[cfg(test)] +mod tests { + use virtio_queue::QueueT; + use vm_memory::{GuestAddress, GuestMemoryMmap}; + + use super::super::defs::MAX_PKT_BUF_SIZE; + use super::super::tests::{test_bytes, TestContext}; + use super::defs::{RXQ_EVENT, TXQ_EVENT}; + use super::*; + use crate::tests::{VirtqDesc as GuestQDesc, VIRTQ_DESC_F_WRITE}; + + const HDROFF_SRC_CID: usize = 0; + const HDROFF_DST_CID: usize = 8; + const HDROFF_SRC_PORT: usize = 16; + const HDROFF_DST_PORT: usize = 20; + const HDROFF_LEN: usize = 24; + const HDROFF_TYPE: usize = 28; + const HDROFF_OP: usize = 30; + const HDROFF_FLAGS: usize = 32; + const HDROFF_BUF_ALLOC: usize = 36; + const HDROFF_FWD_CNT: usize = 40; + + macro_rules! create_context { + ($test_ctx:ident, $handler_ctx:ident) => { + let $test_ctx = TestContext::new(); + let mut $handler_ctx = $test_ctx.create_event_handler_context(); + // For TX packets, hdr.len should be set to a valid value. + set_pkt_len(1024, &$handler_ctx.guest_txvq.dtable(0), &$test_ctx.mem); + }; + } + + macro_rules! 
expect_asm_error { + (tx, $test_ctx:expr, $handler_ctx:expr, $err:pat) => { + expect_asm_error!($test_ctx, $handler_ctx, $err, from_tx_virtq_head, TXQ_EVENT); + }; + (rx, $test_ctx:expr, $handler_ctx:expr, $err:pat) => { + expect_asm_error!($test_ctx, $handler_ctx, $err, from_rx_virtq_head, RXQ_EVENT); + }; + ($test_ctx:expr, $handler_ctx:expr, $err:pat, $ctor:ident, $vq_index:ident) => { + match VsockPacket::$ctor( + &mut $handler_ctx.queues[$vq_index as usize] + .queue_mut() + .pop_descriptor_chain(&$test_ctx.mem) + .unwrap(), + ) { + Err($err) => (), + Ok(_) => panic!("Packet assembly should've failed!"), + Err(other) => panic!("Packet assembly failed with: {:?}", other), + } + }; + } + + fn set_pkt_len(len: u32, guest_desc: &GuestQDesc, mem: &GuestMemoryMmap) { + let hdr_gpa = guest_desc.addr(); + let hdr_ptr = mem.get_host_address(GuestAddress(hdr_gpa.load())).unwrap(); + let len_ptr = unsafe { hdr_ptr.add(HDROFF_LEN) }; + + unsafe { std::slice::from_raw_parts_mut(len_ptr, 4).copy_from_slice(&len.to_le_bytes()) }; + } + + #[test] + #[allow(clippy::cognitive_complexity)] + fn test_tx_packet_assembly() { + // Test case: successful TX packet assembly. + { + create_context!(test_ctx, handler_ctx); + let pkt = VsockPacket::from_tx_virtq_head( + &mut handler_ctx.queues[TXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(), + ) + .unwrap(); + assert_eq!(pkt.hdr().len(), VSOCK_PKT_HDR_SIZE); + assert_eq!( + pkt.buf().unwrap().len(), + handler_ctx.guest_txvq.dtable(1).len().load() as usize + ); + } + + // Test case: error on write-only hdr descriptor. + { + create_context!(test_ctx, handler_ctx); + handler_ctx + .guest_txvq + .dtable(0) + .flags() + .store(VIRTQ_DESC_F_WRITE); + expect_asm_error!(tx, test_ctx, handler_ctx, VsockError::UnreadableDescriptor); + } + + // Test case: header descriptor has insufficient space to hold the packet header. 
+ { + create_context!(test_ctx, handler_ctx); + handler_ctx + .guest_txvq + .dtable(0) + .len() + .store(VSOCK_PKT_HDR_SIZE as u32 - 1); + expect_asm_error!(tx, test_ctx, handler_ctx, VsockError::HdrDescTooSmall(_)); + } + + // Test case: zero-length TX packet. + { + create_context!(test_ctx, handler_ctx); + set_pkt_len(0, &handler_ctx.guest_txvq.dtable(0), &test_ctx.mem); + let mut pkt = VsockPacket::from_tx_virtq_head( + &mut handler_ctx.queues[TXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(), + ) + .unwrap(); + assert!(pkt.buf().is_none()); + assert!(pkt.buf_mut().is_none()); + } + + // Test case: TX packet has more data than we can handle. + { + create_context!(test_ctx, handler_ctx); + set_pkt_len( + MAX_PKT_BUF_SIZE as u32 + 1, + &handler_ctx.guest_txvq.dtable(0), + &test_ctx.mem, + ); + expect_asm_error!(tx, test_ctx, handler_ctx, VsockError::InvalidPktLen(_)); + } + + // Test case: + // - packet header advertises some data length; and + // - the data descriptor is missing. + { + create_context!(test_ctx, handler_ctx); + set_pkt_len(1024, &handler_ctx.guest_txvq.dtable(0), &test_ctx.mem); + handler_ctx.guest_txvq.dtable(0).flags().store(0); + expect_asm_error!(tx, test_ctx, handler_ctx, VsockError::BufDescMissing); + } + + // Test case: error on write-only buf descriptor. + { + create_context!(test_ctx, handler_ctx); + handler_ctx + .guest_txvq + .dtable(1) + .flags() + .store(VIRTQ_DESC_F_WRITE); + expect_asm_error!(tx, test_ctx, handler_ctx, VsockError::UnreadableDescriptor); + } + + // Test case: the buffer descriptor cannot fit all the data advertised by the the + // packet header `len` field. 
+ { + create_context!(test_ctx, handler_ctx); + set_pkt_len(8 * 1024, &handler_ctx.guest_txvq.dtable(0), &test_ctx.mem); + handler_ctx.guest_txvq.dtable(1).len().store(4 * 1024); + expect_asm_error!(tx, test_ctx, handler_ctx, VsockError::BufDescTooSmall); + } + } + + #[test] + fn test_rx_packet_assembly() { + // Test case: successful RX packet assembly. + { + create_context!(test_ctx, handler_ctx); + let pkt = VsockPacket::from_rx_virtq_head( + &mut handler_ctx.queues[RXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(), + ) + .unwrap(); + assert_eq!(pkt.hdr().len(), VSOCK_PKT_HDR_SIZE); + assert_eq!( + pkt.buf().unwrap().len(), + handler_ctx.guest_rxvq.dtable(1).len().load() as usize + ); + } + + // Test case: read-only RX packet header. + { + create_context!(test_ctx, handler_ctx); + handler_ctx.guest_rxvq.dtable(0).flags().store(0); + expect_asm_error!(rx, test_ctx, handler_ctx, VsockError::UnwritableDescriptor); + } + + // Test case: RX descriptor head cannot fit the entire packet header. + { + create_context!(test_ctx, handler_ctx); + handler_ctx + .guest_rxvq + .dtable(0) + .len() + .store(VSOCK_PKT_HDR_SIZE as u32 - 1); + expect_asm_error!(rx, test_ctx, handler_ctx, VsockError::HdrDescTooSmall(_)); + } + + // Test case: RX descriptor chain is missing the packet buffer descriptor. 
+ { + create_context!(test_ctx, handler_ctx); + handler_ctx + .guest_rxvq + .dtable(0) + .flags() + .store(VIRTQ_DESC_F_WRITE); + expect_asm_error!(rx, test_ctx, handler_ctx, VsockError::BufDescMissing); + } + } + + #[test] + #[allow(clippy::cognitive_complexity)] + fn test_packet_hdr_accessors() { + const SRC_CID: u64 = 1; + const DST_CID: u64 = 2; + const SRC_PORT: u32 = 3; + const DST_PORT: u32 = 4; + const LEN: u32 = 5; + const TYPE: u16 = 6; + const OP: u16 = 7; + const FLAGS: u32 = 8; + const BUF_ALLOC: u32 = 9; + const FWD_CNT: u32 = 10; + + create_context!(test_ctx, handler_ctx); + let mut pkt = VsockPacket::from_rx_virtq_head( + &mut handler_ctx.queues[RXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(), + ) + .unwrap(); + + // Test field accessors. + pkt.set_src_cid(SRC_CID) + .set_dst_cid(DST_CID) + .set_src_port(SRC_PORT) + .set_dst_port(DST_PORT) + .set_len(LEN) + .set_type(TYPE) + .set_op(OP) + .set_flags(FLAGS) + .set_buf_alloc(BUF_ALLOC) + .set_fwd_cnt(FWD_CNT); + + assert_eq!(pkt.src_cid(), SRC_CID); + assert_eq!(pkt.dst_cid(), DST_CID); + assert_eq!(pkt.src_port(), SRC_PORT); + assert_eq!(pkt.dst_port(), DST_PORT); + assert_eq!(pkt.len(), LEN); + assert_eq!(pkt.type_(), TYPE); + assert_eq!(pkt.op(), OP); + assert_eq!(pkt.flags(), FLAGS); + assert_eq!(pkt.buf_alloc(), BUF_ALLOC); + assert_eq!(pkt.fwd_cnt(), FWD_CNT); + + // Test individual flag setting. + let flags = pkt.flags() | 0b1000; + pkt.set_flag(0b1000); + assert_eq!(pkt.flags(), flags); + + // Test packet header as-slice access. 
+ // + + assert_eq!(pkt.hdr().len(), VSOCK_PKT_HDR_SIZE); + + test_bytes(&SRC_CID.to_le_bytes(), &pkt.hdr()[HDROFF_SRC_CID..]); + test_bytes(&DST_CID.to_le_bytes(), &pkt.hdr()[HDROFF_DST_CID..]); + test_bytes(&SRC_PORT.to_le_bytes(), &pkt.hdr()[HDROFF_SRC_PORT..]); + test_bytes(&DST_PORT.to_le_bytes(), &pkt.hdr()[HDROFF_DST_PORT..]); + test_bytes(&LEN.to_le_bytes(), &pkt.hdr()[HDROFF_LEN..]); + test_bytes(&TYPE.to_le_bytes(), &pkt.hdr()[HDROFF_TYPE..]); + test_bytes(&OP.to_le_bytes(), &pkt.hdr()[HDROFF_OP..]); + test_bytes(&FLAGS.to_le_bytes(), &pkt.hdr()[HDROFF_FLAGS..]); + test_bytes(&BUF_ALLOC.to_le_bytes(), &pkt.hdr()[HDROFF_BUF_ALLOC..]); + test_bytes(&FWD_CNT.to_le_bytes(), &pkt.hdr()[HDROFF_FWD_CNT..]); + + assert_eq!(pkt.hdr_mut().len(), VSOCK_PKT_HDR_SIZE); + for b in pkt.hdr_mut() { + *b = 0; + } + assert_eq!(pkt.src_cid(), 0); + assert_eq!(pkt.dst_cid(), 0); + assert_eq!(pkt.src_port(), 0); + assert_eq!(pkt.dst_port(), 0); + assert_eq!(pkt.len(), 0); + assert_eq!(pkt.type_(), 0); + assert_eq!(pkt.op(), 0); + assert_eq!(pkt.flags(), 0); + assert_eq!(pkt.buf_alloc(), 0); + assert_eq!(pkt.fwd_cnt(), 0); + } + + #[test] + fn test_packet_buf() { + create_context!(test_ctx, handler_ctx); + let mut pkt = VsockPacket::from_rx_virtq_head( + &mut handler_ctx.queues[RXQ_EVENT as usize] + .queue_mut() + .pop_descriptor_chain(&test_ctx.mem) + .unwrap(), + ) + .unwrap(); + + assert_eq!( + pkt.buf().unwrap().len(), + handler_ctx.guest_rxvq.dtable(1).len().load() as usize + ); + assert_eq!( + pkt.buf_mut().unwrap().len(), + handler_ctx.guest_rxvq.dtable(1).len().load() as usize + ); + + for i in 0..pkt.buf().unwrap().len() { + pkt.buf_mut().unwrap()[i] = (i % 0x100) as u8; + assert_eq!(pkt.buf().unwrap()[i], (i % 0x100) as u8); + } + } +} diff --git a/src/dragonball/src/device_manager/mod.rs b/src/dragonball/src/device_manager/mod.rs index b007270c9..fefdf4f77 100644 --- a/src/dragonball/src/device_manager/mod.rs +++ b/src/dragonball/src/device_manager/mod.rs @@ -1240,7 
+1240,11 @@ mod tests { Some(vm.vm_config().clone()), vm.shared_info().clone(), ); + #[cfg(target_arch = "x86_64")] let guest_addr = GuestAddress(0x200000000000); + // TODO: #7290 - https://github.com/kata-containers/kata-containers/issues/7290 + #[cfg(target_arch = "aarch64")] + let guest_addr = GuestAddress(0xF800000000); let cache_len = 1024 * 1024 * 1024; let mmap_region = MmapRegion::build( diff --git a/src/dragonball/src/vm/mod.rs b/src/dragonball/src/vm/mod.rs index 323db0a86..15fb00a72 100644 --- a/src/dragonball/src/vm/mod.rs +++ b/src/dragonball/src/vm/mod.rs @@ -901,6 +901,8 @@ impl Vm { #[cfg(test)] pub mod tests { + #[cfg(target_arch = "aarch64")] + use dbs_boot::layout::GUEST_MEM_START; #[cfg(target_arch = "x86_64")] use kvm_ioctls::VcpuExit; use linux_loader::cmdline::Cmdline; @@ -964,7 +966,13 @@ pub mod tests { let vm_memory = vm.address_space.vm_memory().unwrap(); assert_eq!(vm_memory.num_regions(), 1); + #[cfg(target_arch = "x86_64")] assert_eq!(vm_memory.last_addr(), GuestAddress(0xffffff)); + #[cfg(target_arch = "aarch64")] + assert_eq!( + vm_memory.last_addr(), + GuestAddress(GUEST_MEM_START + 0xffffff) + ); // Reconfigure an already configured vm will be ignored and just return OK. 
let vm_config = VmConfigInfo { @@ -987,9 +995,18 @@ pub mod tests { assert!(vm.init_guest_memory().is_ok()); let vm_memory = vm.address_space.vm_memory().unwrap(); assert_eq!(vm_memory.num_regions(), 1); + #[cfg(target_arch = "x86_64")] assert_eq!(vm_memory.last_addr(), GuestAddress(0xffffff)); + #[cfg(target_arch = "aarch64")] + assert_eq!( + vm_memory.last_addr(), + GuestAddress(GUEST_MEM_START + 0xffffff) + ); + #[cfg(target_arch = "x86_64")] let obj_addr = GuestAddress(0xf0); + #[cfg(target_arch = "aarch64")] + let obj_addr = GuestAddress(GUEST_MEM_START + 0xf0); vm_memory.write_obj(67u8, obj_addr).unwrap(); let read_val: u8 = vm_memory.read_obj(obj_addr).unwrap(); assert_eq!(read_val, 67u8); @@ -1029,7 +1046,13 @@ pub mod tests { let vm_memory = vm.address_space.vm_memory().unwrap(); assert_eq!(vm_memory.num_regions(), 1); + #[cfg(target_arch = "x86_64")] assert_eq!(vm_memory.last_addr(), GuestAddress(0xffffff)); + #[cfg(target_arch = "aarch64")] + assert_eq!( + vm_memory.last_addr(), + GuestAddress(GUEST_MEM_START + 0xffffff) + ); let kernel_file = TempFile::new().unwrap(); let cmd_line = Cmdline::new(64); diff --git a/src/libs/Cargo.lock b/src/libs/Cargo.lock index 5da16917f..e8cd3e39a 100644 --- a/src/libs/Cargo.lock +++ b/src/libs/Cargo.lock @@ -34,6 +34,17 @@ dependencies = [ "syn", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.1.0" @@ -130,7 +141,7 @@ dependencies = [ "js-sys", "num-integer", "num-traits", - "time", + "time 0.1.43", "wasm-bindgen", "winapi", ] @@ -172,6 +183,27 @@ dependencies = [ "syn", ] +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" 
+dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "either" version = "1.6.1" @@ -472,6 +504,7 @@ dependencies = [ name = "kata-sys-util" version = "0.1.0" dependencies = [ + "anyhow", "byteorder", "cgroups-rs", "chrono", @@ -559,6 +592,7 @@ dependencies = [ "slog-async", "slog-json", "slog-scope", + "slog-term", "tempfile", ] @@ -681,6 +715,15 @@ dependencies = [ "libc", ] +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc", +] + [[package]] name = "oci" version = "0.1.0" @@ -950,13 +993,24 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] +[[package]] +name = "redox_users" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +dependencies = [ + "getrandom", + "redox_syscall", + "thiserror", +] + [[package]] name = "regex" version = "1.6.0" @@ -983,6 +1037,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "rustversion" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" + [[package]] name = "ryu" version = "1.0.9" @@ -1113,6 +1173,19 @@ dependencies = [ "slog", ] +[[package]] +name = "slog-term" 
+version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87d29185c55b7b258b4f120eab00f48557d4d9bc814f41713f449d35b0f8977c" +dependencies = [ + "atty", + "slog", + "term", + "thread_local", + "time 0.3.22", +] + [[package]] name = "smallvec" version = "1.8.0" @@ -1170,6 +1243,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + [[package]] name = "test-utils" version = "0.1.0" @@ -1216,6 +1300,35 @@ dependencies = [ "winapi", ] +[[package]] +name = "time" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd" +dependencies = [ + "itoa", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +dependencies = [ + "time-core", +] + [[package]] name = "tokio" version = "1.17.0" diff --git a/src/libs/kata-sys-util/Cargo.toml b/src/libs/kata-sys-util/Cargo.toml index 40381b4d0..0d6cff91f 100644 --- a/src/libs/kata-sys-util/Cargo.toml +++ b/src/libs/kata-sys-util/Cargo.toml @@ -11,6 +11,7 @@ license = "Apache-2.0" edition = "2018" [dependencies] +anyhow = "1.0.31" byteorder = "1.4.3" cgroups = { package = "cgroups-rs", version = "0.3.2" } chrono = "0.4.0" diff --git a/src/libs/kata-sys-util/src/cpu.rs b/src/libs/kata-sys-util/src/cpu.rs new file mode 100644 index 
000000000..97bc2fd94 --- /dev/null +++ b/src/libs/kata-sys-util/src/cpu.rs @@ -0,0 +1,414 @@ +// Copyright (c) 2022 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use anyhow::{anyhow, Result}; + +#[cfg(target_arch = "s390x")] +use std::collections::HashMap; +#[cfg(target_arch = "s390x")] +use std::io::BufRead; +#[cfg(target_arch = "s390x")] +use std::io::BufReader; + +#[allow(dead_code)] +const ERR_NO_CPUINFO: &str = "cpu_info string is empty"; + +pub const PROC_CPUINFO: &str = "/proc/cpuinfo"; + +#[cfg(target_arch = "x86_64")] +pub const CPUINFO_DELIMITER: &str = "\nprocessor"; +#[cfg(target_arch = "x86_64")] +pub const CPUINFO_FLAGS_TAG: &str = "flags"; + +fn read_file_contents(file_path: &str) -> Result { + let contents = std::fs::read_to_string(file_path)?; + Ok(contents) +} + +// get_single_cpu_info returns the contents of the first cpu from +// the specified cpuinfo file by parsing based on a specified delimiter +pub fn get_single_cpu_info(cpu_info_file: &str, substring: &str) -> Result { + let contents = read_file_contents(cpu_info_file)?; + + if contents.is_empty() { + return Err(anyhow!(ERR_NO_CPUINFO)); + } + + let subcontents: Vec<&str> = contents.split(substring).collect(); + let result = subcontents + .first() + .ok_or("error splitting contents of cpuinfo") + .map_err(|e| anyhow!(e))? 
+ .to_string(); + Ok(result) +} + +// get_cpu_flags returns a string of cpu flags from cpuinfo, passed in +// as a string +#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] +pub fn get_cpu_flags(cpu_info: &str, cpu_flags_tag: &str) -> Result { + if cpu_info.is_empty() { + return Err(anyhow!(ERR_NO_CPUINFO)); + } + + if cpu_flags_tag.is_empty() { + return Err(anyhow!("cpu flags delimiter string is empty"))?; + } + + get_cpu_flags_from_file(cpu_info, cpu_flags_tag) +} + +// get a list of cpu flags in cpu_info_flags +// +// cpu_info is the content of cpuinfo file passed in as a string +// returns empty Vec if no flags are found +#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] +pub fn get_cpu_flags_vec(cpu_info: &str, cpu_flags_tag: &str) -> Result> { + if cpu_info.is_empty() { + return Err(anyhow!(ERR_NO_CPUINFO)); + } + + if cpu_flags_tag.is_empty() { + return Err(anyhow!("cpu flags delimiter string is empty"))?; + } + + let flags = get_cpu_flags_from_file(cpu_info, cpu_flags_tag)?; + + // take each flag, trim whitespace, convert to String, and add to list + // skip the first token in the iterator since it is empty + let flags_vec: Vec = flags + .split(' ') + .skip(1) + .map(|f| f.trim().to_string()) + .collect::>(); + + Ok(flags_vec) +} + +// check if the given flag exists in the given flags_vec +// +// flags_vec can be created by calling get_cpu_flags_vec +#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] +pub fn contains_cpu_flag(flags_vec: &[String], flag: &str) -> Result { + if flag.is_empty() { + return Err(anyhow!("parameter specifying flag to look for is empty"))?; + } + + Ok(flags_vec.iter().any(|f| f == flag)) +} + +// get a String containing the cpu flags in cpu_info +// +// this function returns the list of flags as a single String +// if no flags are found, returns an empty String +#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] +fn get_cpu_flags_from_file(cpu_info: &str, cpu_flags_tag: &str) -> Result { + let 
subcontents: Vec<&str> = cpu_info.split('\n').collect(); + for line in subcontents { + if line.starts_with(cpu_flags_tag) { + let line_data: Vec<&str> = line.split(':').collect(); + let flags = line_data + .last() + .ok_or("error splitting flags in cpuinfo") + .map_err(|e| anyhow!(e))? + .to_string(); + return Ok(flags); + } + } + + Ok("".to_string()) +} + +#[cfg(target_arch = "s390x")] +pub fn retrieve_cpu_facilities() -> Result> { + let f = std::fs::File::open(PROC_CPUINFO)?; + let mut reader = BufReader::new(f); + let mut contents = String::new(); + let facilities_field = "facilities"; + let mut facilities = HashMap::new(); + + while reader.read_line(&mut contents)? > 0 { + let fields: Vec<&str> = contents.split_whitespace().collect(); + if fields.len() < 2 { + contents.clear(); + continue; + } + + if !fields[0].starts_with(facilities_field) { + contents.clear(); + continue; + } + + let mut start = 1; + if fields[1] == ":" { + start = 2; + } + + for field in fields.iter().skip(start) { + let bit = field.parse::()?; + facilities.insert(bit, true); + } + return Ok(facilities); + } + + Ok(facilities) +} + +#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::io::Write; + use tempfile::tempdir; + + #[test] + fn test_get_single_cpu_info() { + // Valid cpuinfo example + let dir = tempdir().unwrap(); + let file_path_full = dir.path().join("cpuinfo_full"); + let path_full = file_path_full.clone(); + let mut file_full = fs::File::create(file_path_full).unwrap(); + let contents = "processor : 0\nvendor_id : VendorExample\nflags : flag_1 flag_2 flag_3 flag_4\nprocessor : 1\n".to_string(); + writeln!(file_full, "{}", contents).unwrap(); + + // Empty cpuinfo example + let file_path_empty = dir.path().join("cpuinfo_empty"); + let path_empty = file_path_empty.clone(); + let mut _file_empty = fs::File::create(file_path_empty).unwrap(); + + #[derive(Debug)] + struct TestData<'a> { + cpuinfo_path: &'a str, 
+ processor_delimiter_str: &'a str, + result: Result, + } + let tests = &[ + // Failure scenarios + TestData { + cpuinfo_path: "", + processor_delimiter_str: "", + result: Err(anyhow!("No such file or directory (os error 2)")), + }, + TestData { + cpuinfo_path: &path_empty.as_path().display().to_string(), + processor_delimiter_str: "\nprocessor", + result: Err(anyhow!(ERR_NO_CPUINFO)), + }, + // Success scenarios + TestData { + cpuinfo_path: &path_full.as_path().display().to_string(), + processor_delimiter_str: "\nprocessor", + result: Ok( + "processor : 0\nvendor_id : VendorExample\nflags : flag_1 flag_2 flag_3 flag_4" + .to_string(), + ), + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + let result = get_single_cpu_info(d.cpuinfo_path, d.processor_delimiter_str); + let msg = format!("{}, result: {:?}", msg, result); + + if d.result.is_ok() { + assert_eq!( + result.as_ref().unwrap(), + d.result.as_ref().unwrap(), + "{}", + msg + ); + continue; + } + + let expected_error = format!("{}", d.result.as_ref().unwrap_err()); + let actual_error = format!("{}", result.unwrap_err()); + assert!(actual_error == expected_error, "{}", msg); + } + } + + #[test] + fn test_get_cpu_flags() { + let contents = "processor : 0\nvendor_id : VendorExample\nflags : flag_1 flag_2 flag_3 flag_4\nprocessor : 1\n"; + + #[derive(Debug)] + struct TestData<'a> { + cpu_info_str: &'a str, + cpu_flags_tag: &'a str, + result: Result, + } + let tests = &[ + // Failure scenarios + TestData { + cpu_info_str: "", + cpu_flags_tag: "", + result: Err(anyhow!(ERR_NO_CPUINFO)), + }, + TestData { + cpu_info_str: "", + cpu_flags_tag: "flags", + result: Err(anyhow!(ERR_NO_CPUINFO)), + }, + TestData { + cpu_info_str: contents, + cpu_flags_tag: "", + result: Err(anyhow!("cpu flags delimiter string is empty")), + }, + // Success scenarios + TestData { + cpu_info_str: contents, + cpu_flags_tag: "flags", + result: Ok(" flag_1 flag_2 flag_3 flag_4".to_string()), + 
}, + TestData { + cpu_info_str: contents, + cpu_flags_tag: "flags_err", + result: Ok("".to_string()), + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + let result = get_cpu_flags(d.cpu_info_str, d.cpu_flags_tag); + let msg = format!("{}, result: {:?}", msg, result); + + if d.result.is_ok() { + assert_eq!( + result.as_ref().unwrap(), + d.result.as_ref().unwrap(), + "{}", + msg + ); + continue; + } + + let expected_error = format!("{}", d.result.as_ref().unwrap_err()); + let actual_error = format!("{}", result.unwrap_err()); + assert!(actual_error == expected_error, "{}", msg); + } + } + + #[test] + fn test_get_cpu_flags_vec() { + let contents = "processor : 0\nvendor_id : VendorExample\nflags : flag_1 flag_2 flag_3 flag_4\nprocessor : 1\n"; + + #[derive(Debug)] + struct TestData<'a> { + cpu_info_str: &'a str, + cpu_flags_tag: &'a str, + result: Result>, + } + let tests = &[ + // Failure scenarios + TestData { + cpu_info_str: "", + cpu_flags_tag: "", + result: Err(anyhow!(ERR_NO_CPUINFO)), + }, + TestData { + cpu_info_str: "", + cpu_flags_tag: "flags", + result: Err(anyhow!(ERR_NO_CPUINFO)), + }, + TestData { + cpu_info_str: contents, + cpu_flags_tag: "", + result: Err(anyhow!("cpu flags delimiter string is empty")), + }, + // Success scenarios + TestData { + cpu_info_str: contents, + cpu_flags_tag: "flags", + result: Ok(vec![ + "flag_1".to_string(), + "flag_2".to_string(), + "flag_3".to_string(), + "flag_4".to_string(), + ]), + }, + TestData { + cpu_info_str: contents, + cpu_flags_tag: "flags_err", + result: Ok(Vec::new()), + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + let result = get_cpu_flags_vec(d.cpu_info_str, d.cpu_flags_tag); + let msg = format!("{}, result: {:?}", msg, result); + + if d.result.is_ok() { + assert_eq!( + result.as_ref().unwrap(), + d.result.as_ref().unwrap(), + "{}", + msg + ); + continue; + } + + let expected_error = format!("{}", 
d.result.as_ref().unwrap_err()); + let actual_error = format!("{}", result.unwrap_err()); + assert!(actual_error == expected_error, "{}", msg); + } + } + + #[test] + fn test_contains_cpu_flag() { + let flags_vec = vec![ + "flag_1".to_string(), + "flag_2".to_string(), + "flag_3".to_string(), + "flag_4".to_string(), + ]; + + #[derive(Debug)] + struct TestData<'a> { + cpu_flags_vec: &'a Vec, + cpu_flag: &'a str, + result: Result, + } + let tests = &[ + // Failure scenarios + TestData { + cpu_flags_vec: &flags_vec, + cpu_flag: "flag_5", + result: Ok(false), + }, + TestData { + cpu_flags_vec: &flags_vec, + cpu_flag: "", + result: Err(anyhow!("parameter specifying flag to look for is empty")), + }, + // Success scenarios + TestData { + cpu_flags_vec: &flags_vec, + cpu_flag: "flag_1", + result: Ok(true), + }, + ]; + + for (i, d) in tests.iter().enumerate() { + let msg = format!("test[{}]: {:?}", i, d); + let result = contains_cpu_flag(d.cpu_flags_vec, d.cpu_flag); + let msg = format!("{}, result: {:?}", msg, result); + + if d.result.is_ok() { + assert_eq!( + result.as_ref().unwrap(), + d.result.as_ref().unwrap(), + "{}", + msg + ); + continue; + } + + let expected_error = format!("{}", d.result.as_ref().unwrap_err()); + let actual_error = format!("{}", result.unwrap_err()); + assert!(actual_error == expected_error, "{}", msg); + } + } +} diff --git a/src/libs/kata-sys-util/src/lib.rs b/src/libs/kata-sys-util/src/lib.rs index 2c90adb7c..531883a39 100644 --- a/src/libs/kata-sys-util/src/lib.rs +++ b/src/libs/kata-sys-util/src/lib.rs @@ -6,16 +6,22 @@ #[macro_use] extern crate slog; +pub mod cpu; pub mod device; pub mod fs; pub mod hooks; pub mod k8s; pub mod mount; pub mod numa; +pub mod protection; pub mod rand; pub mod spec; pub mod validate; +use anyhow::Result; +use std::io::BufRead; +use std::io::BufReader; + // Convenience macro to obtain the scoped logger #[macro_export] macro_rules! sl { @@ -31,3 +37,41 @@ macro_rules! 
eother { std::io::Error::new(std::io::ErrorKind::Other, format!($fmt, $($arg)*)) }) } + +pub fn check_kernel_cmd_line( + kernel_cmdline_path: &str, + search_param: &str, + search_values: &[&str], +) -> Result { + let f = std::fs::File::open(kernel_cmdline_path)?; + let reader = BufReader::new(f); + + let check_fn = if search_values.is_empty() { + |param: &str, search_param: &str, _search_values: &[&str]| { + param.eq_ignore_ascii_case(search_param) + } + } else { + |param: &str, search_param: &str, search_values: &[&str]| { + let split: Vec<&str> = param.splitn(2, '=').collect(); + if split.len() < 2 || split[0] != search_param { + return false; + } + + for value in search_values { + if value.eq_ignore_ascii_case(split[1]) { + return true; + } + } + false + } + }; + + for line in reader.lines() { + for field in line?.split_whitespace() { + if check_fn(field, search_param, search_values) { + return Ok(true); + } + } + } + Ok(false) +} diff --git a/src/libs/kata-sys-util/src/mount.rs b/src/libs/kata-sys-util/src/mount.rs index 3c6f5f261..522ce3acb 100644 --- a/src/libs/kata-sys-util/src/mount.rs +++ b/src/libs/kata-sys-util/src/mount.rs @@ -103,6 +103,8 @@ pub enum Error { MountOptionTooBig, #[error("Path for mountpoint is null")] NullMountPointPath, + #[error("Invalid Propagation type Flag")] + InvalidPgMountFlag, #[error("Faile to open file {0} by path, {1}")] OpenByPath(PathBuf, io::Error), #[error("Can not read metadata of {0}, {1}")] @@ -227,7 +229,13 @@ pub fn bind_remount>(dst: P, readonly: bool) -> Result<()> { do_rebind_mount(dst, readonly, MsFlags::empty()) } -/// Bind mount `src` to `dst` in slave mode, optionally in readonly mode if `readonly` is true. +/// Bind mount `src` to `dst` with a custom propagation type, optionally in readonly mode if +/// `readonly` is true. +/// +/// Propagation type: MsFlags::MS_SHARED or MsFlags::MS_SLAVE +/// MsFlags::MS_SHARED is used to bind mount the sandbox path to enable `exec` (in case of FC +/// jailer). 
+/// MsFlags::MS_SLAVE is used on all other cases. /// /// # Safety /// Caller needs to ensure: @@ -238,6 +246,7 @@ pub fn bind_mount_unchecked, D: AsRef>( src: S, dst: D, readonly: bool, + pgflag: MsFlags, ) -> Result<()> { fail::fail_point!("bind_mount", |_| { Err(Error::FailureInject( @@ -268,8 +277,11 @@ pub fn bind_mount_unchecked, D: AsRef>( ) .map_err(|e| Error::BindMount(abs_src, dst.to_path_buf(), e))?; - // Change into slave propagation mode. - mount(Some(""), dst, Some(""), MsFlags::MS_SLAVE, Some("")) + // Change into the chosen propagation mode. + if !(pgflag == MsFlags::MS_SHARED || pgflag == MsFlags::MS_SLAVE) { + return Err(Error::InvalidPgMountFlag); + } + mount(Some(""), dst, Some(""), pgflag, Some("")) .map_err(|e| Error::Mount(PathBuf::new(), dst.to_path_buf(), e))?; // Optionally rebind into readonly mode. @@ -828,7 +840,7 @@ mod tests { Err(Error::InvalidPath(_)) )); - bind_mount_unchecked(tmpdir2.path(), tmpdir.path(), true).unwrap(); + bind_mount_unchecked(tmpdir2.path(), tmpdir.path(), true, MsFlags::MS_SLAVE).unwrap(); bind_remount(tmpdir.path(), true).unwrap(); umount_timeout(tmpdir.path().to_str().unwrap(), 0).unwrap(); } @@ -844,25 +856,26 @@ mod tests { dst.push("src"); assert!(matches!( - bind_mount_unchecked(Path::new(""), Path::new(""), false), + bind_mount_unchecked(Path::new(""), Path::new(""), false, MsFlags::MS_SLAVE), Err(Error::NullMountPointPath) )); assert!(matches!( - bind_mount_unchecked(tmpdir2.path(), Path::new(""), false), + bind_mount_unchecked(tmpdir2.path(), Path::new(""), false, MsFlags::MS_SLAVE), Err(Error::NullMountPointPath) )); assert!(matches!( bind_mount_unchecked( Path::new("/_does_not_exist_/___aahhhh"), Path::new("/tmp/_does_not_exist/___bbb"), - false + false, + MsFlags::MS_SLAVE ), Err(Error::InvalidPath(_)) )); let dst = create_mount_destination(tmpdir2.path(), &dst, tmpdir.path(), "bind").unwrap(); - bind_mount_unchecked(tmpdir2.path(), dst.as_ref(), true).unwrap(); - bind_mount_unchecked(&src, 
dst.as_ref(), false).unwrap(); + bind_mount_unchecked(tmpdir2.path(), dst.as_ref(), true, MsFlags::MS_SLAVE).unwrap(); + bind_mount_unchecked(&src, dst.as_ref(), false, MsFlags::MS_SLAVE).unwrap(); umount_all(dst.as_ref(), false).unwrap(); let mut src = tmpdir.path().to_owned(); @@ -871,7 +884,7 @@ mod tests { let mut dst = tmpdir.path().to_owned(); dst.push("file"); let dst = create_mount_destination(&src, &dst, tmpdir.path(), "bind").unwrap(); - bind_mount_unchecked(&src, dst.as_ref(), false).unwrap(); + bind_mount_unchecked(&src, dst.as_ref(), false, MsFlags::MS_SLAVE).unwrap(); assert!(dst.as_ref().is_file()); umount_timeout(dst.as_ref(), 0).unwrap(); } diff --git a/src/libs/kata-sys-util/src/protection.rs b/src/libs/kata-sys-util/src/protection.rs new file mode 100644 index 000000000..02b356d33 --- /dev/null +++ b/src/libs/kata-sys-util/src/protection.rs @@ -0,0 +1,270 @@ +// Copyright (c) 2022 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +#[cfg(target_arch = "x86_64")] +use anyhow::anyhow; +#[cfg(any(target_arch = "s390x", target_arch = "x86_64", target_arch = "aarch64"))] +use anyhow::Result; +use std::fmt; +#[cfg(target_arch = "x86_64")] +use std::path::Path; +use thiserror::Error; + +#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] +use nix::unistd::Uid; + +#[cfg(target_arch = "x86_64")] +use std::fs; + +#[allow(dead_code)] +#[derive(Debug, PartialEq)] +pub enum GuestProtection { + NoProtection, + Tdx, + Sev, + Snp, + Pef, + Se, +} + +impl fmt::Display for GuestProtection { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + GuestProtection::Tdx => write!(f, "tdx"), + GuestProtection::Sev => write!(f, "sev"), + GuestProtection::Snp => write!(f, "snp"), + GuestProtection::Pef => write!(f, "pef"), + GuestProtection::Se => write!(f, "se"), + GuestProtection::NoProtection => write!(f, "none"), + } + } +} + +#[allow(dead_code)] +#[derive(Error, Debug)] +pub enum ProtectionError { + #[error("No permission to 
check guest protection")] + NoPerms, + + #[error("Failed to check guest protection: {0}")] + CheckFailed(String), + + #[error("Invalid guest protection value: {0}")] + InvalidValue(String), +} + +#[cfg(target_arch = "x86_64")] +pub const TDX_SYS_FIRMWARE_DIR: &str = "/sys/firmware/tdx_seam/"; +#[cfg(target_arch = "x86_64")] +pub const TDX_CPU_FLAG: &str = "tdx"; +#[cfg(target_arch = "x86_64")] +pub const SEV_KVM_PARAMETER_PATH: &str = "/sys/module/kvm_amd/parameters/sev"; +#[cfg(target_arch = "x86_64")] +pub const SNP_KVM_PARAMETER_PATH: &str = "/sys/module/kvm_amd/parameters/sev_snp"; + +#[cfg(target_arch = "x86_64")] +pub fn available_guest_protection() -> Result { + if !Uid::effective().is_root() { + return Err(ProtectionError::NoPerms); + } + + arch_guest_protection( + TDX_SYS_FIRMWARE_DIR, + TDX_CPU_FLAG, + SEV_KVM_PARAMETER_PATH, + SNP_KVM_PARAMETER_PATH, + ) +} + +#[cfg(target_arch = "x86_64")] +fn retrieve_cpu_flags() -> Result { + let cpu_info = + crate::cpu::get_single_cpu_info(crate::cpu::PROC_CPUINFO, crate::cpu::CPUINFO_DELIMITER)?; + + let cpu_flags = + crate::cpu::get_cpu_flags(&cpu_info, crate::cpu::CPUINFO_FLAGS_TAG).map_err(|e| { + anyhow!( + "Error parsing CPU flags, file {:?}, {:?}", + crate::cpu::PROC_CPUINFO, + e + ) + })?; + + Ok(cpu_flags) +} + +#[cfg(target_arch = "x86_64")] +pub fn arch_guest_protection( + tdx_path: &str, + tdx_flag: &str, + sev_path: &str, + snp_path: &str, +) -> Result { + let flags = + retrieve_cpu_flags().map_err(|err| ProtectionError::CheckFailed(err.to_string()))?; + + let metadata = fs::metadata(tdx_path); + + if metadata.is_ok() && metadata.unwrap().is_dir() && flags.contains(tdx_flag) { + return Ok(GuestProtection::Tdx); + } + + let check_contents = |file_name: &str| -> Result { + let file_path = Path::new(file_name); + if !file_path.exists() { + return Ok(false); + } + + let contents = fs::read_to_string(file_name).map_err(|err| { + ProtectionError::CheckFailed(format!("Error reading file {} : {}", file_name, 
err)) + })?; + + if contents.trim() == "Y" { + return Ok(true); + } + Ok(false) + }; + + if check_contents(snp_path)? { + return Ok(GuestProtection::Snp); + } + + if check_contents(sev_path)? { + return Ok(GuestProtection::Sev); + } + + Ok(GuestProtection::NoProtection) +} + +#[cfg(target_arch = "s390x")] +#[allow(dead_code)] +// Guest protection is not supported on ARM64. +pub fn available_guest_protection() -> Result { + if !Uid::effective().is_root() { + return Err(ProtectionError::NoPerms)?; + } + + let facilities = crate::cpu::retrieve_cpu_facilities().map_err(|err| { + ProtectionError::CheckFailed(format!( + "Error retrieving cpu facilities file : {}", + err.to_string() + )) + })?; + + // Secure Execution + // https://www.kernel.org/doc/html/latest/virt/kvm/s390-pv.html + let se_cpu_facility_bit: i32 = 158; + if !facilities.contains_key(&se_cpu_facility_bit) { + return Ok(GuestProtection::NoProtection); + } + + let cmd_line_values = vec!["1", "on", "y", "yes"]; + let se_cmdline_param = "prot_virt"; + + let se_cmdline_present = + crate::check_kernel_cmd_line("/proc/cmdline", se_cmdline_param, &cmd_line_values) + .map_err(|err| ProtectionError::CheckFailed(err.to_string()))?; + + if !se_cmdline_present { + return Err(ProtectionError::InvalidValue(String::from( + "Protected Virtualization is not enabled on kernel command line!", + ))); + } + + Ok(GuestProtection::Se) +} + +#[cfg(target_arch = "powerpc64le")] +pub fn available_guest_protection() -> Result { + if !Uid::effective().is_root() { + return Err(check::ProtectionError::NoPerms); + } + + let metadata = fs::metadata(PEF_SYS_FIRMWARE_DIR); + if metadata.is_ok() && metadata.unwrap().is_dir() { + Ok(check::GuestProtection::Pef) + } + + Ok(check::GuestProtection::NoProtection) +} + +#[cfg(target_arch = "aarch64")] +#[allow(dead_code)] +// Guest protection is not supported on ARM64. 
+pub fn available_guest_protection() -> Result { + Ok(GuestProtection::NoProtection) +} + +#[cfg(target_arch = "x86_64")] +#[cfg(test)] +mod tests { + use super::*; + use nix::unistd::Uid; + use std::fs; + use std::io::Write; + use tempfile::tempdir; + + #[test] + fn test_available_guest_protection_no_privileges() { + if !Uid::effective().is_root() { + let res = available_guest_protection(); + assert!(res.is_err()); + assert_eq!( + "No permission to check guest protection", + res.unwrap_err().to_string() + ); + } + } + + #[test] + fn test_arch_guest_protection_snp() { + // Test snp + let dir = tempdir().unwrap(); + let snp_file_path = dir.path().join("sev_snp"); + let path = snp_file_path.clone(); + let mut snp_file = fs::File::create(snp_file_path).unwrap(); + writeln!(snp_file, "Y").unwrap(); + + let actual = + arch_guest_protection("/xyz/tmp", TDX_CPU_FLAG, "/xyz/tmp", path.to_str().unwrap()); + assert!(actual.is_ok()); + assert_eq!(actual.unwrap(), GuestProtection::Snp); + + writeln!(snp_file, "N").unwrap(); + let actual = + arch_guest_protection("/xyz/tmp", TDX_CPU_FLAG, "/xyz/tmp", path.to_str().unwrap()); + assert!(actual.is_ok()); + assert_eq!(actual.unwrap(), GuestProtection::NoProtection); + } + + #[test] + fn test_arch_guest_protection_sev() { + // Test sev + let dir = tempdir().unwrap(); + let sev_file_path = dir.path().join("sev"); + let sev_path = sev_file_path.clone(); + let mut sev_file = fs::File::create(sev_file_path).unwrap(); + writeln!(sev_file, "Y").unwrap(); + + let actual = arch_guest_protection( + "/xyz/tmp", + TDX_CPU_FLAG, + sev_path.to_str().unwrap(), + "/xyz/tmp", + ); + assert!(actual.is_ok()); + assert_eq!(actual.unwrap(), GuestProtection::Sev); + + writeln!(sev_file, "N").unwrap(); + let actual = arch_guest_protection( + "/xyz/tmp", + TDX_CPU_FLAG, + sev_path.to_str().unwrap(), + "/xyz/tmp", + ); + assert!(actual.is_ok()); + assert_eq!(actual.unwrap(), GuestProtection::NoProtection); + } +} diff --git 
a/src/libs/kata-types/src/annotations/mod.rs b/src/libs/kata-types/src/annotations/mod.rs index f094ddd70..89da372de 100644 --- a/src/libs/kata-types/src/annotations/mod.rs +++ b/src/libs/kata-types/src/annotations/mod.rs @@ -474,8 +474,8 @@ impl Annotation { let u32_err = io::Error::new(io::ErrorKind::InvalidData, "parse u32 error".to_string()); let u64_err = io::Error::new(io::ErrorKind::InvalidData, "parse u64 error".to_string()); let i32_err = io::Error::new(io::ErrorKind::InvalidData, "parse i32 error".to_string()); - let mut hv = config.hypervisor.get_mut(hypervisor_name).unwrap(); - let mut ag = config.agent.get_mut(agent_name).unwrap(); + let hv = config.hypervisor.get_mut(hypervisor_name).unwrap(); + let ag = config.agent.get_mut(agent_name).unwrap(); for (key, value) in &self.annotations { if hv.security_info.is_annotation_enabled(key) { match key.as_str() { diff --git a/src/libs/kata-types/src/config/default.rs b/src/libs/kata-types/src/config/default.rs index 6b5495c5a..f55f597da 100644 --- a/src/libs/kata-types/src/config/default.rs +++ b/src/libs/kata-types/src/config/default.rs @@ -32,7 +32,7 @@ pub const DEFAULT_HYPERVISOR: &str = HYPERVISOR_NAME_DRAGONBALL; pub const DEFAULT_INTERNETWORKING_MODEL: &str = "tcfilter"; -pub const DEFAULT_BLOCK_DEVICE_TYPE: &str = "virtio-blk"; +pub const DEFAULT_BLOCK_DEVICE_TYPE: &str = "virtio-blk-pci"; pub const DEFAULT_VHOST_USER_STORE_PATH: &str = "/var/run/vhost-user"; pub const DEFAULT_BLOCK_NVDIMM_MEM_OFFSET: u64 = 0; diff --git a/src/libs/kata-types/src/config/hypervisor/mod.rs b/src/libs/kata-types/src/config/hypervisor/mod.rs index fedbba14b..9056a0dd0 100644 --- a/src/libs/kata-types/src/config/hypervisor/mod.rs +++ b/src/libs/kata-types/src/config/hypervisor/mod.rs @@ -47,7 +47,7 @@ const VIRTIO_BLK_PCI: &str = "virtio-blk-pci"; const VIRTIO_BLK_MMIO: &str = "virtio-blk-mmio"; const VIRTIO_BLK_CCW: &str = "virtio-blk-ccw"; const VIRTIO_SCSI: &str = "virtio-scsi"; -const VIRTIO_PMEM: &str = "nvdimm"; 
+const VIRTIO_PMEM: &str = "virtio-pmem"; const VIRTIO_9P: &str = "virtio-9p"; const VIRTIO_FS: &str = "virtio-fs"; const VIRTIO_FS_INLINE: &str = "inline-virtio-fs"; @@ -221,6 +221,10 @@ pub struct BootInfo { /// If you want that qemu uses the default firmware leave this option empty. #[serde(default)] pub firmware: String, + /// Block storage driver to be used for the VM rootfs is backed + /// by a block device. This is virtio-pmem, virtio-blk-pci or virtio-blk-mmio + #[serde(default)] + pub vm_rootfs_driver: String, } impl BootInfo { @@ -230,6 +234,11 @@ impl BootInfo { resolve_path!(self.image, "guest boot image file {} is invalid: {}")?; resolve_path!(self.initrd, "guest initrd image file {} is invalid: {}")?; resolve_path!(self.firmware, "firmware image file {} is invalid: {}")?; + + if self.vm_rootfs_driver.is_empty() { + self.vm_rootfs_driver = default::DEFAULT_BLOCK_DEVICE_TYPE.to_string(); + } + Ok(()) } @@ -242,6 +251,21 @@ impl BootInfo { if !self.image.is_empty() && !self.initrd.is_empty() { return Err(eother!("Can not configure both initrd and image for boot")); } + + let l = [ + VIRTIO_BLK_PCI, + VIRTIO_BLK_CCW, + VIRTIO_BLK_MMIO, + VIRTIO_PMEM, + VIRTIO_SCSI, + ]; + if !l.contains(&self.vm_rootfs_driver.as_str()) { + return Err(eother!( + "{} is unsupported block device type.", + self.vm_rootfs_driver + )); + } + Ok(()) } diff --git a/src/runtime-rs/Cargo.lock b/src/runtime-rs/Cargo.lock index 21128c5ec..04a709006 100644 --- a/src/runtime-rs/Cargo.lock +++ b/src/runtime-rs/Cargo.lock @@ -57,6 +57,7 @@ dependencies = [ "slog", "slog-scope", "tokio", + "tracing", "ttrpc", "url", ] @@ -67,20 +68,26 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" dependencies = [ - "getrandom 0.2.8", + "getrandom 0.2.10", "once_cell", "version_check", ] [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -92,9 +99,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.69" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" [[package]] name = "api_client" @@ -112,9 +119,9 @@ checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" [[package]] name = "arrayref" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" @@ -135,9 +142,9 @@ dependencies = [ [[package]] name = "async-executor" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17adb73da160dfb475c183343c8cccd80721ea5a605d3eb57125f0a7b7a92d0b" +checksum = "6fa3dc5f2a8564f07759c008b9109dc0d39de92a88d5588b8a5036d286383afb" dependencies = [ "async-lock", "async-task", @@ -164,32 +171,31 @@ dependencies = [ [[package]] name = "async-io" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c374dda1ed3e7d8f0d9ba58715f924862c63eae6849c92d3a18e7fbde9e2794" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" 
dependencies = [ "async-lock", "autocfg", + "cfg-if 1.0.0", "concurrent-queue", "futures-lite", - "libc", "log", "parking", "polling", + "rustix", "slab", "socket2", "waker-fn", - "windows-sys 0.42.0", ] [[package]] name = "async-lock" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8101efe8695a6c17e02911402145357e718ac92d3ff88ae8419e84b1707b685" +checksum = "fa24f727524730b077666307f2734b4a1a1c57acb79193127dcc8914d5242dd7" dependencies = [ "event-listener", - "futures-lite", ] [[package]] @@ -220,26 +226,26 @@ dependencies = [ [[package]] name = "async-task" -version = "4.3.0" +version = "4.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a40729d2133846d9ed0ea60a8b9541bccddab49cd30f0715a1da672fe9a2524" +checksum = "ecc7ab41815b3c653ccd2978ec3255c81349336702dfdf62ee6f7069b12a3aae" [[package]] name = "async-trait" -version = "0.1.64" +version = "0.1.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd7fce9ba8c3c042128ce72d8b2ddbf3a05747efb67ea0313c635e10bda47a2" +checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] name = "atomic-waker" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "debc29dde2e69f9e47506b525f639ed42300fc014a3e007832592448fa8e4599" +checksum = "1181e1e0d1fce796a03db1ae795d67167da795f9cf4a39c37589e85ef57f26d3" [[package]] name = "atty" @@ -274,7 +280,7 @@ dependencies = [ "cc", "cfg-if 1.0.0", "libc", - "miniz_oxide", + "miniz_oxide 0.6.2", "object", "rustc-demangle", ] @@ -303,16 +309,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.3.3" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" +checksum = 
"729b71f35bd3fa1a4c86b85d32c8b9069ea7fe14f7a53cfabb65f62d4265b888" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if 1.0.0", "constant_time_eq", - "digest 0.10.6", + "digest 0.10.7", ] [[package]] @@ -326,18 +332,18 @@ dependencies = [ [[package]] name = "block-buffer" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cce20737498f97b993470a6e536b8523f0af7892a4f928cceb1ac5e52ebe7e" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "blocking" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c67b173a56acffd6d2326fb7ab938ba0b00a71480e14902b2591c87bc5741e8" +checksum = "77231a1c8f801696fc0123ec6150ce92cffb8e164a02afb9c8ddee0e9b65ad65" dependencies = [ "async-channel", "async-lock", @@ -345,13 +351,14 @@ dependencies = [ "atomic-waker", "fastrand", "futures-lite", + "log", ] [[package]] name = "bumpalo" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byte-unit" @@ -361,9 +368,9 @@ checksum = "415301c9de11005d4b92193c0eb7ac7adc37e5a49e0ac9bed0a42343512744b8" [[package]] name = "byte-unit" -version = "4.0.18" +version = "4.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3348673602e04848647fffaa8e9a861e7b5d5cae6570727b41bde0f722514484" +checksum = "da78b32057b8fdfc352504708feeba7216dcd65a2c9ab02978cbd288d1279b6c" dependencies = [ "serde", "utf8-width", @@ -451,13 +458,13 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.23" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" dependencies = [ + "android-tzdata", "iana-time-zone", "js-sys", - "num-integer", "num-traits", "time 0.1.45", "wasm-bindgen", @@ -466,23 +473,13 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.49" +version = "0.1.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" dependencies = [ "cc", ] -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "common" version = "0.1.0" @@ -515,18 +512,18 @@ checksum = "2382f75942f4b3be3690fe4f86365e9c853c1587d6ee58212cebf6e2a9ccd101" [[package]] name = "concurrent-queue" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c278839b831783b70278b14df4d45e1beb1aad306c07bb796637de9a0e323e8e" +checksum = "62ec6771ecfa0762d24683ee5a32ad78487a3d3afdc0fb8cae19d2c5deb50b7c" dependencies = [ "crossbeam-utils", ] [[package]] name = "constant_time_eq" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3ad85c1f65dc7b37604eb0e89748faf0b9653065f2a8ef69f96a687ec1e9279" +checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" [[package]] name = "containerd-shim-protos" @@ -542,15 +539,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = 
"e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" dependencies = [ "libc", ] @@ -572,9 +569,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf2b3e8478797446514c91ef04bafcb59faba183e621ad488df88983cc14128c" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", @@ -600,64 +597,21 @@ dependencies = [ ] [[package]] -name = "ctor" -version = "0.1.26" +name = "dashmap" +version = "5.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" dependencies = [ - "quote", - "syn 1.0.109", -] - -[[package]] -name = "cxx" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" -dependencies = [ - "cc", - "codespan-reporting", + "cfg-if 1.0.0", + "hashbrown", + "lock_api", "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn 1.0.109", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.91" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", + "parking_lot_core 0.9.8", ] [[package]] name = "dbs-address-space" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95e20d28a9cd13bf00d0ecd1bd073d242242b04f0acb663d7adfc659f8879322" dependencies = [ "arc-swap", "lazy_static", @@ -671,8 +625,6 @@ dependencies = [ [[package]] name = "dbs-allocator" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "543711b94b4bc1437d2ebb45f856452e96a45a67ab39f8dcf8c887c2a3701004" dependencies = [ "thiserror", ] @@ -680,8 +632,6 @@ dependencies = [ [[package]] name = "dbs-arch" version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194c844946cd1d13f7a9eb29b84afbc5354578eee2b06fea96226bc3872e7424" dependencies = [ "kvm-bindings", "kvm-ioctls", @@ -695,8 +645,6 @@ dependencies = [ [[package]] name = "dbs-boot" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5466a92f75aa928a9103dcb2088f6d1638ef9da8945fad7389a73864dfa0182c" dependencies = [ "dbs-arch", "kvm-bindings", @@ -711,8 +659,6 @@ dependencies = [ [[package]] name = "dbs-device" version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14ecea44b4bc861c0c2ccb51868bea781286dc70e40ae46b54d4511e690a654a" dependencies = [ "thiserror", ] @@ -720,8 +666,6 @@ dependencies = [ [[package]] name = "dbs-interrupt" version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1eb2c5bb9f8f123ace33b1b2e8d53dd2d87331ee770ad1f82e56c3382c6bed6d" dependencies = [ "dbs-arch", "dbs-device", @@ -734,11 +678,10 @@ 
dependencies = [ [[package]] name = "dbs-legacy-devices" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4d089ac1c4d186c8133be59de09462e9793f7add10017c5b040318a3a7f431f" dependencies = [ "dbs-device", "dbs-utils", + "libc", "log", "serde", "vm-superio", @@ -748,8 +691,6 @@ dependencies = [ [[package]] name = "dbs-upcall" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea3a78128fd0be8b8b10257675c262b378dc5d00b1e18157736a6c27e45ce4fb" dependencies = [ "anyhow", "dbs-utils", @@ -762,8 +703,6 @@ dependencies = [ [[package]] name = "dbs-utils" version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cb6ff873451b76e22789af7fbe1d0478c42c717f817e66908be7a3a2288068c" dependencies = [ "anyhow", "event-manager", @@ -778,8 +717,6 @@ dependencies = [ [[package]] name = "dbs-virtio-devices" version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d671cc3e5f98b84ef6b6bed007d28f72f16d3aea8eb38e2d42b00b2973c1d8" dependencies = [ "byteorder", "caps", @@ -795,9 +732,10 @@ dependencies = [ "log", "nix 0.24.3", "nydus-api", - "nydus-blobfs", "nydus-rafs", + "nydus-storage", "rlimit", + "sendfd", "serde", "serde_json", "thiserror", @@ -830,11 +768,11 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer 0.10.3", + "block-buffer 0.10.4", "crypto-common", "subtle", ] @@ -920,13 +858,13 @@ dependencies = [ [[package]] name = "errno" -version = "0.2.8" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = 
"4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "winapi", + "windows-sys", ] [[package]] @@ -977,14 +915,14 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a3de6e8d11b22ff9edc6d916f890800597d60f8b2da1caf2955c274638d6412" +checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall", - "windows-sys 0.45.0", + "redox_syscall 0.2.16", + "windows-sys", ] [[package]] @@ -995,13 +933,13 @@ checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" [[package]] name = "flate2" -version = "1.0.25" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", "libz-sys", - "miniz_oxide", + "miniz_oxide 0.7.1", ] [[package]] @@ -1011,10 +949,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] -name = "form_urlencoded" -version = "1.1.0" +name = "foreign-types" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" dependencies = [ "percent-encoding", ] @@ -1027,9 +980,9 @@ checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" [[package]] name = "fuse-backend-rs" -version = "0.10.2" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af89cb80a7c5693bd63a2b1ee7ac31a307670977c18fda036b3aa94be8c47f" +checksum = "dc24820b14267bec37fa87f5c2a32b5f1c5405b8c60cc3aa77afd481bd2628a6" dependencies = [ "arc-swap", "bitflags", @@ -1054,9 +1007,9 @@ checksum = "3a471a38ef8ed83cd6e40aa59c1ffe17db6855c18e3604d9c4ed8c08ebc28678" [[package]] name = "futures" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e2792b0ff0340399d58445b88fd9770e3489eff258a4cbc1523418f12abf84" +checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" dependencies = [ "futures-channel", "futures-core", @@ -1069,9 +1022,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e5317663a9089767a1ec00a487df42e0ca174b61b4483213ac24448e4664df5" +checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" dependencies = [ "futures-core", "futures-sink", @@ -1079,15 +1032,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec90ff4d0fe1f57d600049061dc6bb68ed03c7d2fbd697274c41805dcb3f8608" +checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" [[package]] name = "futures-executor" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8de0a35a6ab97ec8869e32a2473f4b1324459e14c29275d14b10cb1fd19b50e" +checksum = 
"ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" dependencies = [ "futures-core", "futures-task", @@ -1096,15 +1049,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb8371b6fb2aeb2d280374607aeabfc99d95c72edfe51692e42d3d7f0d08531" +checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" [[package]] name = "futures-lite" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7694489acd39452c77daa48516b894c153f192c3578d5a839b62c58099fcbf48" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" dependencies = [ "fastrand", "futures-core", @@ -1117,32 +1070,32 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95a73af87da33b5acf53acfebdc339fe592ecf5357ac7c0a7734ab9d8c876a70" +checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] name = "futures-sink" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f310820bb3e8cfd46c80db4d7fb8353e15dfff853a127158425f31e0be6c8364" +checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" [[package]] name = "futures-task" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcf79a1bf610b10f42aea489289c5a2c478a786509693b80cd39c44ccd936366" +checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" [[package]] name = "futures-util" -version = "0.3.26" +version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9c1d6de3acfef38d2be4b1f543f553131788603495be83da675e180c8d6b7bd1" +checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" dependencies = [ "futures-channel", "futures-core", @@ -1158,9 +1111,9 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -1179,9 +1132,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if 1.0.0", "libc", @@ -1221,6 +1174,25 @@ dependencies = [ "cfg-if 0.1.10", ] +[[package]] +name = "h2" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" +dependencies = [ + "bytes 1.4.0", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1230,6 +1202,31 @@ dependencies = [ "ahash", ] +[[package]] +name = "headers" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3e372db8e5c0d213e0cd0b9be18be2aca3d44cf2fe30a9d46a65581cd454584" +dependencies = [ + "base64", + "bitflags", + "bytes 1.4.0", + "headers-core", + "http", + "httpdate", + "mime", + "sha1", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + [[package]] name = "heck" version = "0.3.3" @@ -1263,6 +1260,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + [[package]] name = "hex" version = "0.4.3" @@ -1305,14 +1308,15 @@ checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" [[package]] name = "hyper" -version = "0.14.24" +version = "0.14.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e011372fa0b68db8350aa7a248930ecc7839bf46d8485577d69f117a75f164c" +checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4" dependencies = [ "bytes 1.4.0", "futures-channel", "futures-core", "futures-util", + "h2", "http", "http-body", "httparse", @@ -1350,7 +1354,7 @@ dependencies = [ "crossbeam-channel", "dbs-utils", "dragonball", - "futures 0.3.26", + "futures 0.3.28", "go-flag", "kata-sys-util", "kata-types", @@ -1371,38 +1375,38 @@ dependencies = [ "slog-scope", "thiserror", "tokio", + "tracing", "vmm-sys-util 0.11.1", ] [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "winapi", + "windows", ] [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies 
= [ - "cxx", - "cxx-build", + "cc", ] [[package]] name = "idna" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" dependencies = [ "unicode-bidi", "unicode-normalization", @@ -1410,9 +1414,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.9.2" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", "hashbrown", @@ -1428,13 +1432,20 @@ dependencies = [ ] [[package]] -name = "io-lifetimes" -version = "1.0.5" +name = "integer-encoding" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ + "hermit-abi 0.3.1", "libc", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -1467,9 +1478,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" +checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "jobserver" @@ -1482,9 +1493,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" dependencies = [ "wasm-bindgen", ] @@ -1493,6 +1504,7 @@ dependencies = [ name = "kata-sys-util" version = "0.1.0" dependencies = [ + "anyhow", "byteorder", "cgroups-rs", "chrono", @@ -1571,9 +1583,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "leaky-bucket" -version = "0.12.2" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e0d52231827974ba682f6257fd42a2f79749689f7ca0d763e198a0f7051c91" +checksum = "5e8b256cabb5f5c7affd490acbb12f951d725385971fa602dedb11e09c896b6d" dependencies = [ "parking_lot 0.12.1", "tokio", @@ -1582,15 +1594,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.139" +version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" +checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" [[package]] name = "libz-sys" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +checksum = "56ee889ecc9568871456d42f603d6a0ce59ff328d291063a45cbdf0036baf6db" dependencies = [ "cc", "cmake", @@ -1599,15 +1611,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "link-cplusplus" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" -dependencies = [ - "cc", -] - [[package]] name = "linux-loader" version = "0.6.0" @@ -1619,9 +1622,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux_container" @@ -1636,9 +1639,9 @@ dependencies = [ [[package]] name = "lock_api" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", @@ -1646,11 +1649,10 @@ dependencies = [ [[package]] name = "log" -version = "0.4.17" +version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" dependencies = [ - "cfg-if 1.0.0", "value-bag", ] @@ -1710,6 +1712,12 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" version = "0.6.2" @@ -1720,15 +1728,24 @@ dependencies = [ ] [[package]] -name = "mio" -version = "0.8.6" +name = "miniz_oxide" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", "log", "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -1782,7 +1799,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "65b4b14489ab424703c092062176d52ba55485a89c076b4f9db05092b7223aa6" dependencies = [ "bytes 1.4.0", - "futures 0.3.26", + "futures 0.3.28", "log", "netlink-packet-core", "netlink-sys", @@ -1792,12 +1809,12 @@ dependencies = [ [[package]] name = "netlink-sys" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "260e21fbb6f3d253a14df90eb0000a6066780a15dd901a7519ce02d77a94985b" +checksum = "6471bf08e7ac0135876a9581bf3217ef0333c191c128d34878079f42ee150411" dependencies = [ "bytes 1.4.0", - "futures 0.3.26", + "futures 0.3.28", "libc", "log", "tokio", @@ -1867,13 +1884,13 @@ dependencies = [ ] [[package]] -name = "num-integer" -version = "0.1.45" +name = "nu-ansi-term" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" dependencies = [ - "autocfg", - "num-traits", + "overload", + "winapi", ] [[package]] @@ -1906,82 +1923,45 @@ dependencies = [ [[package]] name = "nydus-api" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1899def1a22ed32b1d60de4e444f525c4023a208ee0d1136a65399cff82837ce" +checksum = "33a6ca41dd10813e3d29397550fbb0f15ad149381f312e04659d39e0adcf2002" dependencies = [ + "backtrace", "libc", "log", - "nydus-error", "serde", "serde_json", "toml 0.5.11", ] -[[package]] -name = "nydus-blobfs" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784cf6e1319da7a94734987dcc71d2940f74231256922431a505c832fc778dd3" -dependencies = [ - "fuse-backend-rs", - "libc", - "log", - "nydus-api", - "nydus-error", - "nydus-rafs", - "nydus-storage", - "serde", - "serde_json", - "vm-memory", -] - -[[package]] -name = "nydus-error" -version = "0.2.3" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae2ec1efd1589377dbefca6b1047294c71b2fbab164d93319f97b20faae92001" -dependencies = [ - "backtrace", - "httpdate", - "libc", - "log", - "serde", - "serde_json", -] - [[package]] name = "nydus-rafs" -version = "0.2.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0ace6945daa16842e72e9fe7647e2b8715856f50f07350cce82bd68db1ed02c" +checksum = "ed21e44a99472850d2afc4fb07427ed46d4e6a8b1cce28b42bd689319e45076d" dependencies = [ "anyhow", "arc-swap", "bitflags", - "blake3", "fuse-backend-rs", - "futures 0.3.26", "lazy_static", "libc", "log", - "lz4-sys", "nix 0.24.3", "nydus-api", - "nydus-error", "nydus-storage", "nydus-utils", "serde", "serde_json", - "spmc", "vm-memory", ] [[package]] name = "nydus-storage" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08bc5ea9054fca2ec8b19dcce25ea600679b7fbf035aad86cfe4a659002c88b" +checksum = "9591fbee1875895bf1f765656695d0be6887fe65372fbf4924b8b3959bd61375" dependencies = [ "arc-swap", "bitflags", @@ -1993,7 +1973,6 @@ dependencies = [ "log", "nix 0.24.3", "nydus-api", - "nydus-error", "nydus-utils", "serde", "serde_json", @@ -2004,12 +1983,13 @@ dependencies = [ [[package]] name = "nydus-utils" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1e681d7207a1ec500323d5ca39ebb7e381fc4f14db5ff0c532c18ff1226a81f" +checksum = "fe8b9269e3a370682f272a1b2cac4bdaf6d6657f3f6966560c4fedab36548362" dependencies = [ "blake3", "flate2", + "httpdate", "lazy_static", "libc", "libz-sys", @@ -2017,7 +1997,8 @@ dependencies = [ "lz4", "lz4-sys", "nix 0.24.3", - "nydus-error", + "nydus-api", + "openssl", "serde", "serde_json", "sha2 0.10.6", @@ -2027,9 +2008,9 @@ dependencies = [ [[package]] name = "object" -version = "0.30.3" +version = "0.30.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439" +checksum = "03b4680b86d9cfafba8fc491dc9b6df26b68cf40e9e6cd73909194759a63c385" dependencies = [ "memchr", ] @@ -2046,9 +2027,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "opaque-debug" @@ -2056,6 +2037,155 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" +[[package]] +name = "openssl" +version = "0.10.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" +dependencies = [ + "bitflags", + "cfg-if 1.0.0", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + +[[package]] +name = "openssl-src" +version = "111.26.0+1.1.1u" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efc62c9f12b22b8f5208c23a7200a442b2e5999f8bdf80233852122b5a4f6f37" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes 1.4.0", + "http", + "hyper", + "opentelemetry_api", + "tokio", +] + +[[package]] +name = "opentelemetry-jaeger" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e785d273968748578931e4dc3b4f5ec86b26e09d9e0d66b55adda7fce742f7a" +dependencies = [ + "async-trait", + "futures 0.3.28", + "futures-executor", + "headers", + "http", + "hyper", + "once_cell", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-semantic-conventions", + "thiserror", + "thrift", + "tokio", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + 
"percent-encoding", + "rand 0.8.5", + "thiserror", + "tokio", + "tokio-stream", +] + +[[package]] +name = "ordered-float" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-multimap" version = "0.4.3" @@ -2067,10 +2197,16 @@ dependencies = [ ] [[package]] -name = "parking" -version = "2.0.0" +name = "overload" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" [[package]] name = "parking_lot" @@ -2090,7 +2226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.7", + "parking_lot_core 0.9.8", ] [[package]] @@ -2102,29 +2238,29 @@ dependencies = [ "cfg-if 1.0.0", "instant", "libc", - "redox_syscall", + "redox_syscall 0.2.16", "smallvec", "winapi", ] [[package]] name = "parking_lot_core" -version = "0.9.7" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "smallvec", - "windows-sys 0.45.0", + "windows-targets", ] [[package]] name = "paste" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba" +checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" [[package]] name = "path-clean" @@ -2134,9 +2270,9 @@ checksum = "17359afc20d7ab31fdb42bb844c8b3bb1dabd7dcf7e68428492da7f16966fcef" [[package]] name = "percent-encoding" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "persist" @@ -2165,22 +2301,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" +checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" +checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] @@ -2197,22 +2333,24 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "polling" -version = "2.5.2" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22122d5ec4f9fe1b3916419b76be1e80bcb93f618d071d2edf841b137b2a2bd6" +checksum = 
"4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" dependencies = [ "autocfg", + "bitflags", "cfg-if 1.0.0", + "concurrent-queue", "libc", "log", - "wepoll-ffi", - "windows-sys 0.42.0", + "pin-project-lite", + "windows-sys", ] [[package]] @@ -2223,9 +2361,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.58" +version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" dependencies = [ "unicode-ident", ] @@ -2360,9 +2498,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" dependencies = [ "proc-macro2", ] @@ -2464,7 +2602,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.8", + "getrandom 0.2.10", ] [[package]] @@ -2494,22 +2632,31 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + [[package]] name = "redox_users" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ - "getrandom 0.2.8", - "redox_syscall", + "getrandom 0.2.10", + "redox_syscall 0.2.16", "thiserror", ] [[package]] name = "regex" -version = "1.7.1" +version = "1.8.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", "memchr", @@ -2518,9 +2665,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "resource" @@ -2531,9 +2678,9 @@ dependencies = [ "anyhow", "async-trait", "bitflags", - "byte-unit 4.0.18", + "byte-unit 4.0.19", "cgroups-rs", - "futures 0.3.26", + "futures 0.3.28", "hex", "hypervisor", "kata-sys-util", @@ -2557,6 +2704,7 @@ dependencies = [ "tempfile", "test-utils", "tokio", + "tracing", "uuid", ] @@ -2575,7 +2723,7 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46f1cfa18f8cebe685373a2697915d7e0db3b4554918bba118385e0f71f258a7" dependencies = [ - "futures 0.3.26", + "futures 0.3.28", "log", "netlink-packet-route", "netlink-proto", @@ -2602,6 +2750,8 @@ dependencies = [ "netns-rs", "nix 0.25.1", "oci", + "opentelemetry", + "opentelemetry-jaeger", "persist", "resource", "serde_json", @@ -2609,6 +2759,9 @@ dependencies = [ "slog", "slog-scope", "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", "url", "virt_container", "wasm_container", @@ -2626,35 +2779,35 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.21" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.36.8" +version = "0.37.20" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] name = "rustversion" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70" +checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" [[package]] name = "ryu" -version = "1.0.12" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" +checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" [[package]] name = "safe-path" @@ -2684,12 +2837,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" - [[package]] name = "seccompiler" version = "0.2.0" @@ -2700,30 +2847,39 @@ dependencies = [ ] [[package]] -name = "serde" -version = "1.0.152" +name = "sendfd" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +checksum = "604b71b8fc267e13bb3023a2c901126c8f349393666a6d98ac1ae5729b701798" +dependencies = [ + "libc", +] + +[[package]] +name = "serde" +version = "1.0.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" dependencies = [ "serde_derive", ] 
[[package]] name = "serde_derive" -version = "1.0.152" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] name = "serde_json" -version = "1.0.93" +version = "1.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cad406b69c91885b5107daf2c29572f6c8cdb3c66826821e286c533490c0bc76" +checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1" dependencies = [ "itoa", "ryu", @@ -2767,9 +2923,21 @@ dependencies = [ "slog", "slog-scope", "tokio", + "tracing", "ttrpc", ] +[[package]] +name = "sha1" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.10.7", +] + [[package]] name = "sha2" version = "0.9.3" @@ -2791,7 +2959,16 @@ checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" dependencies = [ "cfg-if 1.0.0", "cpufeatures", - "digest 0.10.6", + "digest 0.10.7", +] + +[[package]] +name = "sharded-slab" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +dependencies = [ + "lazy_static", ] [[package]] @@ -2811,6 +2988,7 @@ dependencies = [ "oci", "protobuf 3.2.0", "rand 0.8.5", + "runtimes", "serial_test", "service", "sha2 0.9.3", @@ -2822,6 +3000,8 @@ dependencies = [ "tests_utils", "thiserror", "tokio", + "tracing", + "tracing-opentelemetry", "unix_socket2", ] @@ -2891,7 +3071,7 @@ dependencies = [ "serde", "serde_json", "slog", - "time 0.3.20", + "time 0.3.22", ] [[package]] @@ -2926,7 +3106,7 @@ dependencies = [ "slog", "term", 
"thread_local", - "time 0.3.20", + "time 0.3.22", ] [[package]] @@ -2945,12 +3125,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "spmc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02a8428da277a8e3a15271d79943e80ccc2ef254e78813a166a08d65e4c3ece5" - [[package]] name = "static_assertions" version = "1.1.0" @@ -2991,9 +3165,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" @@ -3008,9 +3182,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.16" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" dependencies = [ "proc-macro2", "quote", @@ -3036,15 +3210,16 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.4.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" dependencies = [ + "autocfg", "cfg-if 1.0.0", "fastrand", - "redox_syscall", + "redox_syscall 0.3.5", "rustix", - "windows-sys 0.42.0", + "windows-sys", ] [[package]] @@ -3058,15 +3233,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "termcolor" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" -dependencies = [ - "winapi-util", -] - [[package]] name = "test-utils" version = "0.1.0" @@ -3083,22 +3249,22 @@ dependencies = [ [[package]] name = "thiserror" 
-version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] @@ -3120,6 +3286,19 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "thrift" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09678c4cdbb4eed72e18b7c2af1329c69825ed16fcbac62d083fc3e2b0590ff0" +dependencies = [ + "byteorder", + "integer-encoding", + "log", + "ordered-float", + "threadpool", +] + [[package]] name = "time" version = "0.1.45" @@ -3133,9 +3312,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.20" +version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd" dependencies = [ "itoa", "libc", @@ -3147,24 +3326,24 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" [[package]] name = "time-macros" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" 
+checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" dependencies = [ "time-core", ] [[package]] name = "timerfd" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0664936efa25f2bbe03ca25b62c50f5f492abec07e59d6dcf45131014b33483f" +checksum = "3d3fd47d83ad0b5c7be2e8db0b9d712901ef6ce5afbcc6f676761004f5104ea2" dependencies = [ "rustix", ] @@ -3186,9 +3365,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.1" +version = "1.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" dependencies = [ "autocfg", "bytes 1.4.0", @@ -3200,7 +3379,7 @@ dependencies = [ "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -3211,7 +3390,18 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.16", + "syn 2.0.18", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", ] [[package]] @@ -3228,6 +3418,20 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-util" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +dependencies = [ + "bytes 1.4.0", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + [[package]] name = "tokio-vsock" version = "0.3.4" @@ -3235,7 +3439,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9b33556828911d16e24d8b5d336446b0bf6b4b9bfda52cbdc2fa35b7a2862ebc" dependencies = [ "bytes 0.4.12", - "futures 0.3.26", + "futures 0.3.28", "libc", "tokio", "vsock", @@ -3279,22 +3483,62 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" +checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" +dependencies = [ + "nu-ansi-term", + "sharded-slab", + "smallvec", + "thread_local", + "tracing-core", + "tracing-log", ] [[package]] @@ -3311,7 +3555,7 @@ checksum = "a35f22a2964bea14afee161665bb260b83cb48e665e0260ca06ec0e775c8b06c" dependencies = [ 
"async-trait", "byteorder", - "futures 0.3.26", + "futures 0.3.28", "libc", "log", "nix 0.23.2", @@ -3357,15 +3601,15 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "unicode-bidi" -version = "0.3.10" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54675592c1dbefd78cbd98db9bacd89886e1ca50692a0692baefffdeb92dd58" +checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.6" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-normalization" @@ -3382,12 +3626,6 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" -[[package]] -name = "unicode-width" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" - [[package]] name = "unix_socket2" version = "0.5.4" @@ -3399,9 +3637,9 @@ dependencies = [ [[package]] name = "url" -version = "2.3.1" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" dependencies = [ "form_urlencoded", "idna", @@ -3424,14 +3662,16 @@ dependencies = [ ] [[package]] -name = "value-bag" -version = "1.0.0-alpha.9" +name = "valuable" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55" -dependencies = [ - "ctor", - "version_check", -] +checksum = 
"830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "value-bag" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4d330786735ea358f3bc09eea4caa098569c1c93f342d9aca0514915022fe7e" [[package]] name = "vcpkg" @@ -3456,7 +3696,7 @@ dependencies = [ "awaitgroup", "common", "containerd-shim-protos", - "futures 0.3.26", + "futures 0.3.28", "hypervisor", "kata-sys-util", "kata-types", @@ -3475,6 +3715,7 @@ dependencies = [ "slog-scope", "tokio", "toml 0.4.10", + "tracing", "url", ] @@ -3585,9 +3826,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" dependencies = [ "cfg-if 1.0.0", "wasm-bindgen-macro", @@ -3595,24 +3836,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -3622,9 +3863,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.86" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3632,22 +3873,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" [[package]] name = "wasm_container" @@ -3662,23 +3903,14 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" dependencies = [ "js-sys", "wasm-bindgen", ] -[[package]] -name = "wepoll-ffi" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d743fdedc5c64377b5fc2bc036b01c7fd642205a0d96356034ae3404d49eb7fb" -dependencies = [ - "cc", -] - [[package]] name = "which" version = "4.4.0" @@ -3706,15 +3938,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -3722,27 +3945,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "windows-sys" -version = "0.42.0" +name = "windows" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows_aarch64_gnullvm 0.42.1", - "windows_aarch64_msvc 0.42.1", - "windows_i686_gnu 0.42.1", - "windows_i686_msvc 0.42.1", - "windows_x86_64_gnu 0.42.1", - "windows_x86_64_gnullvm 0.42.1", - "windows_x86_64_msvc 0.42.1", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.1", + "windows-targets", ] [[package]] @@ -3751,22 +3959,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-targets" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" -dependencies = [ - "windows_aarch64_gnullvm 0.42.1", - "windows_aarch64_msvc 0.42.1", - "windows_i686_gnu 0.42.1", - "windows_i686_msvc 0.42.1", - "windows_x86_64_gnu 0.42.1", - "windows_x86_64_gnullvm 0.42.1", - "windows_x86_64_msvc 0.42.1", + "windows-targets", ] [[package]] @@ -3775,93 +3968,51 @@ version 
= "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" - [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" -[[package]] -name = "windows_i686_gnu" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" - [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" -[[package]] -name = "windows_i686_msvc" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" - [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" - [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" - [[package]] name = "windows_x86_64_msvc" version = "0.48.0" @@ -3898,9 +4049,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.7+zstd.1.5.4" +version = "2.0.8+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" dependencies = [ "cc", "libc", diff --git a/src/runtime-rs/Makefile b/src/runtime-rs/Makefile index 5304290ca..1981a37d9 100644 --- a/src/runtime-rs/Makefile +++ b/src/runtime-rs/Makefile @@ -202,6 +202,7 @@ ifneq (,$(DBCMD)) SYSCONFIG_PATHS += $(SYSCONFIG_DB) CONFIGS += 
$(CONFIG_DB) # dragonball-specific options (all should be suffixed by "_DB") + VMROOTFSDRIVER_DB := virtio-blk-pci DEFMAXVCPUS_DB := 1 DEFBLOCKSTORAGEDRIVER_DB := virtio-blk-mmio DEFNETWORKMODEL_DB := tcfilter @@ -235,6 +236,7 @@ USER_VARS += SYSCONFIG USER_VARS += IMAGENAME USER_VARS += IMAGEPATH USER_VARS += DEFROOTFSTYPE +USER_VARS += VMROOTFSDRIVER_DB USER_VARS += MACHINETYPE USER_VARS += KERNELDIR USER_VARS += KERNELTYPE diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index e9c2b8c19..58e29b9dd 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -23,6 +23,11 @@ image = "@IMAGEPATH@" # - erofs rootfs_type=@DEFROOTFSTYPE@ + +# Block storage driver to be used for the VM rootfs is backed +# by a block device. This is virtio-blk-pci, virtio-blk-mmio or nvdimm +vm_rootfs_driver = "@VMROOTFSDRIVER_DB@" + # List of valid annotation names for the hypervisor # Each member of the list is a regular expression, which is the base name # of the annotation, e.g. 
"path" for io.katacontainers.config.hypervisor.path" diff --git a/src/runtime-rs/crates/agent/Cargo.toml b/src/runtime-rs/crates/agent/Cargo.toml index 7639cb4f7..53f6c7290 100644 --- a/src/runtime-rs/crates/agent/Cargo.toml +++ b/src/runtime-rs/crates/agent/Cargo.toml @@ -19,6 +19,7 @@ slog = "2.5.2" slog-scope = "4.4.0" ttrpc = { version = "0.7.1" } tokio = { version = "1.28.1", features = ["fs", "rt"] } +tracing = "0.1.36" url = "2.2.2" nix = "0.24.2" diff --git a/src/runtime-rs/crates/agent/src/kata/agent.rs b/src/runtime-rs/crates/agent/src/kata/agent.rs index 13ba4085d..d5f5b2b7f 100644 --- a/src/runtime-rs/crates/agent/src/kata/agent.rs +++ b/src/runtime-rs/crates/agent/src/kata/agent.rs @@ -6,6 +6,7 @@ use anyhow::{Context, Result}; use async_trait::async_trait; +use tracing::instrument; use ttrpc::context as ttrpc_ctx; use kata_types::config::Agent as AgentConfig; @@ -22,6 +23,7 @@ fn new_ttrpc_ctx(timeout: i64) -> ttrpc_ctx::Context { #[async_trait] impl AgentManager for KataAgent { + #[instrument] async fn start(&self, address: &str) -> Result<()> { info!(sl!(), "begin to connect agent {:?}", address); self.set_socket_address(address) @@ -73,6 +75,7 @@ macro_rules! 
impl_agent { ($($name: tt | $req: ty | $resp: ty | $new_timeout: expr),*) => { #[async_trait] impl Agent for KataAgent { + #[instrument(skip(req))] $(async fn $name(&self, req: $req) -> Result<$resp> { let r = req.into(); let (client, mut timeout, _) = self.get_agent_client().await.context("get client")?; diff --git a/src/runtime-rs/crates/agent/src/kata/mod.rs b/src/runtime-rs/crates/agent/src/kata/mod.rs index d87965f61..ce3f2305e 100644 --- a/src/runtime-rs/crates/agent/src/kata/mod.rs +++ b/src/runtime-rs/crates/agent/src/kata/mod.rs @@ -44,6 +44,19 @@ pub(crate) struct KataAgentInner { log_forwarder: LogForwarder, } +impl std::fmt::Debug for KataAgentInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KataAgentInner") + .field("client_fd", &self.client_fd) + .field("socket_address", &self.socket_address) + .field("config", &self.config) + .finish() + } +} + +unsafe impl Send for KataAgent {} +unsafe impl Sync for KataAgent {} +#[derive(Debug)] pub struct KataAgent { pub(crate) inner: Arc>, } diff --git a/src/runtime-rs/crates/hypervisor/Cargo.toml b/src/runtime-rs/crates/hypervisor/Cargo.toml index f629c3bd0..85a4666ea 100644 --- a/src/runtime-rs/crates/hypervisor/Cargo.toml +++ b/src/runtime-rs/crates/hypervisor/Cargo.toml @@ -11,7 +11,7 @@ license = "Apache-2.0" actix-rt = "2.7.0" anyhow = "^1.0" async-trait = "0.1.48" -dbs-utils = "0.2.0" +dbs-utils = { path = "../../../dragonball/src/dbs_utils" } go-flag = "0.1.0" libc = ">=0.2.39" nix = "0.24.2" @@ -28,6 +28,7 @@ vmm-sys-util = "0.11.0" rand = "0.8.4" path-clean = "1.0.1" lazy_static = "1.4" +tracing = "0.1.36" kata-sys-util = { path = "../../../libs/kata-sys-util" } kata-types = { path = "../../../libs/kata-types" } diff --git a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs index 381a324fa..76f62feed 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs +++ 
b/src/runtime-rs/crates/hypervisor/src/device/device_manager.rs @@ -11,8 +11,9 @@ use kata_sys_util::rand::RandomBytes; use tokio::sync::{Mutex, RwLock}; use crate::{ - device::VhostUserBlkDevice, BlockConfig, BlockDevice, Hypervisor, VfioDevice, VhostUserConfig, - KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, + vhost_user_blk::VhostUserBlkDevice, BlockConfig, BlockDevice, Hypervisor, NetworkDevice, + VfioDevice, VhostUserConfig, KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, + VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM, }; use super::{ @@ -22,57 +23,66 @@ use super::{ pub type ArcMutexDevice = Arc>; +macro_rules! declare_index { + ($self:ident, $index:ident, $released_index:ident) => {{ + let current_index = if let Some(index) = $self.$released_index.pop() { + index + } else { + $self.$index + }; + $self.$index += 1; + Ok(current_index) + }}; +} + +macro_rules! release_index { + ($self:ident, $index:ident, $released_index:ident) => {{ + $self.$released_index.push($index); + $self.$released_index.sort_by(|a, b| b.cmp(a)); + }}; +} + /// block_index and released_block_index are used to search an available block index /// in Sandbox. +/// pmem_index and released_pmem_index are used to search an available pmem index +/// in Sandbox. /// -/// @block_driver to be used for block device; -/// @block_index generally default is 1 for ; +/// @pmem_index generally default is 0 for ; +/// @block_index generally default is 0 for ; +/// @released_pmem_index for pmem devices removed and indexes will released at the same time. /// @released_block_index for blk devices removed and indexes will released at the same time. 
#[derive(Clone, Debug, Default)] struct SharedInfo { - block_driver: String, + pmem_index: u64, block_index: u64, + released_pmem_index: Vec, released_block_index: Vec, } impl SharedInfo { - async fn new(hypervisor: Arc) -> Self { - // get hypervisor block driver - let block_driver = match hypervisor - .hypervisor_config() - .await - .blockdev_info - .block_device_driver - .as_str() - { - // convert the block driver to kata type - VIRTIO_BLOCK_MMIO => KATA_MMIO_BLK_DEV_TYPE.to_string(), - VIRTIO_BLOCK_PCI => KATA_BLK_DEV_TYPE.to_string(), - _ => "".to_string(), - }; - + async fn new() -> Self { SharedInfo { - block_driver, - block_index: 1, + pmem_index: 0, + block_index: 0, + released_pmem_index: vec![], released_block_index: vec![], } } - // declare the available block index - fn declare_device_index(&mut self) -> Result { - let current_index = if let Some(index) = self.released_block_index.pop() { - index + fn declare_device_index(&mut self, is_pmem: bool) -> Result { + if is_pmem { + declare_index!(self, pmem_index, released_pmem_index) } else { - self.block_index - }; - self.block_index += 1; - - Ok(current_index) + declare_index!(self, block_index, released_block_index) + } } - fn release_device_index(&mut self, index: u64) { - self.released_block_index.push(index); - self.released_block_index.sort_by(|a, b| b.cmp(a)); + fn release_device_index(&mut self, index: u64, is_pmem: bool) { + if is_pmem { + release_index!(self, index, released_pmem_index); + } else { + release_index!(self, index, released_block_index); + } } } @@ -89,12 +99,20 @@ impl DeviceManager { let devices = HashMap::::new(); Ok(DeviceManager { devices, - hypervisor: hypervisor.clone(), - shared_info: SharedInfo::new(hypervisor.clone()).await, + hypervisor, + shared_info: SharedInfo::new().await, }) } - pub async fn try_add_device(&mut self, device_id: &str) -> Result<()> { + async fn get_block_driver(&self) -> String { + self.hypervisor + .hypervisor_config() + .await + .blockdev_info + 
.block_device_driver + } + + async fn try_add_device(&mut self, device_id: &str) -> Result<()> { // find the device let device = self .devices @@ -107,7 +125,10 @@ impl DeviceManager { if let Err(e) = result { match device_guard.get_device_info().await { DeviceType::Block(device) => { - self.shared_info.release_device_index(device.config.index); + self.shared_info.release_device_index( + device.config.index, + device.config.driver_option == *KATA_NVDIMM_DEV_TYPE, + ); } DeviceType::Vfio(device) => { // safe here: @@ -115,11 +136,12 @@ impl DeviceManager { // and needs do release_device_index. otherwise, let it go. if device.config.dev_type == DEVICE_TYPE_BLOCK { self.shared_info - .release_device_index(device.config.virt_path.unwrap().0); + .release_device_index(device.config.virt_path.unwrap().0, false); } } DeviceType::VhostUserBlk(device) => { - self.shared_info.release_device_index(device.config.index); + self.shared_info + .release_device_index(device.config.index, false); } _ => { debug!(sl!(), "no need to do release device index."); @@ -141,8 +163,14 @@ impl DeviceManager { let result = match device_guard.detach(self.hypervisor.as_ref()).await { Ok(index) => { if let Some(i) = index { - // release the declared block device index - self.shared_info.release_device_index(i); + // release the declared device index + let is_pmem = + if let DeviceType::Block(blk) = device_guard.get_device_info().await { + blk.config.driver_option == *KATA_NVDIMM_DEV_TYPE + } else { + false + }; + self.shared_info.release_device_index(i, is_pmem); } Ok(()) } @@ -193,6 +221,11 @@ impl DeviceManager { return Some(device_id.to_string()); } } + DeviceType::Network(device) => { + if device.config.host_dev_name == host_path { + return Some(device_id.to_string()); + } + } _ => { // TODO: support find other device type continue; @@ -203,13 +236,19 @@ impl DeviceManager { None } - fn get_dev_virt_path(&mut self, dev_type: &str) -> Result> { + fn get_dev_virt_path( + &mut self, + dev_type: 
&str, + is_pmem: bool, + ) -> Result> { let virt_path = if dev_type == DEVICE_TYPE_BLOCK { - // generate virt path - let current_index = self.shared_info.declare_device_index()?; - let drive_name = get_virt_drive_name(current_index as i32)?; + let current_index = self.shared_info.declare_device_index(is_pmem)?; + let drive_name = if is_pmem { + format!("pmem{}", current_index) + } else { + get_virt_drive_name(current_index as i32)? + }; let virt_path_name = format!("/dev/{}", drive_name); - Some((current_index, virt_path_name)) } else { // only dev_type is block, otherwise, it's None. @@ -241,8 +280,7 @@ impl DeviceManager { if let Some(device_matched_id) = self.find_device(dev_host_path).await { return Ok(device_matched_id); } - - let virt_path = self.get_dev_virt_path(vfio_dev_config.dev_type.as_str())?; + let virt_path = self.get_dev_virt_path(vfio_dev_config.dev_type.as_str(), false)?; vfio_dev_config.virt_path = virt_path; Arc::new(Mutex::new(VfioDevice::new( @@ -267,6 +305,21 @@ impl DeviceManager { .await .context("failed to create vhost blk device")? } + DeviceConfig::NetworkCfg(config) => { + // try to find the device, found and just return id. + if let Some(dev_id_matched) = self.find_device(config.host_dev_name.clone()).await { + info!( + sl!(), + "network device with path:{:?} found. 
return network device id: {:?}", + config.host_dev_name.clone(), + dev_id_matched + ); + + return Ok(dev_id_matched); + } + + Arc::new(Mutex::new(NetworkDevice::new(device_id.clone(), config))) + } _ => { return Err(anyhow!("invliad device type")); } @@ -283,12 +336,28 @@ impl DeviceManager { config: &VhostUserConfig, device_id: String, ) -> Result { + // TODO virtio-scsi let mut vhu_blk_config = config.clone(); - vhu_blk_config.driver_option = self.shared_info.block_driver.clone(); + + match vhu_blk_config.driver_option.as_str() { + // convert the block driver to kata type + VIRTIO_BLOCK_MMIO => { + vhu_blk_config.driver_option = KATA_MMIO_BLK_DEV_TYPE.to_string(); + } + VIRTIO_BLOCK_PCI => { + vhu_blk_config.driver_option = KATA_BLK_DEV_TYPE.to_string(); + } + _ => { + return Err(anyhow!( + "unsupported driver type {}", + vhu_blk_config.driver_option + )); + } + }; // generate block device index and virt path // safe here, Block device always has virt_path. - if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK)? { + if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK, false)? { vhu_blk_config.index = virt_path.0; vhu_blk_config.virt_path = virt_path.1; } @@ -305,10 +374,30 @@ impl DeviceManager { device_id: String, ) -> Result { let mut block_config = config.clone(); - block_config.driver_option = self.shared_info.block_driver.clone(); + let mut is_pmem = false; + + match block_config.driver_option.as_str() { + // convert the block driver to kata type + VIRTIO_BLOCK_MMIO => { + block_config.driver_option = KATA_MMIO_BLK_DEV_TYPE.to_string(); + } + VIRTIO_BLOCK_PCI => { + block_config.driver_option = KATA_BLK_DEV_TYPE.to_string(); + } + VIRTIO_PMEM => { + block_config.driver_option = KATA_NVDIMM_DEV_TYPE.to_string(); + is_pmem = true; + } + _ => { + return Err(anyhow!( + "unsupported driver type {}", + block_config.driver_option + )); + } + }; // generate virt path - if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK)? 
{ + if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK, is_pmem)? { block_config.index = virt_path.0; block_config.virt_path = virt_path.1; } @@ -377,3 +466,7 @@ pub async fn do_handle_device( Ok(device_info) } + +pub async fn get_block_driver(d: &RwLock) -> String { + d.read().await.get_block_driver().await +} diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs index 7b676fd56..3feb2d2ea 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/mod.rs @@ -16,8 +16,8 @@ pub use vfio::{ VfioBusMode, VfioConfig, VfioDevice, }; pub use virtio_blk::{ - BlockConfig, BlockDevice, KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, VIRTIO_BLOCK_MMIO, - VIRTIO_BLOCK_PCI, + BlockConfig, BlockDevice, KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE, KATA_NVDIMM_DEV_TYPE, + VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI, VIRTIO_PMEM, }; pub use virtio_fs::{ ShareFsDevice, ShareFsDeviceConfig, ShareFsMountConfig, ShareFsMountDevice, ShareFsMountType, diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs index 5091b688e..4f6b44ef4 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_blk.rs @@ -13,8 +13,10 @@ use async_trait::async_trait; /// VIRTIO_BLOCK_PCI indicates block driver is virtio-pci based pub const VIRTIO_BLOCK_PCI: &str = "virtio-blk-pci"; pub const VIRTIO_BLOCK_MMIO: &str = "virtio-blk-mmio"; +pub const VIRTIO_PMEM: &str = "virtio-pmem"; pub const KATA_MMIO_BLK_DEV_TYPE: &str = "mmioblk"; pub const KATA_BLK_DEV_TYPE: &str = "blk"; +pub const KATA_NVDIMM_DEV_TYPE: &str = "nvdimm"; #[derive(Debug, Clone, Default)] pub struct BlockConfig { diff --git a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs 
b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs index 3be861ced..2fa825b51 100644 --- a/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs +++ b/src/runtime-rs/crates/hypervisor/src/device/driver/virtio_net.rs @@ -6,6 +6,14 @@ use std::fmt; +use anyhow::{Context, Result}; +use async_trait::async_trait; + +use crate::{ + device::{Device, DeviceType}, + Hypervisor as hypervisor, +}; + #[derive(Clone)] pub struct Address(pub [u8; 6]); @@ -20,20 +28,71 @@ impl fmt::Debug for Address { } } -#[derive(Debug, Clone)] +#[derive(Clone, Debug, Default)] pub struct NetworkConfig { + /// for detach, now it's default value 0. + pub index: u64, + /// Host level path for the guest network interface. pub host_dev_name: String, + /// Guest iface name for the guest network interface. + pub virt_iface_name: String, + /// Guest MAC address. pub guest_mac: Option
, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug, Default)] pub struct NetworkDevice { /// Unique identifier of the device - pub id: String, + pub device_id: String, /// Network Device config info pub config: NetworkConfig, } + +impl NetworkDevice { + // new creates a NetworkDevice + pub fn new(device_id: String, config: &NetworkConfig) -> Self { + Self { + device_id, + config: config.clone(), + } + } +} + +#[async_trait] +impl Device for NetworkDevice { + async fn attach(&mut self, h: &dyn hypervisor) -> Result<()> { + h.add_device(DeviceType::Network(self.clone())) + .await + .context("add network device.")?; + + return Ok(()); + } + + async fn detach(&mut self, h: &dyn hypervisor) -> Result> { + h.remove_device(DeviceType::Network(self.clone())) + .await + .context("remove network device.")?; + + Ok(Some(self.config.index)) + } + + async fn get_device_info(&self) -> DeviceType { + DeviceType::Network(self.clone()) + } + + async fn increase_attach_count(&mut self) -> Result { + // network devices will not be attached multiple times, Just return Ok(false) + + Ok(false) + } + + async fn decrease_attach_count(&mut self) -> Result { + // network devices will not be detached multiple times, Just return Ok(false) + + Ok(false) + } +} diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs index 91f9406b8..4e49e0baa 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner.rs @@ -7,23 +7,24 @@ use super::vmm_instance::VmmInstance; use crate::{ device::DeviceType, hypervisor_persist::HypervisorState, kernel_param::KernelParams, VmmState, - DEV_HUGEPAGES, HUGETLBFS, HYPERVISOR_DRAGONBALL, SHMEM, VM_ROOTFS_DRIVER_BLK, - VM_ROOTFS_DRIVER_MMIO, + DEV_HUGEPAGES, HUGETLBFS, HYPERVISOR_DRAGONBALL, SHMEM, }; use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use dragonball::{ - api::v1::{BlockDeviceConfigInfo, 
BootSourceConfig, VcpuResizeInfo}, + api::v1::{BootSourceConfig, VcpuResizeInfo}, vm::VmConfigInfo, }; + use kata_sys_util::mount; use kata_types::{ capabilities::{Capabilities, CapabilityBits}, config::hypervisor::Hypervisor as HypervisorConfig, }; +use nix::mount::MsFlags; use persist::sandbox_persist::Persist; use shim_interface::KATA_PATH; -use std::{collections::HashSet, fs::create_dir_all, path::PathBuf}; +use std::{collections::HashSet, fs::create_dir_all}; const DRAGONBALL_KERNEL: &str = "vmlinux"; const DRAGONBALL_ROOT_FS: &str = "rootfs"; @@ -120,22 +121,6 @@ impl DragonballInner { ) .context("set_boot_source")?; - // get vm rootfs - let image = { - let initrd_path = self.config.boot_info.initrd.clone(); - let image_path = self.config.boot_info.image.clone(); - if !initrd_path.is_empty() { - Ok(initrd_path) - } else if !image_path.is_empty() { - Ok(image_path) - } else { - Err(anyhow!("failed to get image")) - } - } - .context("get image")?; - self.set_vm_rootfs(&image, &rootfs_driver) - .context("set vm rootfs")?; - // add pending devices while let Some(dev) = self.pending_devices.pop() { self.add_device(dev).await.context("add_device")?; @@ -232,7 +217,8 @@ impl DragonballInner { } let jailed_location = [self.jailer_root.as_str(), dst].join("/"); - mount::bind_mount_unchecked(src, jailed_location.as_str(), false).context("bind_mount")?; + mount::bind_mount_unchecked(src, jailed_location.as_str(), false, MsFlags::MS_SLAVE) + .context("bind_mount")?; let mut abs_path = String::from("/"); abs_path.push_str(dst); @@ -261,37 +247,6 @@ impl DragonballInner { .context("put boot source") } - fn set_vm_rootfs(&mut self, path: &str, driver: &str) -> Result<()> { - info!(sl!(), "set vm rootfs {} {}", path, driver); - let jail_drive = self - .get_resource(path, DRAGONBALL_ROOT_FS) - .context("get resource")?; - - if driver == VM_ROOTFS_DRIVER_BLK || driver == VM_ROOTFS_DRIVER_MMIO { - let blk_cfg = BlockDeviceConfigInfo { - path_on_host: PathBuf::from(jail_drive), 
- drive_id: DRAGONBALL_ROOT_FS.to_string(), - is_root_device: false, - // Add it as a regular block device - // This allows us to use a partitioned root block device - // is_read_only - is_read_only: true, - is_direct: false, - ..Default::default() - }; - - self.vmm_instance - .insert_block_device(blk_cfg) - .context("inert block device") - } else { - Err(anyhow!( - "Unknown vm_rootfs driver {} path {:?}", - driver, - path - )) - } - } - fn start_vmm_instance(&mut self) -> Result<()> { info!(sl!(), "Starting VM"); self.vmm_instance diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs index 1bd8f2ecc..fe7186c76 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/inner_device.rs @@ -48,7 +48,7 @@ impl DragonballInner { info!(sl!(), "dragonball add device {:?}", &device); match device { DeviceType::Network(network) => self - .add_net_device(&network.config, network.id) + .add_net_device(&network.config) .context("add net device"), DeviceType::Vfio(hostdev) => self.add_vfio_device(&hostdev).context("add vfio device"), DeviceType::Block(block) => self @@ -84,6 +84,15 @@ impl DragonballInner { info!(sl!(), "remove device {} ", device); match device { + DeviceType::Network(network) => { + // Dragonball doesn't support remove network device, just print message. 
+ info!( + sl!(), + "dragonball remove network device: {:?}.", network.config.virt_iface_name + ); + + Ok(()) + } DeviceType::Block(block) => { let drive_id = drive_index_to_id(block.config.index); self.remove_block_drive(drive_id.as_str()) @@ -197,9 +206,9 @@ impl DragonballInner { Ok(()) } - fn add_net_device(&mut self, config: &NetworkConfig, device_id: String) -> Result<()> { + fn add_net_device(&mut self, config: &NetworkConfig) -> Result<()> { let iface_cfg = VirtioNetDeviceConfigInfo { - iface_id: device_id, + iface_id: config.virt_iface_name.clone(), host_dev_name: config.host_dev_name.clone(), guest_mac: match &config.guest_mac { Some(mac) => MacAddr::from_bytes(&mac.0).ok(), diff --git a/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs b/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs index 23445bf94..76e949381 100644 --- a/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs +++ b/src/runtime-rs/crates/hypervisor/src/dragonball/mod.rs @@ -19,14 +19,20 @@ use async_trait::async_trait; use kata_types::capabilities::Capabilities; use kata_types::config::hypervisor::Hypervisor as HypervisorConfig; use tokio::sync::RwLock; +use tracing::instrument; use crate::{DeviceType, Hypervisor, VcpuThreadIds}; -#[derive(Debug)] pub struct Dragonball { inner: Arc>, } +impl std::fmt::Debug for Dragonball { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Dragonball").finish() + } +} + impl Default for Dragonball { fn default() -> Self { Self::new() @@ -48,11 +54,13 @@ impl Dragonball { #[async_trait] impl Hypervisor for Dragonball { + #[instrument] async fn prepare_vm(&self, id: &str, netns: Option) -> Result<()> { let mut inner = self.inner.write().await; inner.prepare_vm(id, netns).await } + #[instrument] async fn start_vm(&self, timeout: i32) -> Result<()> { let mut inner = self.inner.write().await; inner.start_vm(timeout).await diff --git a/src/runtime-rs/crates/hypervisor/src/kernel_param.rs 
b/src/runtime-rs/crates/hypervisor/src/kernel_param.rs index 554d61660..e2804dd95 100644 --- a/src/runtime-rs/crates/hypervisor/src/kernel_param.rs +++ b/src/runtime-rs/crates/hypervisor/src/kernel_param.rs @@ -87,7 +87,7 @@ impl KernelParams { params.push(Param::new("rootflags", "dax ro")); } _ => { - return Err(anyhow!("Unsupported rootfs type")); + return Err(anyhow!("Unsupported rootfs type {}", rootfs_type)); } } } @@ -101,12 +101,12 @@ impl KernelParams { params.push(Param::new("rootflags", "ro")); } _ => { - return Err(anyhow!("Unsupported rootfs type")); + return Err(anyhow!("Unsupported rootfs type {}", rootfs_type)); } } } _ => { - return Err(anyhow!("Unsupported rootfs driver")); + return Err(anyhow!("Unsupported rootfs driver {}", rootfs_driver)); } } @@ -310,7 +310,7 @@ mod tests { ] .to_vec(), }, - result: Err(anyhow!("Unsupported rootfs driver")), + result: Err(anyhow!("Unsupported rootfs driver foo")), }, // Unsupported rootfs type TestData { @@ -324,7 +324,7 @@ mod tests { ] .to_vec(), }, - result: Err(anyhow!("Unsupported rootfs type")), + result: Err(anyhow!("Unsupported rootfs type foo")), }, ]; @@ -332,7 +332,6 @@ mod tests { let msg = format!("test[{}]: {:?}", i, t); let result = KernelParams::new_rootfs_kernel_params(t.rootfs_driver, t.rootfs_type); let msg = format!("{}, result: {:?}", msg, result); - if t.result.is_ok() { assert!(result.is_ok(), "{}", msg); assert_eq!(t.expect_params, result.unwrap()); diff --git a/src/runtime-rs/crates/resource/Cargo.toml b/src/runtime-rs/crates/resource/Cargo.toml index 22ffda48b..62f1c2fd0 100644 --- a/src/runtime-rs/crates/resource/Cargo.toml +++ b/src/runtime-rs/crates/resource/Cargo.toml @@ -31,6 +31,7 @@ serde_json = "1.0.82" slog = "2.5.2" slog-scope = "4.4.0" tokio = { version = "1.28.1", features = ["process"] } +tracing = "0.1.36" uuid = { version = "0.4", features = ["v4"] } agent = { path = "../agent" } diff --git a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs 
b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs index 2a8b6e600..bb3af793d 100644 --- a/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs +++ b/src/runtime-rs/crates/resource/src/cpu_mem/initial_size.rs @@ -96,7 +96,7 @@ impl InitialSizeManager { pub fn setup_config(&self, config: &mut TomlConfig) -> Result<()> { // update this data to the hypervisor config for later use by hypervisor let hypervisor_name = &config.runtime.hypervisor_name; - let mut hv = config + let hv = config .hypervisor .get_mut(hypervisor_name) .context("failed to get hypervisor config")?; diff --git a/src/runtime-rs/crates/resource/src/lib.rs b/src/runtime-rs/crates/resource/src/lib.rs index 4e4aae9e8..c12aa0e48 100644 --- a/src/runtime-rs/crates/resource/src/lib.rs +++ b/src/runtime-rs/crates/resource/src/lib.rs @@ -17,6 +17,7 @@ pub mod manager; mod manager_inner; pub mod network; pub mod resource_persist; +use hypervisor::BlockConfig; use network::NetworkConfig; pub mod rootfs; pub mod share_fs; @@ -30,6 +31,7 @@ use kata_types::config::hypervisor::SharedFsInfo; pub enum ResourceConfig { Network(NetworkConfig), ShareFs(SharedFsInfo), + VmRootfs(BlockConfig), } #[derive(Debug, Clone, Copy, PartialEq)] diff --git a/src/runtime-rs/crates/resource/src/manager.rs b/src/runtime-rs/crates/resource/src/manager.rs index 58a42db0f..a96e16b8d 100644 --- a/src/runtime-rs/crates/resource/src/manager.rs +++ b/src/runtime-rs/crates/resource/src/manager.rs @@ -17,6 +17,7 @@ use kata_types::mount::Mount; use oci::{Linux, LinuxResources}; use persist::sandbox_persist::Persist; use tokio::sync::RwLock; +use tracing::instrument; use crate::network::NetworkConfig; use crate::resource_persist::ResourceState; @@ -34,6 +35,12 @@ pub struct ResourceManager { inner: Arc>, } +impl std::fmt::Debug for ResourceManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResourceManager").finish() + } +} + impl ResourceManager { pub async fn new( sid: &str, @@ 
-58,6 +65,7 @@ impl ResourceManager { inner.get_device_manager() } + #[instrument] pub async fn prepare_before_start_vm(&self, device_configs: Vec) -> Result<()> { let mut inner = self.inner.write().await; inner.prepare_before_start_vm(device_configs).await @@ -68,6 +76,7 @@ impl ResourceManager { inner.handle_network(network_config).await } + #[instrument] pub async fn setup_after_start_vm(&self) -> Result<()> { let mut inner = self.inner.write().await; inner.setup_after_start_vm().await diff --git a/src/runtime-rs/crates/resource/src/manager_inner.rs b/src/runtime-rs/crates/resource/src/manager_inner.rs index 567af5bac..f4e17ce74 100644 --- a/src/runtime-rs/crates/resource/src/manager_inner.rs +++ b/src/runtime-rs/crates/resource/src/manager_inner.rs @@ -121,6 +121,11 @@ impl ResourceManagerInner { .await .context("failed to handle network")?; } + ResourceConfig::VmRootfs(r) => { + do_handle_device(&self.device_manager, &DeviceConfig::BlockCfg(r)) + .await + .context("do handle device failed.")?; + } }; } @@ -141,15 +146,13 @@ impl ResourceManagerInner { // but it is not in netns. So, the previous thread would still remain in the pod netns. // The solution is to block the future on the current thread, it is enabled by spawn an os thread, create a // tokio runtime, and block the task on it. 
- let hypervisor = self.hypervisor.clone(); let device_manager = self.device_manager.clone(); let network = thread::spawn(move || -> Result> { let rt = runtime::Builder::new_current_thread().enable_io().build()?; let d = rt .block_on(network::new(&network_config, device_manager)) .context("new network")?; - rt.block_on(d.setup(hypervisor.as_ref())) - .context("setup network")?; + rt.block_on(d.setup()).context("setup network")?; Ok(d) }) .join() diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs index 92662a0d0..0a7b545f2 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/endpoints_test.rs @@ -6,11 +6,12 @@ #[cfg(test)] mod tests { - use anyhow::Context; + use std::{fs, path::Path, sync::Arc}; + + use anyhow::{anyhow, Context, Result}; use netlink_packet_route::MACVLAN_MODE_PRIVATE; use scopeguard::defer; - - use std::sync::Arc; + use tokio::sync::RwLock; use crate::network::{ endpoint::{IPVlanEndpoint, MacVlanEndpoint, VlanEndpoint}, @@ -22,6 +23,41 @@ mod tests { network_pair::{NetworkInterface, NetworkPair, TapInterface}, utils::link::net_test_utils::delete_link, }; + use hypervisor::{device::device_manager::DeviceManager, qemu::Qemu}; + use kata_types::config::{QemuConfig, TomlConfig}; + + async fn get_device_manager() -> Result>> { + let path = env!("CARGO_MANIFEST_DIR"); + let path = Path::new(path) + .join("../../../libs/kata-types/tests/texture/configuration-anno-0.toml"); + + let content = fs::read_to_string(path).context("read configuration failed")?; + // just for test, use x/kata-types/tests/texture/configuration-anno-0.toml as + // the test configuration.toml which is for qemu. 
+ let hypervisor_name: &str = "qemu"; + + let qemu = QemuConfig::new(); + qemu.register(); + + let toml_config = TomlConfig::load(&content).context("load toml config failed")?; + let hypervisor_config = toml_config + .hypervisor + .get(hypervisor_name) + .ok_or_else(|| anyhow!("failed to get hypervisor for {}", &hypervisor_name))?; + + let mut hypervisor = Qemu::new(); + hypervisor + .set_hypervisor_config(hypervisor_config.clone()) + .await; + + let dm = Arc::new(RwLock::new( + DeviceManager::new(Arc::new(hypervisor)) + .await + .context("device manager")?, + )); + + Ok(dm) + } // this unit test tests the integrity of MacVlanEndpoint::new() #[actix_rt::test] @@ -33,6 +69,10 @@ mod tests { let dummy_name = format!("dummy{}", idx); let vlanid = 123; + let dm = get_device_manager().await; + assert!(dm.is_ok()); + let d = dm.unwrap(); + if let Ok((conn, handle, _)) = rtnetlink::new_connection().context("failed to create netlink connection") { @@ -63,11 +103,12 @@ mod tests { .await .context("failed to create manual veth pair") { - if let Ok(mut result) = VlanEndpoint::new(&handle, "", idx, 5) + if let Ok(mut result) = VlanEndpoint::new(&d, &handle, "", idx, 5) .await .context("failed to create new ipvlan endpoint") { let manual = VlanEndpoint { + d, net_pair: NetworkPair { tap: TapInterface { id: String::from("uniqueTestID_kata"), @@ -144,6 +185,9 @@ mod tests { let tap_iface_name = format!("tap{}_kata", idx); // create by NetworkPair::new() let model_str = TC_FILTER_NET_MODEL_STR; let dummy_name = format!("dummy{}", idx); + let dm = get_device_manager().await; + assert!(dm.is_ok()); + let d = dm.unwrap(); if let Ok((conn, handle, _)) = rtnetlink::new_connection().context("failed to create netlink connection") @@ -180,6 +224,7 @@ mod tests { { // model here does not matter, could be any of supported models if let Ok(mut result) = MacVlanEndpoint::new( + &d, &handle, manual_macvlan_iface_name.clone().as_str(), idx, @@ -190,6 +235,7 @@ mod tests { .context("failed to 
create new macvlan endpoint") { let manual = MacVlanEndpoint { + d, net_pair: NetworkPair { tap: TapInterface { id: String::from("uniqueTestID_kata"), @@ -267,6 +313,9 @@ mod tests { let mac_addr = String::from("02:00:CA:FE:00:04"); let manual_virt_iface_name = format!("eth{}", idx); let tap_iface_name = format!("tap{}_kata", idx); // create by kata + let dm = get_device_manager().await; + assert!(dm.is_ok()); + let d = dm.unwrap(); if let Ok((conn, handle, _)) = rtnetlink::new_connection().context("failed to create netlink connection") @@ -286,11 +335,12 @@ mod tests { .await .context("failed to create manual veth pair") { - if let Ok(mut result) = IPVlanEndpoint::new(&handle, "", idx, 5) + if let Ok(mut result) = IPVlanEndpoint::new(&d, &handle, "", idx, 5) .await .context("failed to create new ipvlan endpoint") { let manual = IPVlanEndpoint { + d, net_pair: NetworkPair { tap: TapInterface { id: String::from("uniqueTestID_kata"), diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs index a8f08088b..7039275e8 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/ipvlan_endpoint.rs @@ -4,37 +4,54 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::{self, Error}; +use std::{ + io::{self, Error}, + sync::Arc, +}; -use super::endpoint_persist::{EndpointState, IpVlanEndpointState}; use anyhow::{Context, Result}; use async_trait::async_trait; -use hypervisor::device::DeviceType; -use hypervisor::NetworkDevice; +use tokio::sync::RwLock; -use super::Endpoint; -use crate::network::network_model::TC_FILTER_NET_MODEL_STR; -use crate::network::{utils, NetworkPair}; -use hypervisor::{device::driver::NetworkConfig, Hypervisor}; +use hypervisor::{ + device::{ + device_manager::{do_handle_device, DeviceManager}, + driver::NetworkConfig, + DeviceConfig, DeviceType, + }, + Hypervisor, 
NetworkDevice, +}; + +use super::{ + endpoint_persist::{EndpointState, IpVlanEndpointState}, + Endpoint, +}; +use crate::network::{network_model::TC_FILTER_NET_MODEL_STR, utils, NetworkPair}; // IPVlanEndpoint is the endpoint bridged to VM #[derive(Debug)] pub struct IPVlanEndpoint { pub(crate) net_pair: NetworkPair, + pub(crate) d: Arc>, } impl IPVlanEndpoint { pub async fn new( + d: &Arc>, handle: &rtnetlink::Handle, name: &str, idx: u32, queues: usize, ) -> Result { - // tc filter network model is the only one works for ipvlan + // tc filter network model is the only for ipvlan let net_pair = NetworkPair::new(handle, idx, name, TC_FILTER_NET_MODEL_STR, queues) .await .context("error creating new NetworkPair")?; - Ok(IPVlanEndpoint { net_pair }) + + Ok(IPVlanEndpoint { + net_pair, + d: d.clone(), + }) } fn get_network_config(&self) -> Result { @@ -45,9 +62,12 @@ impl IPVlanEndpoint { format!("hard_addr {}", &iface.hard_addr), ) })?; + Ok(NetworkConfig { host_dev_name: iface.name.clone(), + virt_iface_name: self.net_pair.virt_iface.name.clone(), guest_mac: Some(guest_mac), + ..Default::default() }) } } @@ -62,18 +82,16 @@ impl Endpoint for IPVlanEndpoint { self.net_pair.tap.tap_iface.hard_addr.clone() } - async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + async fn attach(&self) -> Result<()> { self.net_pair .add_network_model() .await .context("error adding network model")?; + let config = self.get_network_config().context("get network config")?; - h.add_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), - config, - })) - .await - .context("error adding device by hypervisor")?; + do_handle_device(&self.d, &DeviceConfig::NetworkCfg(config)) + .await + .context("do handle network IPVlan endpoint device failed.")?; Ok(()) } @@ -86,12 +104,13 @@ impl Endpoint for IPVlanEndpoint { let config = self .get_network_config() .context("error getting network config")?; + h.remove_device(DeviceType::Network(NetworkDevice { - id: 
self.net_pair.virt_iface.name.clone(), config, + ..Default::default() })) .await - .context("error removing device by hypervisor")?; + .context("remove IPVlan endpoint device by hypervisor failed.")?; Ok(()) } diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs index 13716e877..ad390973f 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/macvlan_endpoint.rs @@ -4,24 +4,39 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::{self, Error}; +use std::{ + io::{self, Error}, + sync::Arc, +}; -use super::endpoint_persist::{EndpointState, MacvlanEndpointState}; -use super::Endpoint; -use crate::network::{utils, NetworkPair}; use anyhow::{Context, Result}; use async_trait::async_trait; -use hypervisor::device::DeviceType; -use hypervisor::NetworkDevice; -use hypervisor::{device::driver::NetworkConfig, Hypervisor}; +use tokio::sync::RwLock; + +use hypervisor::{ + device::{ + device_manager::{do_handle_device, DeviceManager}, + driver::NetworkConfig, + DeviceConfig, DeviceType, + }, + Hypervisor, NetworkDevice, +}; + +use super::{ + endpoint_persist::{EndpointState, MacvlanEndpointState}, + Endpoint, +}; +use crate::network::{utils, NetworkPair}; #[derive(Debug)] pub struct MacVlanEndpoint { pub(crate) net_pair: NetworkPair, + pub(crate) d: Arc>, } impl MacVlanEndpoint { pub async fn new( + d: &Arc>, handle: &rtnetlink::Handle, name: &str, idx: u32, @@ -31,7 +46,11 @@ impl MacVlanEndpoint { let net_pair = NetworkPair::new(handle, idx, name, model, queues) .await .context("error creating new networkInterfacePair")?; - Ok(MacVlanEndpoint { net_pair }) + + Ok(MacVlanEndpoint { + net_pair, + d: d.clone(), + }) } fn get_network_config(&self) -> Result { @@ -42,9 +61,12 @@ impl MacVlanEndpoint { format!("hard_addr {}", &iface.hard_addr), ) })?; + Ok(NetworkConfig { 
host_dev_name: iface.name.clone(), + virt_iface_name: self.net_pair.virt_iface.name.clone(), guest_mac: Some(guest_mac), + ..Default::default() }) } } @@ -59,18 +81,16 @@ impl Endpoint for MacVlanEndpoint { self.net_pair.tap.tap_iface.hard_addr.clone() } - async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + async fn attach(&self) -> Result<()> { self.net_pair .add_network_model() .await .context("add network model")?; + let config = self.get_network_config().context("get network config")?; - h.add_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), - config, - })) - .await - .context("error adding device by hypervisor")?; + do_handle_device(&self.d, &DeviceConfig::NetworkCfg(config)) + .await + .context("do handle network MacVlan endpoint device failed.")?; Ok(()) } @@ -80,13 +100,14 @@ impl Endpoint for MacVlanEndpoint { .del_network_model() .await .context("del network model")?; + let config = self.get_network_config().context("get network config")?; h.remove_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), config, + ..Default::default() })) .await - .context("error removing device by hypervisor")?; + .context("remove MacVlan endpoint device by hypervisor failed.")?; Ok(()) } diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs b/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs index d8a6fadac..2efd0bc34 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/mod.rs @@ -27,7 +27,7 @@ use super::EndpointState; pub trait Endpoint: std::fmt::Debug + Send + Sync { async fn name(&self) -> String; async fn hardware_addr(&self) -> String; - async fn attach(&self, hypervisor: &dyn Hypervisor) -> Result<()>; + async fn attach(&self) -> Result<()>; async fn detach(&self, hypervisor: &dyn Hypervisor) -> Result<()>; async fn save(&self) -> Option; } diff --git 
a/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs index 9a3a4d498..9bb1dbbcf 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/physical_endpoint.rs @@ -99,7 +99,7 @@ impl Endpoint for PhysicalEndpoint { self.hard_addr.clone() } - async fn attach(&self, _hypervisor: &dyn Hypervisor) -> Result<()> { + async fn attach(&self) -> Result<()> { // bind physical interface from host driver and bind to vfio driver::bind_device_to_vfio( &self.bdf, diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs b/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs index 4415e6d29..b24b5cf31 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/veth_endpoint.rs @@ -4,24 +4,39 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::{self, Error}; +use std::{ + io::{self, Error}, + sync::Arc, +}; -use super::endpoint_persist::{EndpointState, VethEndpointState}; -use super::Endpoint; -use crate::network::{utils, NetworkPair}; use anyhow::{Context, Result}; use async_trait::async_trait; -use hypervisor::device::DeviceType; -use hypervisor::NetworkDevice; -use hypervisor::{device::driver::NetworkConfig, Hypervisor}; +use tokio::sync::RwLock; + +use hypervisor::{ + device::{ + device_manager::{do_handle_device, DeviceManager}, + driver::NetworkConfig, + DeviceConfig, DeviceType, + }, + Hypervisor, NetworkDevice, +}; + +use super::{ + endpoint_persist::{EndpointState, VethEndpointState}, + Endpoint, +}; +use crate::network::{utils, NetworkPair}; #[derive(Debug)] pub struct VethEndpoint { - net_pair: NetworkPair, + pub(crate) net_pair: NetworkPair, + pub(crate) d: Arc>, } impl VethEndpoint { pub async fn new( + d: &Arc>, handle: &rtnetlink::Handle, name: &str, idx: u32, @@ 
-30,8 +45,12 @@ impl VethEndpoint { ) -> Result { let net_pair = NetworkPair::new(handle, idx, name, model, queues) .await - .context("new networkInterfacePair")?; - Ok(VethEndpoint { net_pair }) + .context("new network interface pair failed.")?; + + Ok(VethEndpoint { + net_pair, + d: d.clone(), + }) } fn get_network_config(&self) -> Result { @@ -42,9 +61,12 @@ impl VethEndpoint { format!("hard_addr {}", &iface.hard_addr), ) })?; + Ok(NetworkConfig { host_dev_name: iface.name.clone(), + virt_iface_name: self.net_pair.virt_iface.name.clone(), guest_mac: Some(guest_mac), + ..Default::default() }) } } @@ -59,18 +81,17 @@ impl Endpoint for VethEndpoint { self.net_pair.tap.tap_iface.hard_addr.clone() } - async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + async fn attach(&self) -> Result<()> { self.net_pair .add_network_model() .await .context("add network model")?; + let config = self.get_network_config().context("get network config")?; - h.add_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), - config, - })) - .await - .context("error adding device by hypervisor")?; + do_handle_device(&self.d, &DeviceConfig::NetworkCfg(config)) + .await + .context("do handle network Veth endpoint device failed.")?; + Ok(()) } @@ -78,16 +99,19 @@ impl Endpoint for VethEndpoint { self.net_pair .del_network_model() .await - .context("del network model")?; + .context("del network model failed.")?; + let config = self.get_network_config().context("get network config")?; h.remove_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), config, + ..Default::default() })) .await - .context("error removing device by hypervisor")?; + .context("remove Veth endpoint device by hypervisor failed.")?; + Ok(()) } + async fn save(&self) -> Option { Some(EndpointState { veth_endpoint: Some(VethEndpointState { diff --git a/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs 
b/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs index 6aca674be..bfc852d39 100644 --- a/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs +++ b/src/runtime-rs/crates/resource/src/network/endpoint/vlan_endpoint.rs @@ -4,25 +4,39 @@ // SPDX-License-Identifier: Apache-2.0 // -use std::io::{self, Error}; +use std::{ + io::{self, Error}, + sync::Arc, +}; use anyhow::{Context, Result}; use async_trait::async_trait; -use hypervisor::device::DeviceType; -use hypervisor::NetworkDevice; +use tokio::sync::RwLock; + +use hypervisor::{ + device::{ + device_manager::{do_handle_device, DeviceManager}, + driver::NetworkConfig, + DeviceConfig, DeviceType, + }, + Hypervisor, NetworkDevice, +}; + +use super::{ + endpoint_persist::{EndpointState, VlanEndpointState}, + Endpoint, +}; +use crate::network::{network_model::TC_FILTER_NET_MODEL_STR, utils, NetworkPair}; -use super::endpoint_persist::{EndpointState, VlanEndpointState}; -use super::Endpoint; -use crate::network::network_model::TC_FILTER_NET_MODEL_STR; -use crate::network::{utils, NetworkPair}; -use hypervisor::{device::driver::NetworkConfig, Hypervisor}; #[derive(Debug)] pub struct VlanEndpoint { pub(crate) net_pair: NetworkPair, + pub(crate) d: Arc>, } impl VlanEndpoint { pub async fn new( + d: &Arc>, handle: &rtnetlink::Handle, name: &str, idx: u32, @@ -30,8 +44,12 @@ impl VlanEndpoint { ) -> Result { let net_pair = NetworkPair::new(handle, idx, name, TC_FILTER_NET_MODEL_STR, queues) .await - .context("error creating networkInterfacePair")?; - Ok(VlanEndpoint { net_pair }) + .context("new network interface pair failed.")?; + + Ok(VlanEndpoint { + net_pair, + d: d.clone(), + }) } fn get_network_config(&self) -> Result { @@ -42,9 +60,12 @@ impl VlanEndpoint { format!("hard_addr {}", &iface.hard_addr), ) })?; + Ok(NetworkConfig { host_dev_name: iface.name.clone(), + virt_iface_name: self.net_pair.virt_iface.name.clone(), guest_mac: Some(guest_mac), + ..Default::default() }) } } @@ 
-59,18 +80,16 @@ impl Endpoint for VlanEndpoint { self.net_pair.tap.tap_iface.hard_addr.clone() } - async fn attach(&self, h: &dyn Hypervisor) -> Result<()> { + async fn attach(&self) -> Result<()> { self.net_pair .add_network_model() .await - .context("error adding network model")?; + .context("add network model failed.")?; + let config = self.get_network_config().context("get network config")?; - h.add_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), - config, - })) - .await - .context("error adding device by hypervisor")?; + do_handle_device(&self.d, &DeviceConfig::NetworkCfg(config)) + .await + .context("do handle network Vlan endpoint device failed.")?; Ok(()) } @@ -79,16 +98,17 @@ impl Endpoint for VlanEndpoint { self.net_pair .del_network_model() .await - .context("error deleting network model")?; + .context("delete network model failed.")?; + let config = self .get_network_config() - .context("error getting network config")?; + .context("get network config failed.")?; h.remove_device(DeviceType::Network(NetworkDevice { - id: self.net_pair.virt_iface.name.clone(), config, + ..Default::default() })) .await - .context("error removing device by hypervisor")?; + .context("remove Vlan endpoint device by hypervisor failed.")?; Ok(()) } diff --git a/src/runtime-rs/crates/resource/src/network/mod.rs b/src/runtime-rs/crates/resource/src/network/mod.rs index c922e817f..ed9a9e4f9 100644 --- a/src/runtime-rs/crates/resource/src/network/mod.rs +++ b/src/runtime-rs/crates/resource/src/network/mod.rs @@ -35,7 +35,7 @@ pub enum NetworkConfig { #[async_trait] pub trait Network: Send + Sync { - async fn setup(&self, h: &dyn Hypervisor) -> Result<()>; + async fn setup(&self) -> Result<()>; async fn interfaces(&self) -> Result>; async fn routes(&self) -> Result>; async fn neighs(&self) -> Result>; diff --git a/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs 
b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs index e8d549491..d6dc0a82b 100644 --- a/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs +++ b/src/runtime-rs/crates/resource/src/network/network_info/network_info_from_link.rs @@ -31,6 +31,7 @@ impl NetworkInfoFromLink { pub async fn new( handle: &rtnetlink::Handle, link: &dyn link::Link, + addrs: Vec, hw_addr: &str, ) -> Result { let attrs = link.attrs(); @@ -40,9 +41,7 @@ impl NetworkInfoFromLink { interface: Interface { device: name.clone(), name: name.clone(), - ip_addresses: handle_addresses(handle, attrs) - .await - .context("handle addresses")?, + ip_addresses: addrs.clone(), mtu: attrs.mtu as u64, hw_addr: hw_addr.to_string(), pci_addr: Default::default(), @@ -59,7 +58,10 @@ impl NetworkInfoFromLink { } } -async fn handle_addresses(handle: &rtnetlink::Handle, attrs: &LinkAttrs) -> Result> { +pub async fn handle_addresses( + handle: &rtnetlink::Handle, + attrs: &LinkAttrs, +) -> Result> { let mut addr_msg_list = handle .address() .get() diff --git a/src/runtime-rs/crates/resource/src/network/network_pair.rs b/src/runtime-rs/crates/resource/src/network/network_pair.rs index 1bee220fe..bfb0623fe 100644 --- a/src/runtime-rs/crates/resource/src/network/network_pair.rs +++ b/src/runtime-rs/crates/resource/src/network/network_pair.rs @@ -39,6 +39,7 @@ pub struct NetworkPair { pub model: Arc, pub network_qos: bool, } + impl NetworkPair { pub(crate) async fn new( handle: &rtnetlink::Handle, diff --git a/src/runtime-rs/crates/resource/src/network/network_with_netns.rs b/src/runtime-rs/crates/resource/src/network/network_with_netns.rs index 16faa88b3..81c8c3939 100644 --- a/src/runtime-rs/crates/resource/src/network/network_with_netns.rs +++ b/src/runtime-rs/crates/resource/src/network/network_with_netns.rs @@ -26,7 +26,7 @@ use super::{ Endpoint, IPVlanEndpoint, MacVlanEndpoint, PhysicalEndpoint, VethEndpoint, VlanEndpoint, }, 
network_entity::NetworkEntity, - network_info::network_info_from_link::NetworkInfoFromLink, + network_info::network_info_from_link::{handle_addresses, NetworkInfoFromLink}, utils::{link, netns}, Network, }; @@ -49,7 +49,13 @@ struct NetworkWithNetnsInner { impl NetworkWithNetnsInner { async fn new(config: &NetworkWithNetNsConfig, d: Arc>) -> Result { let entity_list = if config.netns_path.is_empty() { - warn!(sl!(), "skip to scan for empty netns"); + warn!(sl!(), "Skip to scan network for empty netns"); + vec![] + } else if config.network_model.as_str() == "none" { + warn!( + sl!(), + "Skip to scan network from netns due to the none network model" + ); vec![] } else { // get endpoint @@ -82,11 +88,11 @@ impl NetworkWithNetns { #[async_trait] impl Network for NetworkWithNetns { - async fn setup(&self, h: &dyn Hypervisor) -> Result<()> { + async fn setup(&self) -> Result<()> { let inner = self.inner.read().await; let _netns_guard = netns::NetnsGuard::new(&inner.netns_path).context("net netns guard")?; for e in &inner.entity_list { - e.endpoint.attach(h).await.context("attach")?; + e.endpoint.attach().await.context("attach")?; } Ok(()) } @@ -183,9 +189,18 @@ async fn get_entity_from_netns( continue; } + let ip_addresses = handle_addresses(&handle, attrs) + .await + .context("handle addresses")?; + // Ignore unconfigured network interfaces. These are either base tunnel devices that are not namespaced + // like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly setup interfaces. 
+ if ip_addresses.is_empty() { + continue; + } + let idx = idx.fetch_add(1, Ordering::Relaxed); let (endpoint, network_info) = - create_endpoint(&handle, link.as_ref(), idx, config, d.clone()) + create_endpoint(&handle, link.as_ref(), ip_addresses, idx, config, d.clone()) .await .context("create endpoint")?; @@ -198,6 +213,7 @@ async fn get_entity_from_netns( async fn create_endpoint( handle: &rtnetlink::Handle, link: &dyn link::Link, + addrs: Vec, idx: u32, config: &NetworkWithNetNsConfig, d: Arc>, @@ -225,6 +241,7 @@ async fn create_endpoint( match link_type { "veth" => { let ret = VethEndpoint::new( + &d, handle, &attrs.name, idx, @@ -236,19 +253,20 @@ async fn create_endpoint( Arc::new(ret) } "vlan" => { - let ret = VlanEndpoint::new(handle, &attrs.name, idx, config.queues) + let ret = VlanEndpoint::new(&d, handle, &attrs.name, idx, config.queues) .await .context("vlan endpoint")?; Arc::new(ret) } "ipvlan" => { - let ret = IPVlanEndpoint::new(handle, &attrs.name, idx, config.queues) + let ret = IPVlanEndpoint::new(&d, handle, &attrs.name, idx, config.queues) .await .context("ipvlan endpoint")?; Arc::new(ret) } "macvlan" => { let ret = MacVlanEndpoint::new( + &d, handle, &attrs.name, idx, @@ -264,7 +282,7 @@ async fn create_endpoint( }; let network_info = Arc::new( - NetworkInfoFromLink::new(handle, link, &endpoint.hardware_addr().await) + NetworkInfoFromLink::new(handle, link, addrs, &endpoint.hardware_addr().await) .await .context("network info from link")?, ); diff --git a/src/runtime-rs/crates/resource/src/rootfs/block_rootfs.rs b/src/runtime-rs/crates/resource/src/rootfs/block_rootfs.rs index bcb31d957..33b0ff216 100644 --- a/src/runtime-rs/crates/resource/src/rootfs/block_rootfs.rs +++ b/src/runtime-rs/crates/resource/src/rootfs/block_rootfs.rs @@ -11,7 +11,7 @@ use anyhow::{anyhow, Context, Result}; use async_trait::async_trait; use hypervisor::{ device::{ - device_manager::{do_handle_device, DeviceManager}, + device_manager::{do_handle_device, 
get_block_driver, DeviceManager}, DeviceConfig, DeviceType, }, BlockConfig, @@ -43,9 +43,12 @@ impl BlockRootfs { fs::create_dir_all(&host_path) .map_err(|e| anyhow!("failed to create rootfs dir {}: {:?}", host_path, e))?; + let block_driver = get_block_driver(d).await; + let block_device_config = &mut BlockConfig { major: stat::major(dev_id) as i64, minor: stat::minor(dev_id) as i64, + driver_option: block_driver, ..Default::default() }; diff --git a/src/runtime-rs/crates/resource/src/share_fs/sandbox_bind_mounts.rs b/src/runtime-rs/crates/resource/src/share_fs/sandbox_bind_mounts.rs index 13bd28103..8c166573d 100644 --- a/src/runtime-rs/crates/resource/src/share_fs/sandbox_bind_mounts.rs +++ b/src/runtime-rs/crates/resource/src/share_fs/sandbox_bind_mounts.rs @@ -23,6 +23,7 @@ use anyhow::{anyhow, Context, Result}; use super::utils::{do_get_host_path, mkdir_with_permissions}; use kata_sys_util::{fs::get_base_name, mount}; use kata_types::mount::{SANDBOX_BIND_MOUNTS_DIR, SANDBOX_BIND_MOUNTS_RO, SANDBOX_BIND_MOUNTS_RW}; +use nix::mount::MsFlags; #[derive(Clone, Default, Debug)] pub struct SandboxBindMounts { @@ -101,14 +102,15 @@ impl SandboxBindMounts { // mount -o bind,ro host_shared mount_dest // host_shared: ${bindmount} - mount::bind_mount_unchecked(Path::new(bindmount), &mount_dest, true).map_err(|e| { - for p in &mounted_list { - nix::mount::umount(p).unwrap_or_else(|x| { - format!("do umount failed: {:?}", x); - }); - } - e - })?; + mount::bind_mount_unchecked(Path::new(bindmount), &mount_dest, true, MsFlags::MS_SLAVE) + .map_err(|e| { + for p in &mounted_list { + nix::mount::umount(p).unwrap_or_else(|x| { + format!("do umount failed: {:?}", x); + }); + } + e + })?; // default sandbox bind mounts mode is ro. 
if bindmount_mode == SANDBOX_BIND_MOUNTS_RO { diff --git a/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs index 9a5676bdf..a1a1ba25c 100644 --- a/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs +++ b/src/runtime-rs/crates/resource/src/share_fs/share_virtio_fs.rs @@ -18,6 +18,7 @@ use hypervisor::{ Hypervisor, ShareFsDeviceConfig, }; use kata_sys_util::mount; +use nix::mount::MsFlags; use super::{utils, PASSTHROUGH_FS_DIR}; @@ -45,7 +46,7 @@ pub(crate) async fn prepare_virtiofs( let host_rw_dest = utils::get_host_rw_shared_path(id); utils::ensure_dir_exist(&host_rw_dest)?; - mount::bind_mount_unchecked(&host_rw_dest, &host_ro_dest, true) + mount::bind_mount_unchecked(&host_rw_dest, &host_ro_dest, true, MsFlags::MS_SLAVE) .context("bind mount shared_fs directory")?; let share_fs_device = ShareFsDevice { diff --git a/src/runtime-rs/crates/resource/src/share_fs/utils.rs b/src/runtime-rs/crates/resource/src/share_fs/utils.rs index 3300c74ef..0b019e9c5 100644 --- a/src/runtime-rs/crates/resource/src/share_fs/utils.rs +++ b/src/runtime-rs/crates/resource/src/share_fs/utils.rs @@ -11,6 +11,7 @@ use std::{ use anyhow::Result; use kata_sys_util::mount; +use nix::mount::MsFlags; use super::*; @@ -45,7 +46,7 @@ pub(crate) fn share_to_guest( is_rafs: bool, ) -> Result { let host_dest = do_get_host_path(target, sid, cid, is_volume, false); - mount::bind_mount_unchecked(source, &host_dest, readonly) + mount::bind_mount_unchecked(source, &host_dest, readonly, MsFlags::MS_SLAVE) .with_context(|| format!("failed to bind mount {} to {}", source, &host_dest))?; // bind mount remount event is not propagated to mount subtrees, so we have diff --git a/src/runtime-rs/crates/resource/src/volume/block_volume.rs b/src/runtime-rs/crates/resource/src/volume/block_volume.rs index 69b57054a..ded162bd0 100644 --- a/src/runtime-rs/crates/resource/src/volume/block_volume.rs +++ 
b/src/runtime-rs/crates/resource/src/volume/block_volume.rs @@ -16,7 +16,7 @@ use crate::volume::utils::{ }; use hypervisor::{ device::{ - device_manager::{do_handle_device, DeviceManager}, + device_manager::{do_handle_device, get_block_driver, DeviceManager}, DeviceConfig, DeviceType, }, BlockConfig, @@ -42,6 +42,8 @@ impl BlockVolume { // default block device fs type: ext4. let mut blk_dev_fstype = DEFAULT_VOLUME_FS_TYPE.to_string(); + let block_driver = get_block_driver(d).await; + let block_device_config = match m.r#type.as_str() { KATA_MOUNT_BIND_TYPE => { let fstat = stat::stat(mnt_src).context(format!("stat {}", m.source))?; @@ -49,6 +51,7 @@ impl BlockVolume { BlockConfig { major: stat::major(fstat.st_rdev) as i64, minor: stat::minor(fstat.st_rdev) as i64, + driver_option: block_driver, ..Default::default() } } @@ -77,6 +80,7 @@ impl BlockVolume { BlockConfig { path_on_host: v.device, + driver_option: block_driver, ..Default::default() } } diff --git a/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs b/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs index 31076b8a3..f0ea125da 100644 --- a/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs +++ b/src/runtime-rs/crates/resource/src/volume/spdk_volume.rs @@ -16,7 +16,7 @@ use crate::volume::utils::{ }; use hypervisor::{ device::{ - device_manager::{do_handle_device, DeviceManager}, + device_manager::{do_handle_device, get_block_driver, DeviceManager}, DeviceConfig, DeviceType, }, VhostUserConfig, VhostUserType, @@ -73,9 +73,12 @@ impl SPDKVolume { } } + let block_driver = get_block_driver(d).await; + let mut vhu_blk_config = &mut VhostUserConfig { socket_path: device, device_type: VhostUserType::Blk("vhost-user-blk-pci".to_owned()), + driver_option: block_driver, ..Default::default() }; diff --git a/src/runtime-rs/crates/runtimes/Cargo.toml b/src/runtime-rs/crates/runtimes/Cargo.toml index f2a4ea522..1d0ff168e 100644 --- a/src/runtime-rs/crates/runtimes/Cargo.toml +++ 
b/src/runtime-rs/crates/runtimes/Cargo.toml @@ -12,6 +12,11 @@ netns-rs = "0.1.0" slog = "2.5.2" slog-scope = "4.4.0" tokio = { version = "1.28.1", features = ["rt-multi-thread"] } +tracing = "0.1.36" +tracing-opentelemetry = "0.18.0" +opentelemetry = { version = "0.18.0", features = ["rt-tokio-current-thread", "trace", "rt-tokio"] } +opentelemetry-jaeger = { version = "0.17.0", features = ["rt-tokio", "hyper_collector_client", "collector_client"] } +tracing-subscriber = { version = "0.3", features = ["registry", "std"] } hyper = { version = "0.14.20", features = ["stream", "server", "http1"] } hyperlocal = "0.8" serde_json = "1.0.88" diff --git a/src/runtime-rs/crates/runtimes/common/src/sandbox.rs b/src/runtime-rs/crates/runtimes/common/src/sandbox.rs index 1a79f23d6..9134b8c78 100644 --- a/src/runtime-rs/crates/runtimes/common/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/common/src/sandbox.rs @@ -13,6 +13,15 @@ pub struct SandboxNetworkEnv { pub network_created: bool, } +impl std::fmt::Debug for SandboxNetworkEnv { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxNetworkEnv") + .field("netns", &self.netns) + .field("network_created", &self.network_created) + .finish() + } +} + #[async_trait] pub trait Sandbox: Send + Sync { async fn start( diff --git a/src/runtime-rs/crates/runtimes/src/lib.rs b/src/runtime-rs/crates/runtimes/src/lib.rs index 867c8ef9e..63357df01 100644 --- a/src/runtime-rs/crates/runtimes/src/lib.rs +++ b/src/runtime-rs/crates/runtimes/src/lib.rs @@ -13,3 +13,4 @@ pub mod manager; pub use manager::RuntimeHandlerManager; pub use shim_interface; mod shim_mgmt; +pub mod tracer; diff --git a/src/runtime-rs/crates/runtimes/src/manager.rs b/src/runtime-rs/crates/runtimes/src/manager.rs index d05e1961f..1244b6835 100644 --- a/src/runtime-rs/crates/runtimes/src/manager.rs +++ b/src/runtime-rs/crates/runtimes/src/manager.rs @@ -24,7 +24,8 @@ use persist::sandbox_persist::Persist; use 
resource::{cpu_mem::initial_size::InitialSizeManager, network::generate_netns_name}; use shim_interface::shim_mgmt::ERR_NO_SHIM_SERVER; use tokio::fs; -use tokio::sync::{mpsc::Sender, RwLock}; +use tokio::sync::{mpsc::Sender, Mutex, RwLock}; +use tracing::instrument; #[cfg(feature = "virt")] use virt_container::{ sandbox::{SandboxRestoreArgs, VirtSandbox}, @@ -34,23 +35,39 @@ use virt_container::{ #[cfg(feature = "wasm")] use wasm_container::WasmContainer; -use crate::shim_mgmt::server::MgmtServer; +use crate::{ + shim_mgmt::server::MgmtServer, + tracer::{KataTracer, ROOTSPAN}, +}; struct RuntimeHandlerManagerInner { id: String, msg_sender: Sender, + kata_tracer: Arc>, runtime_instance: Option>, } +impl std::fmt::Debug for RuntimeHandlerManagerInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RuntimeHandlerManagerInner") + .field("id", &self.id) + .field("msg_sender", &self.msg_sender) + .finish() + } +} + impl RuntimeHandlerManagerInner { fn new(id: &str, msg_sender: Sender) -> Result { + let tracer = KataTracer::new(); Ok(Self { id: id.to_string(), msg_sender, + kata_tracer: Arc::new(Mutex::new(tracer)), runtime_instance: None, }) } + #[instrument] async fn init_runtime_handler( &mut self, spec: &oci::Spec, @@ -72,10 +89,23 @@ impl RuntimeHandlerManagerInner { _ => return Err(anyhow!("Unsupported runtime: {}", &config.runtime.name)), }; let runtime_instance = runtime_handler - .new_instance(&self.id, self.msg_sender.clone(), config) + .new_instance(&self.id, self.msg_sender.clone(), config.clone()) .await .context("new runtime instance")?; + // initilize the trace subscriber + if config.runtime.enable_tracing { + let mut tracer = self.kata_tracer.lock().await; + if let Err(e) = tracer.trace_setup( + &self.id, + &config.runtime.jaeger_endpoint, + &config.runtime.jaeger_user, + &config.runtime.jaeger_password, + ) { + warn!(sl!(), "failed to setup tracing, {:?}", e); + } + } + // start sandbox runtime_instance .sandbox 
@@ -86,6 +116,7 @@ impl RuntimeHandlerManagerInner { Ok(()) } + #[instrument] async fn try_init( &mut self, spec: &oci::Spec, @@ -149,6 +180,7 @@ impl RuntimeHandlerManagerInner { netns, network_created, }; + self.init_runtime_handler(spec, state, network_env, dns, Arc::new(config)) .await .context("init runtime handler")?; @@ -171,14 +203,25 @@ impl RuntimeHandlerManagerInner { fn get_runtime_instance(&self) -> Option> { self.runtime_instance.clone() } + + fn get_kata_tracer(&self) -> Arc> { + self.kata_tracer.clone() + } } pub struct RuntimeHandlerManager { inner: Arc>, } +// todo: a more detailed impl for fmt::Debug +impl std::fmt::Debug for RuntimeHandlerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RuntimeHandlerManager").finish() + } +} + impl RuntimeHandlerManager { - pub async fn new(id: &str, msg_sender: Sender) -> Result { + pub fn new(id: &str, msg_sender: Sender) -> Result { Ok(Self { inner: Arc::new(RwLock::new(RuntimeHandlerManagerInner::new( id, msg_sender, @@ -243,6 +286,12 @@ impl RuntimeHandlerManager { .ok_or_else(|| anyhow!("runtime not ready")) } + async fn get_kata_tracer(&self) -> Result>> { + let inner = self.inner.read().await; + Ok(inner.get_kata_tracer()) + } + + #[instrument] async fn try_init_runtime_instance( &self, spec: &oci::Spec, @@ -253,6 +302,7 @@ impl RuntimeHandlerManager { inner.try_init(spec, state, options).await } + #[instrument(parent = &*(ROOTSPAN))] pub async fn handler_message(&self, req: Request) -> Result { if let Request::CreateContainer(container_config) = req { // get oci spec @@ -291,6 +341,7 @@ impl RuntimeHandlerManager { } } + #[instrument(parent = &(*ROOTSPAN))] pub async fn handler_request(&self, req: Request) -> Result { let instance = self .get_runtime_instance() @@ -320,6 +371,11 @@ impl RuntimeHandlerManager { Request::ShutdownContainer(req) => { if cm.need_shutdown_sandbox(&req).await { sandbox.shutdown().await.context("do shutdown")?; + + // stop the 
tracer collector + let kata_tracer = self.get_kata_tracer().await.context("get kata tracer")?; + let tracer = kata_tracer.lock().await; + tracer.trace_end(); } Ok(Response::ShutdownContainer) } @@ -388,6 +444,7 @@ impl RuntimeHandlerManager { /// 3. shimv2 create task option /// 4. If above three are not set, then get default path from DEFAULT_RUNTIME_CONFIGURATIONS /// in kata-containers/src/libs/kata-types/src/config/default.rs, in array order. +#[instrument] fn load_config(spec: &oci::Spec, option: &Option>) -> Result { const KATA_CONF_FILE: &str = "KATA_CONF_FILE"; let annotation = Annotation::new(spec.annotations.clone()); diff --git a/src/runtime-rs/crates/runtimes/src/tracer.rs b/src/runtime-rs/crates/runtimes/src/tracer.rs new file mode 100644 index 000000000..e34c55606 --- /dev/null +++ b/src/runtime-rs/crates/runtimes/src/tracer.rs @@ -0,0 +1,169 @@ +// Copyright (c) 2019-2022 Alibaba Cloud +// Copyright (c) 2019-2022 Ant Group +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::cmp::min; +use std::sync::Arc; + +use anyhow::Result; +use lazy_static::lazy_static; +use opentelemetry::global; +use opentelemetry::runtime::Tokio; +use tracing::{span, subscriber::NoSubscriber, Span, Subscriber}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::Registry; + +const DEFAULT_JAEGER_URL: &str = "http://localhost:14268/api/traces"; + +lazy_static! { + /// The ROOTSPAN is a phantom span that is running by calling [`trace_enter_root()`] at the background + /// once the configuration is read and config.runtime.enable_tracing is enabled + /// The ROOTSPAN exits by calling [`trace_exit_root()`] on shutdown request sent from containerd + /// + /// NOTE: + /// This allows other threads which are not directly running under some spans to be tracked easily + /// within the entire sandbox's lifetime. 
+ /// To do this, you just need to add attribute #[instrment(parent=&(*ROOTSPAN))] + pub static ref ROOTSPAN: Span = span!(tracing::Level::TRACE, "root-span"); +} + +/// The tracer wrapper for kata-containers +/// The fields and member methods should ALWAYS be PRIVATE and be exposed in a safe +/// way to other modules +unsafe impl Send for KataTracer {} +unsafe impl Sync for KataTracer {} +pub struct KataTracer { + subscriber: Arc, + enabled: bool, +} + +impl Default for KataTracer { + fn default() -> Self { + Self::new() + } +} + +impl KataTracer { + /// Constructor of KataTracer, this is a dummy implementation for static initialization + pub fn new() -> Self { + Self { + subscriber: Arc::new(NoSubscriber::default()), + enabled: false, + } + } + + /// Set the tracing enabled flag + fn enable(&mut self) { + self.enabled = true; + } + + /// Return whether the tracing is enabled, enabled by [`trace_setup`] + fn enabled(&self) -> bool { + self.enabled + } + + /// Call when the tracing is enabled (set in toml configuration file) + /// This setup the subscriber, which maintains the span's information, to global and + /// inside KATA_TRACER. 
+ /// + /// Note that the span will be noop(not collected) if a invalid subscriber is set + pub fn trace_setup( + &mut self, + sid: &str, + jaeger_endpoint: &str, + jaeger_username: &str, + jaeger_password: &str, + ) -> Result<()> { + // If varify jaeger config returns an error, it means that the tracing should not be enabled + let endpoint = verify_jaeger_config(jaeger_endpoint, jaeger_username, jaeger_password)?; + + // derive a subscriber to collect span info + let tracer = opentelemetry_jaeger::new_collector_pipeline() + .with_service_name(format!("kata-sb-{}", &sid[0..min(8, sid.len())])) + .with_endpoint(endpoint) + .with_username(jaeger_username) + .with_password(jaeger_password) + .with_hyper() + .install_batch(Tokio)?; + + let layer = tracing_opentelemetry::layer().with_tracer(tracer); + + let sub = Registry::default().with(layer); + + // we use Arc to let global subscriber and katatracer to SHARE the SAME subscriber + // this is for record the global subscriber into a global variable KATA_TRACER for more usages + let subscriber = Arc::new(sub); + tracing::subscriber::set_global_default(subscriber.clone())?; + self.subscriber = subscriber; + + // enter the rootspan + self.trace_enter_root(); + + // modity the enable state, note that we have successfully enable tracing + self.enable(); + + info!(sl!(), "Tracing enabled successfully"); + Ok(()) + } + + /// Shutdown the tracer and emit the span info to jaeger agent + /// The tracing information is only partially update to jaeger agent before this function is called + pub fn trace_end(&self) { + if self.enabled() { + // exit the rootspan + self.trace_exit_root(); + + global::shutdown_tracer_provider(); + } + } + + /// Enter the global ROOTSPAN + /// This function is a hack on tracing library's guard approach, letting the span + /// to enter without using a RAII guard to exit. This function should only be called + /// once, and also in paired with [`trace_exit_root()`]. 
+ fn trace_enter_root(&self) { + self.enter_span(&ROOTSPAN); + } + + /// Exit the global ROOTSPAN + /// This should be called in paired with [`trace_enter_root()`]. + fn trace_exit_root(&self) { + self.exit_span(&ROOTSPAN); + } + + /// let the subscriber enter the span, this has to be called in pair with exit(span) + /// This function allows **cross function span** to run without span guard + fn enter_span(&self, span: &Span) { + let id: Option = span.into(); + self.subscriber.enter(&id.unwrap()); + } + + /// let the subscriber exit the span, this has to be called in pair to enter(span) + fn exit_span(&self, span: &Span) { + let id: Option = span.into(); + self.subscriber.exit(&id.unwrap()); + } +} + +/// Verifying the configuration of jaeger and setup the default value +fn verify_jaeger_config(endpoint: &str, username: &str, passwd: &str) -> Result { + if username.is_empty() && !passwd.is_empty() { + warn!( + sl!(), + "Jaeger password with empty username is not allowed, tracing is NOT enabled" + ); + return Err(anyhow::anyhow!("Empty username with non-empty password")); + } + + // set the default endpoint address, this expects a jaeger-collector running on localhost:14268 + let endpt = if endpoint.is_empty() { + DEFAULT_JAEGER_URL + } else { + endpoint + } + .to_owned(); + + Ok(endpt) +} diff --git a/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml b/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml index d648a2206..0a63195eb 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml +++ b/src/runtime-rs/crates/runtimes/virt_container/Cargo.toml @@ -24,6 +24,7 @@ tokio = { version = "1.28.1" } toml = "0.4.2" url = "2.1.1" async-std = "1.12.0" +tracing = "0.1.36" agent = { path = "../../agent" } common = { path = "../common" } diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs index fba45a784..c7e7378b1 100644 --- 
a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/container.rs @@ -95,7 +95,7 @@ impl Container { amend_spec(&mut spec, toml_config.runtime.disable_guest_seccomp).context("amend spec")?; // get mutable root from oci spec - let mut root = match spec.root.as_mut() { + let root = match spec.root.as_mut() { Some(root) => root, None => return Err(anyhow!("spec miss root field")), }; diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs index f5aa05e6c..d900632ce 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/container_manager/manager.rs @@ -24,6 +24,7 @@ use oci::Process as OCIProcess; use resource::network::NetnsGuard; use resource::ResourceManager; use tokio::sync::RwLock; +use tracing::instrument; use kata_sys_util::hooks::HookStates; @@ -38,6 +39,15 @@ pub struct VirtContainerManager { hypervisor: Arc, } +impl std::fmt::Debug for VirtContainerManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtContainerManager") + .field("sid", &self.sid) + .field("pid", &self.pid) + .finish() + } +} + impl VirtContainerManager { pub fn new( sid: &str, @@ -59,6 +69,7 @@ impl VirtContainerManager { #[async_trait] impl ContainerManager for VirtContainerManager { + #[instrument] async fn create_container(&self, config: ContainerConfig, spec: oci::Spec) -> Result { let container = Container::new( self.pid, @@ -100,6 +111,7 @@ impl ContainerManager for VirtContainerManager { Ok(PID { pid: self.pid }) } + #[instrument] async fn close_process_io(&self, process: &ContainerProcess) -> Result<()> { let containers = self.containers.read().await; let container_id = &process.container_id.to_string(); @@ -111,6 +123,7 @@ impl 
ContainerManager for VirtContainerManager { Ok(()) } + #[instrument] async fn delete_process(&self, process: &ContainerProcess) -> Result { let container_id = &process.container_id.container_id; match process.process_type { @@ -155,6 +168,7 @@ impl ContainerManager for VirtContainerManager { } } + #[instrument] async fn exec_process(&self, req: ExecProcessRequest) -> Result<()> { if req.spec_type_url.is_empty() { return Err(anyhow!("invalid type url")); @@ -180,6 +194,7 @@ impl ContainerManager for VirtContainerManager { Ok(()) } + #[instrument] async fn kill_process(&self, req: &KillRequest) -> Result<()> { let containers = self.containers.read().await; let container_id = &req.process.container_id.container_id; @@ -199,6 +214,7 @@ impl ContainerManager for VirtContainerManager { Ok(()) } + #[instrument] async fn wait_process(&self, process: &ContainerProcess) -> Result { let logger = logger_with_process(process); @@ -235,6 +251,7 @@ impl ContainerManager for VirtContainerManager { Ok(status.clone()) } + #[instrument] async fn start_process(&self, process: &ContainerProcess) -> Result { let containers = self.containers.read().await; let container_id = &process.container_id.container_id; @@ -265,6 +282,7 @@ impl ContainerManager for VirtContainerManager { Ok(PID { pid: self.pid }) } + #[instrument] async fn state_process(&self, process: &ContainerProcess) -> Result { let containers = self.containers.read().await; let container_id = &process.container_id.container_id; @@ -275,6 +293,7 @@ impl ContainerManager for VirtContainerManager { Ok(state) } + #[instrument] async fn pause_container(&self, id: &ContainerID) -> Result<()> { let containers = self.containers.read().await; let c = containers @@ -284,6 +303,7 @@ impl ContainerManager for VirtContainerManager { Ok(()) } + #[instrument] async fn resume_container(&self, id: &ContainerID) -> Result<()> { let containers = self.containers.read().await; let c = containers @@ -293,6 +313,7 @@ impl ContainerManager for 
VirtContainerManager { Ok(()) } + #[instrument] async fn resize_process_pty(&self, req: &ResizePTYRequest) -> Result<()> { let containers = self.containers.read().await; let c = containers @@ -306,6 +327,7 @@ impl ContainerManager for VirtContainerManager { Ok(()) } + #[instrument] async fn stats_container(&self, id: &ContainerID) -> Result { let containers = self.containers.read().await; let c = containers @@ -315,6 +337,7 @@ impl ContainerManager for VirtContainerManager { Ok(StatsInfo::from(stats)) } + #[instrument] async fn update_container(&self, req: UpdateRequest) -> Result<()> { let resource = serde_json::from_slice::(&req.value) .context("deserialize LinuxResource")?; @@ -326,18 +349,22 @@ impl ContainerManager for VirtContainerManager { c.update(&resource).await.context("update_container") } + #[instrument] async fn pid(&self) -> Result { Ok(PID { pid: self.pid }) } + #[instrument] async fn connect_container(&self, _id: &ContainerID) -> Result { Ok(PID { pid: self.pid }) } + #[instrument] async fn need_shutdown_sandbox(&self, req: &ShutdownRequest) -> bool { req.is_now || self.containers.read().await.is_empty() || self.sid == req.container_id } + #[instrument] async fn is_sandbox_container(&self, process: &ContainerProcess) -> bool { process.process_type == ProcessType::Container && process.container_id.container_id == self.sid diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs index 9bfc72f5e..4999ee4f9 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/lib.rs @@ -34,7 +34,11 @@ use kata_types::config::{hypervisor::HYPERVISOR_NAME_CH, CloudHypervisorConfig}; use resource::ResourceManager; use sandbox::VIRTCONTAINER; use tokio::sync::mpsc::Sender; +use tracing::instrument; +unsafe impl Send for VirtContainer {} +unsafe impl Sync for VirtContainer {} +#[derive(Debug)] pub struct VirtContainer {} 
#[async_trait] @@ -64,6 +68,7 @@ impl RuntimeHandler for VirtContainer { Arc::new(VirtContainer {}) } + #[instrument] async fn new_instance( &self, sid: &str, diff --git a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs index 3cbdba493..43f8fef87 100644 --- a/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs +++ b/src/runtime-rs/crates/runtimes/virt_container/src/sandbox.rs @@ -17,7 +17,7 @@ use common::{ Sandbox, SandboxNetworkEnv, }; use containerd_shim_protos::events::task::TaskOOM; -use hypervisor::{dragonball::Dragonball, Hypervisor, HYPERVISOR_DRAGONBALL}; +use hypervisor::{dragonball::Dragonball, BlockConfig, Hypervisor, HYPERVISOR_DRAGONBALL}; use kata_sys_util::hooks::HookStates; use kata_types::config::TomlConfig; use resource::{ @@ -26,6 +26,7 @@ use resource::{ ResourceConfig, ResourceManager, }; use tokio::sync::{mpsc::Sender, Mutex, RwLock}; +use tracing::instrument; use crate::health_check::HealthCheck; use persist::{self, sandbox_persist::Persist}; @@ -67,6 +68,15 @@ pub struct VirtSandbox { monitor: Arc, } +impl std::fmt::Debug for VirtSandbox { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtSandbox") + .field("sid", &self.sid) + .field("msg_sender", &self.msg_sender) + .finish() + } +} + impl VirtSandbox { pub async fn new( sid: &str, @@ -88,12 +98,15 @@ impl VirtSandbox { }) } - async fn prepare_config_for_sandbox( + #[instrument] + async fn prepare_for_start_sandbox( &self, _id: &str, network_env: SandboxNetworkEnv, ) -> Result> { let mut resource_configs = vec![]; + + // prepare network config if !network_env.network_created { if let Some(netns_path) = network_env.netns { let network_config = ResourceConfig::Network( @@ -103,10 +116,20 @@ impl VirtSandbox { resource_configs.push(network_config); } } - let hypervisor_config = self.hypervisor.hypervisor_config().await; - let virtio_fs_config = 
ResourceConfig::ShareFs(hypervisor_config.shared_fs); + + // prepare sharefs device config + let virtio_fs_config = + ResourceConfig::ShareFs(self.hypervisor.hypervisor_config().await.shared_fs); resource_configs.push(virtio_fs_config); + // prepare VM rootfs device config + let vm_rootfs = ResourceConfig::VmRootfs( + self.prepare_rootfs_config() + .await + .context("failed to prepare rootfs device config")?, + ); + resource_configs.push(vm_rootfs); + Ok(resource_configs) } @@ -162,6 +185,30 @@ impl VirtSandbox { }) } + async fn prepare_rootfs_config(&self) -> Result { + let boot_info = self.hypervisor.hypervisor_config().await.boot_info; + + let image = { + let initrd_path = boot_info.initrd.clone(); + let image_path = boot_info.image; + if !initrd_path.is_empty() { + Ok(initrd_path) + } else if !image_path.is_empty() { + Ok(image_path) + } else { + Err(anyhow!("failed to get image")) + } + } + .context("get image")?; + + Ok(BlockConfig { + path_on_host: image, + is_readonly: true, + driver_option: boot_info.vm_rootfs_driver, + ..Default::default() + }) + } + fn has_prestart_hooks( &self, prestart_hooks: Vec, @@ -173,6 +220,7 @@ impl VirtSandbox { #[async_trait] impl Sandbox for VirtSandbox { + #[instrument(name = "sb: start")] async fn start( &self, dns: Vec, @@ -198,8 +246,9 @@ impl Sandbox for VirtSandbox { // generate device and setup before start vm // should after hypervisor.prepare_vm let resources = self - .prepare_config_for_sandbox(id, network_env.clone()) + .prepare_for_start_sandbox(id, network_env.clone()) .await?; + self.resource_manager .prepare_before_start_vm(resources) .await diff --git a/src/runtime-rs/crates/service/Cargo.toml b/src/runtime-rs/crates/service/Cargo.toml index 8449328c0..b92fbbf34 100644 --- a/src/runtime-rs/crates/service/Cargo.toml +++ b/src/runtime-rs/crates/service/Cargo.toml @@ -11,6 +11,7 @@ async-trait = "0.1.48" slog = "2.5.2" slog-scope = "4.4.0" tokio = { version = "1.28.1", features = ["rt-multi-thread"] } +tracing = 
"0.1.36" ttrpc = { version = "0.7.1" } common = { path = "../runtimes/common" } diff --git a/src/runtime-rs/crates/service/src/manager.rs b/src/runtime-rs/crates/service/src/manager.rs index ff0fd997c..46ffc18db 100644 --- a/src/runtime-rs/crates/service/src/manager.rs +++ b/src/runtime-rs/crates/service/src/manager.rs @@ -18,6 +18,7 @@ use containerd_shim_protos::{ shim_async, }; use runtimes::RuntimeHandlerManager; +use shim_interface::KATA_PATH; use tokio::{ io::AsyncWriteExt, process::Command, @@ -26,9 +27,9 @@ use tokio::{ use ttrpc::asynchronous::Server; use crate::task_service::TaskService; + /// message buffer size const MESSAGE_BUFFER_SIZE: usize = 8; -use shim_interface::KATA_PATH; pub struct ServiceManager { receiver: Option>, @@ -39,48 +40,21 @@ pub struct ServiceManager { namespace: String, } -async fn send_event( - containerd_binary: String, - address: String, - namespace: String, - event: Arc, -) -> Result<()> { - let any = Any { - type_url: event.type_url(), - value: event.value().context("get event value")?, - ..Default::default() - }; - let data = any.write_to_bytes().context("write to any")?; - let mut child = Command::new(containerd_binary) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .args([ - "--address", - &address, - "publish", - "--topic", - &event.r#type(), - "--namespace", - &namespace, - ]) - .spawn() - .context("spawn containerd cmd to publish event")?; - - let stdin = child.stdin.as_mut().context("failed to open stdin")?; - stdin - .write_all(&data) - .await - .context("failed to write to stdin")?; - let output = child - .wait_with_output() - .await - .context("failed to read stdout")?; - info!(sl!(), "get output: {:?}", output); - Ok(()) +impl std::fmt::Debug for ServiceManager { + // todo: some how to implement debug for handler + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ServiceManager") + .field("receiver", &self.receiver) + 
.field("task_server.is_some()", &self.task_server.is_some()) + .field("binary", &self.binary) + .field("address", &self.address) + .field("namespace", &self.namespace) + .finish() + } } impl ServiceManager { + // TODO: who manages lifecycle for `task_server_fd`? pub async fn new( id: &str, containerd_binary: &str, @@ -89,11 +63,8 @@ impl ServiceManager { task_server_fd: RawFd, ) -> Result { let (sender, receiver) = channel::(MESSAGE_BUFFER_SIZE); - let handler = Arc::new( - RuntimeHandlerManager::new(id, sender) - .await - .context("new runtime handler")?, - ); + let rt_mgr = RuntimeHandlerManager::new(id, sender).context("new runtime handler")?; + let handler = Arc::new(rt_mgr); let mut task_server = unsafe { Server::from_raw_fd(task_server_fd) }; task_server = task_server.set_domain_unix(); Ok(Self { @@ -106,9 +77,10 @@ impl ServiceManager { }) } - pub async fn run(&mut self) -> Result<()> { + pub async fn run(mut self) -> Result<()> { info!(sl!(), "begin to run service"); - self.start().await.context("start")?; + self.registry_service().context("registry service")?; + self.start_service().await.context("start service")?; info!(sl!(), "wait server message"); let mut rx = self.receiver.take(); @@ -116,23 +88,15 @@ impl ServiceManager { while let Some(r) = rx.recv().await { info!(sl!(), "receive action {:?}", &r.action); let result = match r.action { - Action::Start => self.start().await.context("start listen"), - Action::Stop => self.stop_listen().await.context("stop listen"), + Action::Start => self.start_service().await.context("start listen"), + Action::Stop => self.stop_service().await.context("stop listen"), Action::Shutdown => { - self.stop_listen().await.context("stop listen")?; + self.stop_service().await.context("stop listen")?; break; } Action::Event(event) => { info!(sl!(), "get event {:?}", &event); - send_event( - self.binary.clone(), - self.address.clone(), - self.namespace.clone(), - event, - ) - .await - .context("send event")?; - Ok(()) + 
self.send_event(event).await.context("send event") } }; @@ -152,49 +116,79 @@ impl ServiceManager { pub async fn cleanup(sid: &str) -> Result<()> { let (sender, _receiver) = channel::(MESSAGE_BUFFER_SIZE); - let handler = RuntimeHandlerManager::new(sid, sender) - .await - .context("new runtime handler")?; - handler.cleanup().await.context("runtime handler cleanup")?; + let handler = RuntimeHandlerManager::new(sid, sender).context("new runtime handler")?; + if let Err(e) = handler.cleanup().await { + warn!(sl!(), "failed to clean up runtime state, {}", e); + } + let temp_dir = [KATA_PATH, sid].join("/"); - if std::fs::metadata(temp_dir.as_str()).is_ok() { + if fs::metadata(temp_dir.as_str()).is_ok() { // try to remove dir and skip the result - fs::remove_dir_all(temp_dir) - .map_err(|err| { - warn!(sl!(), "failed to clean up sandbox tmp dir"); - err - }) - .ok(); + if let Err(e) = fs::remove_dir_all(temp_dir) { + warn!(sl!(), "failed to clean up sandbox tmp dir, {}", e); + } + } + + Ok(()) + } + + fn registry_service(&mut self) -> Result<()> { + if let Some(t) = self.task_server.take() { + let task_service = Arc::new(Box::new(TaskService::new(self.handler.clone())) + as Box); + let t = t.register_service(shim_async::create_task(task_service)); + self.task_server = Some(t); } Ok(()) } - async fn start(&mut self) -> Result<()> { - let task_service = Arc::new(Box::new(TaskService::new(self.handler.clone())) - as Box); - let task_server = self.task_server.take(); - let task_server = match task_server { - Some(t) => { - let mut t = t.register_service(shim_async::create_task(task_service)); - t.start().await.context("task server start")?; - Some(t) - } - None => None, - }; - self.task_server = task_server; + async fn start_service(&mut self) -> Result<()> { + if let Some(t) = self.task_server.as_mut() { + t.start().await.context("task server start")?; + } Ok(()) } - async fn stop_listen(&mut self) -> Result<()> { - let task_server = self.task_server.take(); - let 
task_server = match task_server { - Some(mut t) => { - t.stop_listen().await; - Some(t) - } - None => None, + async fn stop_service(&mut self) -> Result<()> { + if let Some(t) = self.task_server.as_mut() { + t.stop_listen().await; + } + Ok(()) + } + + async fn send_event(&self, event: Arc) -> Result<()> { + let any = Any { + type_url: event.type_url(), + value: event.value().context("get event value")?, + ..Default::default() }; - self.task_server = task_server; + let data = any.write_to_bytes().context("write to any")?; + let mut child = Command::new(&self.binary) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .args([ + "--address", + &self.address, + "publish", + "--topic", + &event.r#type(), + "--namespace", + &self.namespace, + ]) + .spawn() + .context("spawn containerd cmd to publish event")?; + + let stdin = child.stdin.as_mut().context("failed to open stdin")?; + stdin + .write_all(&data) + .await + .context("failed to write to stdin")?; + let output = child + .wait_with_output() + .await + .context("failed to read stdout")?; + info!(sl!(), "get output: {:?}", output); Ok(()) } } diff --git a/src/runtime-rs/crates/service/src/task_service.rs b/src/runtime-rs/crates/service/src/task_service.rs index a612796c7..9db1bcbe4 100644 --- a/src/runtime-rs/crates/service/src/task_service.rs +++ b/src/runtime-rs/crates/service/src/task_service.rs @@ -24,31 +24,31 @@ impl TaskService { pub(crate) fn new(handler: Arc) -> Self { Self { handler } } -} -async fn handler_message( - s: &RuntimeHandlerManager, - ctx: &TtrpcContext, - req: TtrpcReq, -) -> ttrpc::Result -where - Request: TryFrom, - >::Error: std::fmt::Debug, - TtrpcResp: TryFrom, - >::Error: std::fmt::Debug, -{ - let r = req - .try_into() - .map_err(|err| ttrpc::Error::Others(format!("failed to translate from shim {:?}", err)))?; - let logger = sl!().new(o!("stream id" => ctx.mh.stream_id)); - debug!(logger, "====> task service {:?}", &r); - let resp = s - .handler_message(r) - 
.await - .map_err(|err| ttrpc::Error::Others(format!("failed to handler message {:?}", err)))?; - debug!(logger, "<==== task service {:?}", &resp); - resp.try_into() - .map_err(|err| ttrpc::Error::Others(format!("failed to translate to shim {:?}", err))) + async fn handler_message( + &self, + ctx: &TtrpcContext, + req: TtrpcReq, + ) -> ttrpc::Result + where + Request: TryFrom, + >::Error: std::fmt::Debug, + TtrpcResp: TryFrom, + >::Error: std::fmt::Debug, + { + let r = req.try_into().map_err(|err| { + ttrpc::Error::Others(format!("failed to translate from shim {:?}", err)) + })?; + let logger = sl!().new(o!("stream id" => ctx.mh.stream_id)); + debug!(logger, "====> task service {:?}", &r); + let resp = + self.handler.handler_message(r).await.map_err(|err| { + ttrpc::Error::Others(format!("failed to handler message {:?}", err)) + })?; + debug!(logger, "<==== task service {:?}", &resp); + resp.try_into() + .map_err(|err| ttrpc::Error::Others(format!("failed to translate to shim {:?}", err))) + } } macro_rules! impl_service { @@ -56,7 +56,7 @@ macro_rules! 
impl_service { #[async_trait] impl shim_async::Task for TaskService { $(async fn $name(&self, ctx: &TtrpcContext, req: $req) -> ttrpc::Result<$resp> { - handler_message(&self.handler, ctx, req).await + self.handler_message(ctx, req).await })* } }; diff --git a/src/runtime-rs/crates/shim-ctl/src/main.rs b/src/runtime-rs/crates/shim-ctl/src/main.rs index fd5152173..76506fec2 100644 --- a/src/runtime-rs/crates/shim-ctl/src/main.rs +++ b/src/runtime-rs/crates/shim-ctl/src/main.rs @@ -16,7 +16,7 @@ const WORKER_THREADS: usize = 2; async fn real_main() { let (sender, _receiver) = channel::(MESSAGE_BUFFER_SIZE); - let manager = RuntimeHandlerManager::new("xxx", sender).await.unwrap(); + let manager = RuntimeHandlerManager::new("xxx", sender).unwrap(); let req = Request::CreateContainer(ContainerConfig { container_id: "xxx".to_owned(), diff --git a/src/runtime-rs/crates/shim/Cargo.toml b/src/runtime-rs/crates/shim/Cargo.toml index 1f5dafb3f..e38e66262 100644 --- a/src/runtime-rs/crates/shim/Cargo.toml +++ b/src/runtime-rs/crates/shim/Cargo.toml @@ -29,12 +29,15 @@ slog-stdlog = "4.1.0" thiserror = "1.0.30" tokio = { version = "1.28.1", features = [ "rt", "rt-multi-thread" ] } unix_socket2 = "0.5.4" +tracing = "0.1.36" +tracing-opentelemetry = "0.18.0" kata-types = { path = "../../../libs/kata-types"} kata-sys-util = { path = "../../../libs/kata-sys-util"} logging = { path = "../../../libs/logging"} oci = { path = "../../../libs/oci" } service = { path = "../service" } +runtimes = { path = "../runtimes" } [dev-dependencies] tempfile = "3.2.0" diff --git a/src/runtime-rs/crates/shim/src/bin/main.rs b/src/runtime-rs/crates/shim/src/bin/main.rs index b8a176e4a..587f5a18b 100644 --- a/src/runtime-rs/crates/shim/src/bin/main.rs +++ b/src/runtime-rs/crates/shim/src/bin/main.rs @@ -142,7 +142,7 @@ fn real_main() -> Result<()> { Action::Delete(args) => { let mut shim = ShimExecutor::new(args); let rt = get_tokio_runtime().context("get tokio runtime")?; - rt.block_on(shim.delete())? 
+ rt.block_on(shim.delete())?; } Action::Run(args) => { // set mnt namespace @@ -151,7 +151,7 @@ fn real_main() -> Result<()> { let mut shim = ShimExecutor::new(args); let rt = get_tokio_runtime().context("get tokio runtime")?; - rt.block_on(shim.run())? + rt.block_on(shim.run())?; } Action::Help => show_help(&args[0]), Action::Version => show_version(None), diff --git a/src/runtime-rs/crates/shim/src/shim.rs b/src/runtime-rs/crates/shim/src/shim.rs index acd76d6bf..a197f6f31 100644 --- a/src/runtime-rs/crates/shim/src/shim.rs +++ b/src/runtime-rs/crates/shim/src/shim.rs @@ -20,6 +20,7 @@ const SHIM_PID_FILE: &str = "shim.pid"; pub(crate) const ENV_KATA_RUNTIME_BIND_FD: &str = "KATA_RUNTIME_BIND_FD"; /// Command executor for shim. +#[derive(Debug)] pub struct ShimExecutor { pub(crate) args: Args, } diff --git a/src/runtime-rs/crates/shim/src/shim_run.rs b/src/runtime-rs/crates/shim/src/shim_run.rs index 5445ac635..64e81ca40 100644 --- a/src/runtime-rs/crates/shim/src/shim_run.rs +++ b/src/runtime-rs/crates/shim/src/shim_run.rs @@ -46,7 +46,7 @@ impl ShimExecutor { self.args.validate(false).context("validate")?; let server_fd = get_server_fd().context("get server fd")?; - let mut service_manager = service::ServiceManager::new( + let service_manager = service::ServiceManager::new( &self.args.id, &self.args.publish_binary, &self.args.address, diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index f74cb9f89..9b4fc8064 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -103,20 +103,19 @@ type RuntimeVersionInfo struct { // HypervisorInfo stores hypervisor details type HypervisorInfo struct { - MachineType string - Version string - Path string - BlockDeviceDriver string - EntropySource string - SharedFS string - VirtioFSDaemon string - SocketPath string - Msize9p uint32 - MemorySlots uint32 - HotPlugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort - 
HotplugVFIOOnRootBus bool - Debug bool + MachineType string + Version string + Path string + BlockDeviceDriver string + EntropySource string + SharedFS string + VirtioFSDaemon string + SocketPath string + Msize9p uint32 + MemorySlots uint32 + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort + Debug bool } // AgentInfo stores agent details @@ -307,20 +306,19 @@ func getHypervisorInfo(config oci.RuntimeConfig) (HypervisorInfo, error) { } return HypervisorInfo{ - Debug: config.HypervisorConfig.Debug, - MachineType: config.HypervisorConfig.HypervisorMachineType, - Version: version, - Path: hypervisorPath, - BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver, - Msize9p: config.HypervisorConfig.Msize9p, - MemorySlots: config.HypervisorConfig.MemSlots, - EntropySource: config.HypervisorConfig.EntropySource, - SharedFS: config.HypervisorConfig.SharedFS, - VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, - HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, - ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, - HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - SocketPath: socketPath, + Debug: config.HypervisorConfig.Debug, + MachineType: config.HypervisorConfig.HypervisorMachineType, + Version: version, + Path: hypervisorPath, + BlockDeviceDriver: config.HypervisorConfig.BlockDeviceDriver, + Msize9p: config.HypervisorConfig.Msize9p, + MemorySlots: config.HypervisorConfig.MemSlots, + EntropySource: config.HypervisorConfig.EntropySource, + SharedFS: config.HypervisorConfig.SharedFS, + VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, + ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, + SocketPath: socketPath, }, nil } diff --git a/src/runtime/cmd/kata-runtime/kata-env_test.go b/src/runtime/cmd/kata-runtime/kata-env_test.go index c8d4d0ea9..5bf2c5a88 100644 --- a/src/runtime/cmd/kata-runtime/kata-env_test.go +++ b/src/runtime/cmd/kata-runtime/kata-env_test.go @@ -87,7 
+87,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti disableBlock := true blockStorageDriver := "virtio-scsi" enableIOThreads := true - hotplugVFIOOnRootBus := true hotPlugVFIO = config.BridgePort coldPlugVFIO = config.NoPort disableNewNetNs := false @@ -132,7 +131,6 @@ func makeRuntimeConfig(prefixDir string) (configFile string, ociConfig oci.Runti DisableBlock: disableBlock, BlockDeviceDriver: blockStorageDriver, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, @@ -276,10 +274,8 @@ func getExpectedHypervisor(config oci.RuntimeConfig) HypervisorInfo { EntropySource: config.HypervisorConfig.EntropySource, SharedFS: config.HypervisorConfig.SharedFS, VirtioFSDaemon: config.HypervisorConfig.VirtioFSDaemon, - - HotplugVFIOOnRootBus: config.HypervisorConfig.HotplugVFIOOnRootBus, - HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, - ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, + HotPlugVFIO: config.HypervisorConfig.HotPlugVFIO, + ColdPlugVFIO: config.HypervisorConfig.ColdPlugVFIO, } if os.Geteuid() == 0 { diff --git a/src/runtime/pkg/containerd-shim-v2/create_test.go b/src/runtime/pkg/containerd-shim-v2/create_test.go index c24e3ced3..82ce7357f 100644 --- a/src/runtime/pkg/containerd-shim-v2/create_test.go +++ b/src/runtime/pkg/containerd-shim-v2/create_test.go @@ -330,31 +330,29 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (runtimeConfig string, disableBlockDevice := true blockDeviceDriver := "virtio-scsi" enableIOThreads := true - hotplugVFIOOnRootBus := true disableNewNetNs := false sharedFS := "virtio-9p" virtioFSdaemon := path.Join(dir, "virtiofsd") hotPlugVFIO = config.BridgePort - coldPlugVFIO = config.RootPort + coldPlugVFIO = config.NoPort configFileOptions := ktu.RuntimeConfigOptions{ - Hypervisor: "qemu", - HypervisorPath: hypervisorPath, - KernelPath: kernelPath, - ImagePath: 
imagePath, - RootfsType: rootfsType, - KernelParams: kernelParams, - MachineType: machineType, - LogPath: logPath, - DisableBlock: disableBlockDevice, - BlockDeviceDriver: blockDeviceDriver, - EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, - DisableNewNetNs: disableNewNetNs, - SharedFS: sharedFS, - VirtioFSDaemon: virtioFSdaemon, - HotPlugVFIO: hotPlugVFIO, - ColdPlugVFIO: coldPlugVFIO, + Hypervisor: "qemu", + HypervisorPath: hypervisorPath, + KernelPath: kernelPath, + ImagePath: imagePath, + RootfsType: rootfsType, + KernelParams: kernelParams, + MachineType: machineType, + LogPath: logPath, + DisableBlock: disableBlockDevice, + BlockDeviceDriver: blockDeviceDriver, + EnableIOThreads: enableIOThreads, + DisableNewNetNs: disableNewNetNs, + SharedFS: sharedFS, + VirtioFSDaemon: virtioFSdaemon, + HotPlugVFIO: hotPlugVFIO, + ColdPlugVFIO: coldPlugVFIO, } runtimeConfigFileData := ktu.MakeRuntimeConfigFileData(configFileOptions) diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index 8c9055ae2..5f76bff48 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -111,7 +111,7 @@ func GetVFIODeviceType(deviceFilePath string) (config.VFIODeviceType, error) { return config.VFIODeviceErrorType, err } - if strings.HasPrefix(deviceSysfsDev, vfioAPSysfsDir) { + if strings.Contains(deviceSysfsDev, vfioAPSysfsDir) { return config.VFIOAPDeviceMediatedType, nil } @@ -178,22 +178,22 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe } id := utils.MakeNameID("vfio", device.ID+strconv.Itoa(i), maxDevIDSize) - pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) - // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the - // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. 
- // Class 0x0604 is PCI bridge, 0x0600 is Host bridge - ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) - if err != nil { - return nil, err - } - if ignorePCIDevice { - continue - } - var vfio config.VFIODev switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: + // This is vfio-pci and vfio-mdev specific + pciClass := getPCIDeviceProperty(deviceBDF, PCISysFsDevicesClass) + // We need to ignore Host or PCI Bridges that are in the same IOMMU group as the + // passed-through devices. One CANNOT pass-through a PCI bridge or Host bridge. + // Class 0x0604 is PCI bridge, 0x0600 is Host bridge + ignorePCIDevice, err := checkIgnorePCIClass(pciClass, deviceBDF, 0x0600) + if err != nil { + return nil, err + } + if ignorePCIDevice { + continue + } // Do not directly assign to `vfio` -- need to access field still vfio = config.VFIODev{ ID: id, @@ -216,6 +216,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo) ([]*config.VFIODe SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, APDevices: devices, + Port: device.Port, } default: return nil, fmt.Errorf("Failed to append device: VFIO device type unrecognized") diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 801b1a81f..feec9c448 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -69,7 +69,14 @@ func (device *VFIODevice) Attach(ctx context.Context, devReceiver api.DeviceRece if err != nil { return err } + for _, vfio := range device.VfioDevs { + // If vfio.Port is not set we bail out, users should set + // explicitly the port in the config file + if vfio.Port == "" { + return fmt.Errorf("cold_plug_vfio= or hot_plug_vfio= port is not set for device %s (BridgePort | RootPort | SwitchPort)", vfio.BDF) + } + if vfio.IsPCIe { busIndex := len(config.PCIeDevices[vfio.Port]) vfio.Bus = fmt.Sprintf("%s%d", config.PCIePortPrefixMapping[vfio.Port], 
busIndex) diff --git a/src/runtime/pkg/device/manager/manager.go b/src/runtime/pkg/device/manager/manager.go index 735061d9e..cb5e86a04 100644 --- a/src/runtime/pkg/device/manager/manager.go +++ b/src/runtime/pkg/device/manager/manager.go @@ -120,7 +120,7 @@ func (dm *deviceManager) createDevice(devInfo config.DeviceInfo) (dev api.Device if devInfo.ID, err = dm.newDeviceID(); err != nil { return nil, err } - if IsVFIO(devInfo.HostPath) { + if IsVFIODevice(devInfo.HostPath) { return drivers.NewVFIODevice(&devInfo), nil } else if IsVhostUserBlk(devInfo) { if devInfo.DriverOptions == nil { @@ -191,12 +191,12 @@ func (dm *deviceManager) AttachDevice(ctx context.Context, id string, dr api.Dev dm.Lock() defer dm.Unlock() - d, ok := dm.devices[id] + dev, ok := dm.devices[id] if !ok { return ErrDeviceNotExist } - if err := d.Attach(ctx, dr); err != nil { + if err := dev.Attach(ctx, dr); err != nil { return err } return nil diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index 70c76b67d..c08cb66ab 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -90,6 +90,100 @@ func TestNewDevice(t *testing.T) { assert.Equal(t, vfioDev.DeviceInfo.GID, uint32(2)) } +func TestAttachVFIOAPDevice(t *testing.T) { + + var err error + var ok bool + + dm := &deviceManager{ + devices: make(map[string]api.Device), + } + + tmpDir := t.TempDir() + // sys/devices/vfio_ap/matrix/f94290f8-78ac-45fb-bb22-e55e519fa64f + testSysfsAP := "/sys/devices/vfio_ap/" + testDeviceAP := "f94290f8-78ac-45fb-bb22-e55e519fa64f" + testVFIOGroup := "42" + + matrixDir := filepath.Join(tmpDir, testSysfsAP, "matrix") + err = os.MkdirAll(matrixDir, dirMode) + assert.Nil(t, err) + + deviceAPFile := filepath.Join(matrixDir, testDeviceAP) + err = os.MkdirAll(deviceAPFile, dirMode) + assert.Nil(t, err) + + matrixDeviceAPFile := filepath.Join(deviceAPFile, "matrix") + _, err = 
os.Create(matrixDeviceAPFile) + assert.Nil(t, err) + // create AP devices in the matrix file + APDevices := []byte("05.001f\n") + err = os.WriteFile(matrixDeviceAPFile, APDevices, 0644) + assert.Nil(t, err) + + devicesVFIOGroupDir := filepath.Join(tmpDir, testVFIOGroup, "devices") + err = os.MkdirAll(devicesVFIOGroupDir, dirMode) + assert.Nil(t, err) + + deviceAPSymlink := filepath.Join(devicesVFIOGroupDir, testDeviceAP) + err = os.Symlink(deviceAPFile, deviceAPSymlink) + assert.Nil(t, err) + + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir + + savedSysBusPciDevicesPath := config.SysBusPciDevicesPath + config.SysBusPciDevicesPath = devicesVFIOGroupDir + + defer func() { + config.SysIOMMUGroupPath = savedIOMMUPath + config.SysBusPciDevicesPath = savedSysBusPciDevicesPath + }() + + path := filepath.Join(vfioPath, testVFIOGroup) + deviceInfo := config.DeviceInfo{ + HostPath: path, + ContainerPath: path, + DevType: "c", + ColdPlug: false, + Port: config.RootPort, + } + + device, err := dm.NewDevice(deviceInfo) + assert.Nil(t, err) + _, ok = device.(*drivers.VFIODevice) + assert.True(t, ok) + + devReceiver := &api.MockDeviceReceiver{} + err = device.Attach(context.Background(), devReceiver) + assert.Nil(t, err) + + err = device.Detach(context.Background(), devReceiver) + assert.Nil(t, err) + + // If we omit the port setting we should fail + failDm := &deviceManager{ + devices: make(map[string]api.Device), + } + + failDeviceInfo := config.DeviceInfo{ + HostPath: path, + ContainerPath: path, + DevType: "c", + ColdPlug: false, + } + + failDevice, err := failDm.NewDevice(failDeviceInfo) + assert.Nil(t, err) + _, ok = failDevice.(*drivers.VFIODevice) + assert.True(t, ok) + + failDevReceiver := &api.MockDeviceReceiver{} + err = failDevice.Attach(context.Background(), failDevReceiver) + assert.Error(t, err) + +} + func TestAttachVFIODevice(t *testing.T) { dm := &deviceManager{ blockDriver: config.VirtioBlock, @@ -132,6 +226,8 @@ func 
TestAttachVFIODevice(t *testing.T) { HostPath: path, ContainerPath: path, DevType: "c", + ColdPlug: false, + Port: config.RootPort, } device, err := dm.NewDevice(deviceInfo) diff --git a/src/runtime/pkg/device/manager/utils.go b/src/runtime/pkg/device/manager/utils.go index a9e4ee8c6..6658b19a4 100644 --- a/src/runtime/pkg/device/manager/utils.go +++ b/src/runtime/pkg/device/manager/utils.go @@ -17,8 +17,15 @@ const ( vfioPath = "/dev/vfio/" ) +// IsVFIOControlDevice checks if the device provided is a vfio control device. +// Depending no the vfio_mode we need to know if a device is a VFIO device +// or the VFIO control device +func IsVFIOControlDevice(path string) bool { + return path == filepath.Join(vfioPath, "vfio") +} + // IsVFIO checks if the device provided is a vfio group. -func IsVFIO(hostPath string) bool { +func IsVFIODevice(hostPath string) bool { // Ignore /dev/vfio/vfio character device if strings.HasPrefix(hostPath, filepath.Join(vfioPath, "vfio")) { return false diff --git a/src/runtime/pkg/device/manager/utils_test.go b/src/runtime/pkg/device/manager/utils_test.go index 6752719dd..9fbc829d7 100644 --- a/src/runtime/pkg/device/manager/utils_test.go +++ b/src/runtime/pkg/device/manager/utils_test.go @@ -31,7 +31,7 @@ func TestIsVFIO(t *testing.T) { } for _, d := range data { - isVFIO := IsVFIO(d.path) + isVFIO := IsVFIODevice(d.path) assert.Equal(t, d.expected, isVFIO) } } diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 73b6aba44..5b618eb01 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -163,6 +163,9 @@ const ( // TransportMMIO is the MMIO transport for virtio devices. TransportMMIO VirtioTransport = "mmio" + + // TransportAP is the AP transport for virtio devices. 
+ TransportAP VirtioTransport = "ap" ) // defaultTransport returns the default transport for the current combination @@ -199,6 +202,14 @@ func (transport VirtioTransport) isVirtioCCW(config *Config) bool { return transport == TransportCCW } +func (transport VirtioTransport) isVirtioAP(config *Config) bool { + if transport == "" { + transport = transport.defaultTransport(config) + } + + return transport == TransportAP +} + // getName returns the name of the current transport. func (transport VirtioTransport) getName(config *Config) string { if transport == "" { @@ -1852,6 +1863,9 @@ type VFIODevice struct { // Transport is the virtio transport for this device. Transport VirtioTransport + + // SysfsDev specifies the sysfs matrix entry for the AP device + SysfsDev string } // VFIODeviceTransport is a map of the vfio device name that corresponds to @@ -1860,11 +1874,13 @@ var VFIODeviceTransport = map[VirtioTransport]string{ TransportPCI: "vfio-pci", TransportCCW: "vfio-ccw", TransportMMIO: "vfio-device", + TransportAP: "vfio-ap", } // Valid returns true if the VFIODevice structure is valid and complete. +// s390x architecture requires SysfsDev to be set. func (vfioDev VFIODevice) Valid() bool { - return vfioDev.BDF != "" + return vfioDev.BDF != "" || vfioDev.SysfsDev != "" } // QemuParams returns the qemu parameters built out of this vfio device. 
@@ -1874,6 +1890,15 @@ func (vfioDev VFIODevice) QemuParams(config *Config) []string { driver := vfioDev.deviceName(config) + if vfioDev.Transport.isVirtioAP(config) { + deviceParams = append(deviceParams, fmt.Sprintf("%s,sysfsdev=%s", driver, vfioDev.SysfsDev)) + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + + return qemuParams + } + deviceParams = append(deviceParams, fmt.Sprintf("%s,host=%s", driver, vfioDev.BDF)) if vfioDev.Transport.isVirtioPCI(config) { if vfioDev.VendorID != "" { @@ -2878,10 +2903,9 @@ func (config *Config) appendDevices(logger QMPLog) { for _, d := range config.Devices { if !d.Valid() { - logger.Errorf("vm device is not valid: %+v", config.Devices) + logger.Errorf("vm device is not valid: %+v", d) continue } - config.qemuParams = append(config.qemuParams, d.QemuParams(config)...) } } diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 3d89fbe66..e123ec780 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -1233,10 +1233,11 @@ func (q *QMP) ExecutePCIVFIOMediatedDeviceAdd(ctx context.Context, devID, sysfsd } // ExecuteAPVFIOMediatedDeviceAdd adds a VFIO mediated AP device to a QEMU instance using the device_add command. 
-func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string) error { +func (q *QMP) ExecuteAPVFIOMediatedDeviceAdd(ctx context.Context, sysfsdev string, devID string) error { args := map[string]interface{}{ "driver": VfioAP, "sysfsdev": sysfsdev, + "id": devID, } return q.executeCommand(ctx, "device_add", args, nil) } diff --git a/src/runtime/pkg/govmm/qemu/qmp_test.go b/src/runtime/pkg/govmm/qemu/qmp_test.go index 17492f6fd..06738a40d 100644 --- a/src/runtime/pkg/govmm/qemu/qmp_test.go +++ b/src/runtime/pkg/govmm/qemu/qmp_test.go @@ -1128,7 +1128,7 @@ func TestQMPAPVFIOMediatedDeviceAdd(t *testing.T) { q := startQMPLoop(buf, cfg, connectedCh, disconnectedCh) checkVersion(t, connectedCh) sysfsDev := "/sys/devices/vfio_ap/matrix/a297db4a-f4c2-11e6-90f6-d3b88d6c9525" - err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev) + err := q.ExecuteAPVFIOMediatedDeviceAdd(context.Background(), sysfsDev, "test-id") if err != nil { t.Fatalf("Unexpected error %v", err) } diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index f0ba941de..7384cca5e 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -42,10 +42,9 @@ type HypervisorState struct { // HotpluggedCPUs is the list of CPUs that were hot-added HotpluggedVCPUs []CPUDevice - HotpluggedMemory int - VirtiofsDaemonPid int - Pid int - HotPlugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort - HotplugVFIOOnRootBus bool + HotpluggedMemory int + VirtiofsDaemonPid int + Pid int + HotPlugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } diff --git a/src/runtime/pkg/katatestutils/utils.go b/src/runtime/pkg/katatestutils/utils.go index ec1d85c3a..041a2ec5e 100644 --- a/src/runtime/pkg/katatestutils/utils.go +++ b/src/runtime/pkg/katatestutils/utils.go @@ -233,7 +233,6 @@ type RuntimeConfigOptions struct { DefaultMsize9p uint32 DisableBlock bool EnableIOThreads bool 
- HotplugVFIOOnRootBus bool DisableNewNetNs bool HypervisorDebug bool RuntimeDebug bool @@ -317,8 +316,8 @@ func MakeRuntimeConfigFileData(config RuntimeConfigOptions) string { default_memory = ` + strconv.FormatUint(uint64(config.DefaultMemSize), 10) + ` disable_block_device_use = ` + strconv.FormatBool(config.DisableBlock) + ` enable_iothreads = ` + strconv.FormatBool(config.EnableIOThreads) + ` - hotplug_vfio_on_root_bus = ` + strconv.FormatBool(config.HotplugVFIOOnRootBus) + ` cold_plug_vfio = "` + config.ColdPlugVFIO.String() + `" + hot_plug_vfio = "` + config.HotPlugVFIO.String() + `" msize_9p = ` + strconv.FormatUint(uint64(config.DefaultMsize9p), 10) + ` enable_debug = ` + strconv.FormatBool(config.HypervisorDebug) + ` guest_hook_path = "` + config.DefaultGuestHookPath + `" diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index f76986876..43da75699 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -81,7 +81,6 @@ const defaultFileBackedMemRootDir string = "" const defaultEnableDebug bool = false const defaultDisableNestingChecks bool = false const defaultMsize9p uint32 = 8192 -const defaultHotplugVFIOOnRootBus bool = false const defaultEntropySource = "/dev/urandom" const defaultGuestHookPath string = "" const defaultVirtioFSCacheMode = "never" diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index a485cac81..b8f955cbb 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -22,6 +22,7 @@ import ( govmmQemu "github.com/kata-containers/kata-containers/src/runtime/pkg/govmm/qemu" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/oci" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" vc 
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers" exp "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/experimental" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" @@ -157,7 +158,6 @@ type hypervisor struct { DisableNestingChecks bool `toml:"disable_nesting_checks"` EnableIOThreads bool `toml:"enable_iothreads"` DisableImageNvdimm bool `toml:"disable_image_nvdimm"` - HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` HotPlugVFIO config.PCIePort `toml:"hot_plug_vfio"` ColdPlugVFIO config.PCIePort `toml:"cold_plug_vfio"` DisableVhostNet bool `toml:"disable_vhost_net"` @@ -886,7 +886,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), DisableImageNvdimm: h.DisableImageNvdimm, - HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, HotPlugVFIO: h.hotPlugVFIO(), ColdPlugVFIO: h.coldPlugVFIO(), DisableVhostNet: h.DisableVhostNet, @@ -1089,7 +1088,6 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { BlockDeviceCacheNoflush: h.BlockDeviceCacheNoflush, EnableIOThreads: h.EnableIOThreads, Msize9p: h.msize9p(), - HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), HotPlugVFIO: h.hotPlugVFIO(), DisableVhostNet: true, @@ -1336,7 +1334,6 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { BlockDeviceCacheNoflush: defaultBlockDeviceCacheNoflush, EnableIOThreads: defaultEnableIOThreads, Msize9p: defaultMsize9p, - HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, ColdPlugVFIO: defaultColdPlugVFIO, HotPlugVFIO: defaultHotPlugVFIO, GuestHookPath: defaultGuestHookPath, @@ -1722,7 +1719,8 @@ func checkConfig(config oci.RuntimeConfig) error { hotPlugVFIO := config.HypervisorConfig.HotPlugVFIO coldPlugVFIO := config.HypervisorConfig.ColdPlugVFIO machineType := config.HypervisorConfig.HypervisorMachineType - if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, 
machineType); err != nil { + hypervisorType := config.HypervisorType + if err := checkPCIeConfig(coldPlugVFIO, hotPlugVFIO, machineType, hypervisorType); err != nil { return err } @@ -1732,10 +1730,9 @@ func checkConfig(config oci.RuntimeConfig) error { // checkPCIeConfig ensures the PCIe configuration is valid. // Only allow one of the following settings for cold-plug: // no-port, root-port, switch-port -func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string) error { - // Currently only QEMU q35 supports advanced PCIe topologies - // firecracker, dragonball do not have right now any PCIe support - if machineType != "q35" { +func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineType string, hypervisorType virtcontainers.HypervisorType) error { + if hypervisorType != virtcontainers.QemuHypervisor { + kataUtilsLogger.Warn("Advanced PCIe Topology only available for QEMU hypervisor, ignoring hot(cold)_vfio_port setting") return nil } @@ -1745,6 +1742,12 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT if coldPlug == config.NoPort && hotPlug == config.NoPort { return nil } + // Currently only QEMU q35,virt support advanced PCIe topologies + // firecracker, dragonball do not have right now any PCIe support + if machineType != "q35" && machineType != "virt" { + return nil + } + var port config.PCIePort if coldPlug != config.NoPort { port = coldPlug @@ -1752,10 +1755,13 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT if hotPlug != config.NoPort { port = hotPlug } - if port == config.NoPort || port == config.BridgePort || port == config.RootPort || port == config.SwitchPort { + if port == config.NoPort { + return fmt.Errorf("invalid vfio_port=%s setting, use on of %s, %s, %s", + port, config.BridgePort, config.RootPort, config.SwitchPort) + } + if port == config.BridgePort || port == config.RootPort || port == config.SwitchPort { return nil } - 
return fmt.Errorf("invalid vfio_port=%s setting, allowed values %s, %s, %s, %s", coldPlug, config.NoPort, config.BridgePort, config.RootPort, config.SwitchPort) } diff --git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index b786ce25f..4e4f9917f 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -85,7 +85,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime blockDeviceDriver := "virtio-scsi" blockDeviceAIO := "io_uring" enableIOThreads := true - hotplugVFIOOnRootBus := true hotPlugVFIO = config.NoPort coldPlugVFIO = config.BridgePort disableNewNetNs := false @@ -108,7 +107,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime BlockDeviceDriver: blockDeviceDriver, BlockDeviceAIO: blockDeviceAIO, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, DisableNewNetNs: disableNewNetNs, @@ -172,7 +170,6 @@ func createAllRuntimeConfigFiles(dir, hypervisor string) (testConfig testRuntime BlockDeviceAIO: defaultBlockDeviceAIO, DefaultBridges: defaultBridgesCount, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, HotPlugVFIO: hotPlugVFIO, ColdPlugVFIO: coldPlugVFIO, Msize9p: defaultMsize9p, @@ -613,7 +610,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { machineType := "machineType" disableBlock := true enableIOThreads := true - hotplugVFIOOnRootBus := true coldPlugVFIO = config.BridgePort orgVHostVSockDevicePath := utils.VHostVSockDevicePath blockDeviceAIO := "io_uring" @@ -632,7 +628,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { MachineType: machineType, DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, ColdPlugVFIO: coldPlugVFIO, RxRateLimiterMaxRate: rxRateLimiterMaxRate, TxRateLimiterMaxRate: txRateLimiterMaxRate, @@ -684,10 
+679,6 @@ func TestNewQemuHypervisorConfig(t *testing.T) { t.Errorf("Expected value for enable IOThreads %v, got %v", enableIOThreads, config.EnableIOThreads) } - if config.HotplugVFIOOnRootBus != hotplugVFIOOnRootBus { - t.Errorf("Expected value for HotplugVFIOOnRootBus %v, got %v", hotplugVFIOOnRootBus, config.HotplugVFIOOnRootBus) - } - if config.RxRateLimiterMaxRate != rxRateLimiterMaxRate { t.Errorf("Expected value for rx rate limiter %v, got %v", rxRateLimiterMaxRate, config.RxRateLimiterMaxRate) } @@ -809,7 +800,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { machineType := "machineType" disableBlock := true enableIOThreads := true - hotplugVFIOOnRootBus := true hypervisor := hypervisor{ Path: hypervisorPath, @@ -819,7 +809,6 @@ func TestNewQemuHypervisorConfigImageAndInitrd(t *testing.T) { MachineType: machineType, DisableBlockDeviceUse: disableBlock, EnableIOThreads: enableIOThreads, - HotplugVFIOOnRootBus: hotplugVFIOOnRootBus, } _, err := newQemuHypervisorConfig(hypervisor) diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go index 7399e017a..2355ed216 100644 --- a/src/runtime/pkg/oci/utils.go +++ b/src/runtime/pkg/oci/utils.go @@ -511,12 +511,6 @@ func addHypervisorConfigOverrides(ocispec specs.Spec, config *vc.SandboxConfig, return err } - if err := newAnnotationConfiguration(ocispec, vcAnnotations.HotplugVFIOOnRootBus).setBool(func(hotplugVFIOOnRootBus bool) { - config.HypervisorConfig.HotplugVFIOOnRootBus = hotplugVFIOOnRootBus - }); err != nil { - return err - } - if err := newAnnotationConfiguration(ocispec, vcAnnotations.UseLegacySerial).setBool(func(useLegacySerial bool) { config.HypervisorConfig.LegacySerial = useLegacySerial }); err != nil { diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 7b53d4dc6..f045eede8 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -659,7 +659,6 @@ func TestAddHypervisorAnnotations(t *testing.T) { 
ocispec.Annotations[vcAnnotations.DisableVhostNet] = "true" ocispec.Annotations[vcAnnotations.GuestHookPath] = "/usr/bin/" ocispec.Annotations[vcAnnotations.DisableImageNvdimm] = "true" - ocispec.Annotations[vcAnnotations.HotplugVFIOOnRootBus] = "true" ocispec.Annotations[vcAnnotations.ColdPlugVFIO] = config.BridgePort ocispec.Annotations[vcAnnotations.HotPlugVFIO] = config.NoPort ocispec.Annotations[vcAnnotations.IOMMUPlatform] = "true" @@ -700,7 +699,6 @@ func TestAddHypervisorAnnotations(t *testing.T) { assert.Equal(sbConfig.HypervisorConfig.DisableVhostNet, true) assert.Equal(sbConfig.HypervisorConfig.GuestHookPath, "/usr/bin/") assert.Equal(sbConfig.HypervisorConfig.DisableImageNvdimm, true) - assert.Equal(sbConfig.HypervisorConfig.HotplugVFIOOnRootBus, true) assert.Equal(string(sbConfig.HypervisorConfig.ColdPlugVFIO), string(config.BridgePort)) assert.Equal(string(sbConfig.HypervisorConfig.HotPlugVFIO), string(config.NoPort)) assert.Equal(sbConfig.HypervisorConfig.IOMMUPlatform, true) diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go index fb15b2e17..d322b3398 100644 --- a/src/runtime/virtcontainers/container.go +++ b/src/runtime/virtcontainers/container.go @@ -18,6 +18,7 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" + deviceManager "github.com/kata-containers/kata-containers/src/runtime/pkg/device/manager" volume "github.com/kata-containers/kata-containers/src/runtime/pkg/direct-volume" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols/grpc" @@ -878,9 +879,16 @@ func (c *Container) create(ctx context.Context) (err error) { // If cold-plug we've attached the devices already, do not try to // attach them a second time. 
coldPlugVFIO := (c.sandbox.config.HypervisorConfig.ColdPlugVFIO != config.NoPort) + modeVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO) + if coldPlugVFIO { var cntDevices []ContainerDevice for _, dev := range c.devices { + isVFIOControlDevice := deviceManager.IsVFIOControlDevice(dev.ContainerPath) + if isVFIOControlDevice && modeVFIO { + cntDevices = append(cntDevices, dev) + } + if strings.HasPrefix(dev.ContainerPath, vfioPath) { c.Logger().WithFields(logrus.Fields{ "device": dev, diff --git a/src/runtime/virtcontainers/documentation/api/1.0/api.md b/src/runtime/virtcontainers/documentation/api/1.0/api.md index ca5cb4a1a..75acda066 100644 --- a/src/runtime/virtcontainers/documentation/api/1.0/api.md +++ b/src/runtime/virtcontainers/documentation/api/1.0/api.md @@ -284,10 +284,6 @@ type HypervisorConfig struct { // DisableImageNvdimm is used to disable guest rootfs image nvdimm devices DisableImageNvdimm bool - // HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus bool - // HotPlugVFIO is used to indicate if devices need to be hotplugged on the // root port, switch, bridge or no port HotPlugVFIO hv.PCIePort diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index f2c86c6a6..c61b44f67 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -312,117 +312,356 @@ type Param struct { // HypervisorConfig is the hypervisor configuration. 
// nolint: govet type HypervisorConfig struct { - customAssets map[types.AssetType]*types.Asset - SeccompSandbox string - KernelPath string - ImagePath string - InitrdPath string - FirmwarePath string - FirmwareVolumePath string - MachineAccelerators string - CPUFeatures string - HypervisorPath string - HypervisorCtlPath string - GuestPreAttestationKeyset string - BlockDeviceDriver string - HypervisorMachineType string - GuestPreAttestationURI string - GuestPreAttestationMode string - DevicesStatePath string - EntropySource string - SharedFS string - SharedPath string - VirtioFSDaemon string - VirtioFSCache string - FileBackedMemRootDir string - VhostUserStorePath string - GuestMemoryDumpPath string - GuestHookPath string - VMid string - VMStorePath string - RunStorePath string - SELinuxProcessLabel string - JailerPath string - MemoryPath string - SEVCertChainPath string - BlockDeviceAIO string - User string - RemoteHypervisorSocket string - SandboxName string - SandboxNamespace string - JailerPathList []string - EntropySourceList []string - VirtioFSDaemonList []string - VirtioFSExtraArgs []string - EnableAnnotations []string - FileBackedMemRootList []string - PFlash []string - VhostUserStorePathList []string - HypervisorCtlPathList []string - KernelParams []Param - Groups []uint32 - HypervisorPathList []string - HypervisorParams []Param - DiskRateLimiterBwOneTimeBurst int64 - DiskRateLimiterOpsMaxRate int64 - RootfsType string - VhostUserDeviceReconnect uint32 + // customAssets is a map of assets. + // Each value in that map takes precedence over the configured assets. + // For example, if there is a value for the "kernel" key in this map, + // it will be used for the sandbox's kernel path instead of KernelPath. + customAssets map[types.AssetType]*types.Asset + + // Supplementary group IDs. + Groups []uint32 + + // KernelPath is the guest kernel host path. + KernelPath string + + // ImagePath is the guest image host path. 
+ ImagePath string + + // InitrdPath is the guest initrd image host path. + // ImagePath and InitrdPath cannot be set at the same time. + InitrdPath string + + // RootfsType is filesystem type of rootfs. + RootfsType string + + // FirmwarePath is the bios host path + FirmwarePath string + + // FirmwareVolumePath is the configuration volume path for the firmware + FirmwareVolumePath string + + // MachineAccelerators are machine specific accelerators + MachineAccelerators string + + // CPUFeatures are cpu specific features + CPUFeatures string + + // HypervisorPath is the hypervisor executable host path. + HypervisorPath string + + // HypervisorCtlPath is the hypervisor ctl executable host path. + HypervisorCtlPath string + + // JailerPath is the jailer executable host path. + JailerPath string + + // BlockDeviceDriver specifies the driver to be used for block device + // either VirtioSCSI or VirtioBlock with the default driver being defaultBlockDriver + BlockDeviceDriver string + + // HypervisorMachineType specifies the type of machine being + // emulated. + HypervisorMachineType string + + // MemoryPath is the memory file path of VM memory. Used when either BootToBeTemplate or + // BootFromTemplate is true. + MemoryPath string + + // DevicesStatePath is the VM device state file path. Used when either BootToBeTemplate or + // BootFromTemplate is true. 
+ DevicesStatePath string + + // EntropySource is the path to a host source of + // entropy (/dev/random, /dev/urandom or real hardware RNG device) + EntropySource string + + // Shared file system type: + // - virtio-9p + // - virtio-fs (default) + SharedFS string + + // Path for filesystem sharing + SharedPath string + + // VirtioFSDaemon is the virtio-fs vhost-user daemon path + VirtioFSDaemon string + + // VirtioFSCache cache mode for fs version cache + VirtioFSCache string + + // File based memory backend root directory + FileBackedMemRootDir string + + // VhostUserStorePath is the directory path where vhost-user devices + // related folders, sockets and device nodes should be. + VhostUserStorePath string + + // VhostUserDeviceReconnect is the timeout for reconnecting on non-server spdk sockets + // when the remote end goes away. Zero disables reconnecting. + VhostUserDeviceReconnect uint32 + + // GuestCoredumpPath is the path in host for saving guest memory dump + GuestMemoryDumpPath string + + // GuestHookPath is the path within the VM that will be used for 'drop-in' hooks + GuestHookPath string + + // VMid is the id of the VM that create the hypervisor if the VM is created by the factory. + // VMid is "" if the hypervisor is not created by the factory. 
+ VMid string + + // VMStorePath is the location on disk where VM information will persist + VMStorePath string + + // VMStorePath is the location on disk where runtime information will persist + RunStorePath string + + // SELinux label for the VM + SELinuxProcessLabel string + + // HypervisorPathList is the list of hypervisor paths names allowed in annotations + HypervisorPathList []string + + // HypervisorCtlPathList is the list of hypervisor control paths names allowed in annotations + HypervisorCtlPathList []string + + // JailerPathList is the list of jailer paths names allowed in annotations + JailerPathList []string + + // EntropySourceList is the list of valid entropy sources + EntropySourceList []string + + // VirtioFSDaemonList is the list of valid virtiofs names for annotations + VirtioFSDaemonList []string + + // VirtioFSExtraArgs passes options to virtiofsd daemon + VirtioFSExtraArgs []string + + // Enable annotations by name + EnableAnnotations []string + + // FileBackedMemRootList is the list of valid root directories values for annotations + FileBackedMemRootList []string + + // PFlash image paths + PFlash []string + + // VhostUserStorePathList is the list of valid values for vhost-user paths + VhostUserStorePathList []string + + // SeccompSandbox is the qemu function which enables the seccomp feature + SeccompSandbox string + + // BlockiDeviceAIO specifies the I/O API to be used. + BlockDeviceAIO string + + // The user maps to the uid. + User string + + // KernelParams are additional guest kernel parameters. + KernelParams []Param + + // HypervisorParams are additional hypervisor parameters. + HypervisorParams []Param + + // SGXEPCSize specifies the size in bytes for the EPC Section. + // Enable SGX. Hardware-based isolation and memory encryption. + SGXEPCSize int64 + + // DiskRateLimiterBwRate is used to control disk I/O bandwidth on VM level. + // The same value, defined in bits per second, is used for inbound and outbound bandwidth. 
+ DiskRateLimiterBwMaxRate int64 + + // DiskRateLimiterBwOneTimeBurst is used to control disk I/O bandwidth on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + DiskRateLimiterBwOneTimeBurst int64 + + // DiskRateLimiterOpsRate is used to control disk I/O operations on VM level. + // The same value, defined in operations per second, is used for inbound and outbound bandwidth. + DiskRateLimiterOpsMaxRate int64 + + // DiskRateLimiterOpsOneTimeBurst is used to control disk I/O operations on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. DiskRateLimiterOpsOneTimeBurst int64 - SGXEPCSize int64 - DefaultMaxMemorySize uint64 - NetRateLimiterBwMaxRate int64 - NetRateLimiterBwOneTimeBurst int64 - NetRateLimiterOpsMaxRate int64 - NetRateLimiterOpsOneTimeBurst int64 - MemOffset uint64 - TxRateLimiterMaxRate uint64 - DiskRateLimiterBwMaxRate int64 - RxRateLimiterMaxRate uint64 - MemorySize uint32 - DefaultMaxVCPUs uint32 - DefaultBridges uint32 - Msize9p uint32 - MemSlots uint32 - VirtioFSCacheSize uint32 - VirtioFSQueueSize uint32 - Uid uint32 - Gid uint32 - SEVGuestPolicy uint32 - SNPGuestPolicy uint64 - NumVCPUs uint32 - RemoteHypervisorTimeout uint32 - IOMMUPlatform bool - EnableIOThreads bool - Debug bool - MemPrealloc bool - HugePages bool - VirtioMem bool - IOMMU bool - DisableBlockDeviceUse bool - DisableNestingChecks bool - DisableImageNvdimm bool - HotplugVFIOOnRootBus bool - GuestMemoryDumpPaging bool - ConfidentialGuest bool - SevSnpGuest bool - GuestPreAttestation bool - BlockDeviceCacheNoflush bool - BlockDeviceCacheDirect bool - BlockDeviceCacheSet bool - BootToBeTemplate bool - BootFromTemplate bool - DisableVhostNet bool - EnableVhostUserStore bool - GuestSwap bool - Rootless bool - DisableSeccomp bool - DisableSeLinux bool - DisableGuestSeLinux bool 
- LegacySerial bool - HotPlugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort - VFIODevices []config.DeviceInfo - VhostUserBlkDevices []config.DeviceInfo + + // RxRateLimiterMaxRate is used to control network I/O inbound bandwidth on VM level. + RxRateLimiterMaxRate uint64 + + // TxRateLimiterMaxRate is used to control network I/O outbound bandwidth on VM level. + TxRateLimiterMaxRate uint64 + + // NetRateLimiterBwRate is used to control network I/O bandwidth on VM level. + // The same value, defined in bits per second, is used for inbound and outbound bandwidth. + NetRateLimiterBwMaxRate int64 + + // NetRateLimiterBwOneTimeBurst is used to control network I/O bandwidth on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + NetRateLimiterBwOneTimeBurst int64 + + // NetRateLimiterOpsRate is used to control network I/O operations on VM level. + // The same value, defined in operations per second, is used for inbound and outbound bandwidth. + NetRateLimiterOpsMaxRate int64 + + // NetRateLimiterOpsOneTimeBurst is used to control network I/O operations on VM level. + // This increases the initial max rate and this initial extra credit does *NOT* replenish + // and can be used for an *initial* burst of data. + NetRateLimiterOpsOneTimeBurst int64 + + // MemOffset specifies memory space for nvdimm device + MemOffset uint64 + + // VFIODevices are used to get PCIe device info early before the sandbox + // is started to make better PCIe topology decisions + VFIODevices []config.DeviceInfo + // VhostUserBlkDevices are handled differently in Q35 and Virt machine + // type. 
capture them early before the sandbox to make better PCIe topology + // decisions + VhostUserBlkDevices []config.DeviceInfo + + // HotplugVFIO is used to indicate if devices need to be hotplugged on the + // root port or a switch + HotPlugVFIO config.PCIePort + + // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the + // root port, switch or no port + ColdPlugVFIO config.PCIePort + + // NumVCPUs specifies default number of vCPUs for the VM. + NumVCPUs uint32 + + //DefaultMaxVCPUs specifies the maximum number of vCPUs for the VM. + DefaultMaxVCPUs uint32 + + // DefaultMem specifies default memory size in MiB for the VM. + MemorySize uint32 + + // DefaultMaxMemorySize specifies the maximum amount of RAM in MiB for the VM. + DefaultMaxMemorySize uint64 + + // DefaultBridges specifies default number of bridges for the VM. + // Bridges can be used to hot plug devices + DefaultBridges uint32 + + // Msize9p is used as the msize for 9p shares + Msize9p uint32 + + // MemSlots specifies default memory slots the VM. + MemSlots uint32 + + // VirtioFSCacheSize is the DAX cache size in MiB + VirtioFSCacheSize uint32 + + // Size of virtqueues + VirtioFSQueueSize uint32 + + // User ID. + Uid uint32 + + // Group ID. + Gid uint32 + + // BlockDeviceCacheSet specifies cache-related options will be set to block devices or not. + BlockDeviceCacheSet bool + + // BlockDeviceCacheDirect specifies cache-related options for block devices. + // Denotes whether use of O_DIRECT (bypass the host page cache) is enabled. + BlockDeviceCacheDirect bool + + // BlockDeviceCacheNoflush specifies cache-related options for block devices. + // Denotes whether flush requests for the device are ignored. + BlockDeviceCacheNoflush bool + + // DisableBlockDeviceUse disallows a block device from being used. + DisableBlockDeviceUse bool + + // EnableIOThreads enables IO to be processed in a separate thread. + // Supported currently for virtio-scsi driver. 
+ EnableIOThreads bool + + // Debug changes the default hypervisor and kernel parameters to + // enable debug output where available. And Debug also enable the hmp socket. + Debug bool + + // MemPrealloc specifies if the memory should be pre-allocated + MemPrealloc bool + + // HugePages specifies if the memory should be pre-allocated from huge pages + HugePages bool + + // VirtioMem is used to enable/disable virtio-mem + VirtioMem bool + + // IOMMU specifies if the VM should have a vIOMMU + IOMMU bool + + // IOMMUPlatform is used to indicate if IOMMU_PLATFORM is enabled for supported devices + IOMMUPlatform bool + + // DisableNestingChecks is used to override customizations performed + // when running on top of another VMM. + DisableNestingChecks bool + + // DisableImageNvdimm is used to disable guest rootfs image nvdimm devices + DisableImageNvdimm bool + + // GuestMemoryDumpPaging is used to indicate if enable paging + // for QEMU dump-guest-memory command + GuestMemoryDumpPaging bool + + // Enable confidential guest support. + // Enable or disable different hardware features, ranging + // from memory encryption to both memory and CPU-state encryption and integrity. 
+ ConfidentialGuest bool + + // Enable SEV-SNP guests on AMD machines capable of both + SevSnpGuest bool + + // BootToBeTemplate used to indicate if the VM is created to be a template VM + BootToBeTemplate bool + + // BootFromTemplate used to indicate if the VM should be created from a template VM + BootFromTemplate bool + + // DisableVhostNet is used to indicate if host supports vhost_net + DisableVhostNet bool + + // EnableVhostUserStore is used to indicate if host supports vhost-user-blk/scsi + EnableVhostUserStore bool + + // GuestSwap Used to enable/disable swap in the guest + GuestSwap bool + + // Rootless is used to enable rootless VMM process + Rootless bool + + // Disable seccomp from the hypervisor process + DisableSeccomp bool + + // Disable selinux from the hypervisor process + DisableSeLinux bool + + // Disable selinux from the container process + DisableGuestSeLinux bool + + // Use legacy serial for the guest console + LegacySerial bool + + GuestPreAttestation bool + GuestPreAttestationKeyset string + GuestPreAttestationURI string + GuestPreAttestationMode string + + RemoteHypervisorSocket string + RemoteHypervisorTimeout uint32 + SandboxName string + SandboxNamespace string + + SEVCertChainPath string + SEVGuestPolicy uint32 + SNPGuestPolicy uint64 } // vcpu mapping from vcpu number to thread number diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index 91ab51ebf..aa1875a3b 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -244,7 +244,6 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { FileBackedMemRootList: sconfig.HypervisorConfig.FileBackedMemRootList, DisableNestingChecks: sconfig.HypervisorConfig.DisableNestingChecks, DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, - HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: 
sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, @@ -485,7 +484,6 @@ func loadSandboxConfig(id string) (*SandboxConfig, error) { FileBackedMemRootList: hconf.FileBackedMemRootList, DisableNestingChecks: hconf.DisableNestingChecks, DisableImageNvdimm: hconf.DisableImageNvdimm, - HotplugVFIOOnRootBus: hconf.HotplugVFIOOnRootBus, HotPlugVFIO: hconf.HotPlugVFIO, ColdPlugVFIO: hconf.ColdPlugVFIO, BootToBeTemplate: hconf.BootToBeTemplate, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index 6ca5ee690..5b43ab24a 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -191,10 +191,6 @@ type HypervisorConfig struct { // DisableImageNvdimm disables nvdimm for guest rootfs image DisableImageNvdimm bool - // HotplugVFIOOnRootBus is used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus bool - // HotPlugVFIO is used to indicate if devices need to be hotplugged on the // root, switch, bridge or no-port HotPlugVFIO config.PCIePort diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go index 9a1811254..3cc242062 100644 --- a/src/runtime/virtcontainers/pkg/annotations/annotations.go +++ b/src/runtime/virtcontainers/pkg/annotations/annotations.go @@ -139,10 +139,6 @@ const ( // DisableImageNvdimm is a sandbox annotation to specify use of nvdimm device for guest rootfs image. DisableImageNvdimm = kataAnnotHypervisorPrefix + "disable_image_nvdimm" - // HotplugVFIOOnRootBus is a sandbox annotation used to indicate if devices need to be hotplugged on the - // root bus instead of a bridge. - HotplugVFIOOnRootBus = kataAnnotHypervisorPrefix + "hotplug_vfio_on_root_bus" - // ColdPlugVFIO is a sandbox annotation used to indicate if devices need to be coldplugged. 
ColdPlugVFIO = kataAnnotHypervisorPrefix + "cold_plug_vfio" diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 43cb78f21..27d75ecc4 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -66,11 +66,6 @@ const romFile = "" // Default value is false. const defaultDisableModern = false -// A deeper PCIe topology than 5 is already not advisable just for the sake -// of having enough buffer we limit ourselves to 10 and exit if we reach -// the root bus -const maxPCIeTopoDepth = 10 - type qmpChannel struct { qmp *govmmQemu.QMP ctx context.Context @@ -81,15 +76,14 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { - UUID string - HotPlugVFIO config.PCIePort - Bridges []types.Bridge - HotpluggedVCPUs []hv.CPUDevice - HotpluggedMemory int - VirtiofsDaemonPid int - HotplugVFIOOnRootBus bool - HotplugVFIO config.PCIePort - ColdPlugVFIO config.PCIePort + UUID string + HotPlugVFIO config.PCIePort + Bridges []types.Bridge + HotpluggedVCPUs []hv.CPUDevice + HotpluggedMemory int + VirtiofsDaemonPid int + HotplugVFIO config.PCIePort + ColdPlugVFIO config.PCIePort } // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. 
@@ -285,7 +279,6 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.UUID = uuid.Generate().String() q.state.HotPlugVFIO = q.config.HotPlugVFIO q.state.ColdPlugVFIO = q.config.ColdPlugVFIO - q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, @@ -808,6 +801,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig if err != nil { return fmt.Errorf("Cannot get host path for device: %v err: %v", dev, err) } + devicesPerIOMMUGroup, err := drivers.GetAllVFIODevicesFromIOMMUGroup(dev) if err != nil { return fmt.Errorf("Cannot get all VFIO devices from IOMMU group with device: %v err: %v", dev, err) @@ -818,7 +812,7 @@ func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig } } } - vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus) + vfioOnRootPort := (q.state.HotPlugVFIO == config.RootPort || q.state.ColdPlugVFIO == config.RootPort) vfioOnSwitchPort := (q.state.HotPlugVFIO == config.SwitchPort || q.state.ColdPlugVFIO == config.SwitchPort) numOfVhostUserBlockDevices := len(hypervisorConfig.VhostUserBlkDevices) @@ -1723,7 +1717,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V config.PCIeDevices[config.RootPort][devID] = true bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) - bridgeSlot, err := q.qomGetSlot(bridgeQomPath) + bridgeSlot, err := q.arch.qomGetSlot(bridgeQomPath, &q.qmpMonitorCh) if err != nil { return err } @@ -1826,88 +1820,6 @@ func (q *qemu) hotplugVhostUserDevice(ctx context.Context, vAttr *config.VhostUs } } -// Query QMP to find the PCI slot of a device, given its QOM path or ID -func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) { - addr, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qomPath, 
"addr") - if err != nil { - return types.PciSlot{}, err - } - addrf, ok := addr.(float64) - // XXX going via float makes no real sense, but that's how - // JSON works, and we'll get away with it for the small values - // we have here - if !ok { - return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) - } - addri := int(addrf) - - slotNum, funcNum := addri>>3, addri&0x7 - if funcNum != 0 { - return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", - slotNum, funcNum, qomPath) - } - - return types.PciSlotFromInt(slotNum) -} - -// Query QMP to find a device's PCI path given its QOM path or ID -func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { - - var slots []types.PciSlot - - devSlot, err := q.qomGetSlot(qemuID) - if err != nil { - return types.PciPath{}, err - } - slots = append(slots, devSlot) - - // This only works for Q35 and Virt - r, _ := regexp.Compile(`^/machine/.*/pcie.0`) - - var parentPath = qemuID - // We do not want to use a forever loop here, a deeper PCIe topology - // than 5 is already not advisable just for the sake of having enough - // buffer we limit ourselves to 10 and leave the loop early if we hit - // the root bus. - for i := 1; i <= maxPCIeTopoDepth; i++ { - parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") - if err != nil { - return types.PciPath{}, err - } - - busQOM, ok := parenBusQOM.(string) - if !ok { - return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM) - } - - // If we hit /machine/q35/pcie.0 we're done this is the root bus - // we climbed the complete hierarchy - if r.Match([]byte(busQOM)) { - break - } - - // `bus` is the QOM path of the QOM bus object, but we need - // the PCI parent_bus which manages that bus. There doesn't seem - // to be a way to get that other than to simply drop the last - // path component. 
- idx := strings.LastIndex(busQOM, "/") - if idx == -1 { - return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) - } - parentBus := busQOM[:idx] - - parentSlot, err := q.qomGetSlot(parentBus) - if err != nil { - return types.PciPath{}, err - } - - // Prepend the slots, since we're climbing the hierarchy - slots = append([]types.PciSlot{parentSlot}, slots...) - parentPath = parentBus - } - return types.PciPathFromSlots(slots...) -} - func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { return q.executeVFIODeviceAdd(device) } @@ -1937,7 +1849,7 @@ func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, brid case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) case config.VFIOAPDeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1950,7 +1862,7 @@ func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { case config.VFIOPCIDeviceMediatedType: return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) case config.VFIOAPDeviceMediatedType: - return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev, device.ID) default: return fmt.Errorf("Incorrect VFIO device type found") } @@ -1968,46 +1880,43 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op "hot-plug-vfio": q.state.HotPlugVFIO, "device-info": string(buf), }).Info("Start hot-plug VFIO device") - // In case MachineType is q35, a PCIe 
device is hotplugged on - // a PCIe Root Port or alternatively on a PCIe Switch Port - if q.HypervisorConfig().HypervisorMachineType != QemuQ35 && q.HypervisorConfig().HypervisorMachineType != QemuVirt { - device.Bus = "" - } else { - var err error - // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus - // for pc machine type instead of bridge. This is useful for devices that require - // a large PCI BAR which is a currently a limitation with PCI bridges. - if q.state.HotPlugVFIO == config.RootPort || q.state.HotplugVFIOOnRootBus { - err = q.hotplugVFIODeviceRootPort(ctx, device) - } else if q.state.HotPlugVFIO == config.SwitchPort { - err = q.hotplugVFIODeviceSwitchPort(ctx, device) - } else { - err = q.hotplugVFIODeviceBridgePort(ctx, device) - } - if err != nil { - return err - } + + err = fmt.Errorf("Incorrect hot plug configuration %v for device %v found", q.state.HotPlugVFIO, device) + // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus + // for pc machine type instead of bridge. This is useful for devices that require + // a large PCI BAR which is a currently a limitation with PCI bridges. + if q.state.HotPlugVFIO == config.RootPort { + err = q.hotplugVFIODeviceRootPort(ctx, device) + } else if q.state.HotPlugVFIO == config.SwitchPort { + err = q.hotplugVFIODeviceSwitchPort(ctx, device) + } else if q.state.HotPlugVFIO == config.BridgePort { + err = q.hotplugVFIODeviceBridgePort(ctx, device) } - // XXX: Depending on whether we're doing root port or + if err != nil { + return err + } + + // Depending on whether we're doing root port or // bridge hotplug, and how the bridge is set up in // other parts of the code, we may or may not already // have information about the slot number of the // bridge and or the device. 
For simplicity, just - // query both of them back from qemu - device.GuestPciPath, err = q.qomGetPciPath(device.ID) + // query both of them back from qemu based on the arch + device.GuestPciPath, err = q.arch.qomGetPciPath(device.ID, &q.qmpMonitorCh) + return err - } + } else { - q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") + q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") - if !q.state.HotplugVFIOOnRootBus { - if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { - return err + if q.state.HotPlugVFIO == config.BridgePort { + if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { + return err + } } + + return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) } - - return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) - } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { @@ -2966,7 +2875,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.Type = string(QemuHypervisor) s.UUID = q.state.UUID s.HotpluggedMemory = q.state.HotpluggedMemory - s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2988,7 +2896,6 @@ func (q *qemu) Save() (s hv.HypervisorState) { func (q *qemu) Load(s hv.HypervisorState) { q.state.UUID = s.UUID q.state.HotpluggedMemory = s.HotpluggedMemory - q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid for _, bridge := range s.Bridges { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index ead1a4a65..25c193d82 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -13,6 +13,7 @@ import ( "errors" "fmt" "os" + "regexp" "runtime" "strings" @@ -25,6 +26,11 @@ import ( "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" 
) +// A deeper PCIe topology than 5 is already not advisable just for the sake +// of having enough buffer we limit ourselves to 10 and exit if we reach +// the root bus +const maxPCIeTopoDepth = 10 + type qemuArch interface { // enableNestingChecks nesting checks will be honoured enableNestingChecks() @@ -169,6 +175,12 @@ type qemuArch interface { // and 64-bit addressable memory getBARsMaxAddressableMemory() (uint64, uint64) + // Query QMP to find a device's PCI path given its QOM path or ID + qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) + + // Query QMP to find the PCI slot of a device, given its QOM path or ID + qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) + // append SEV object type to the VM definition appendSEVObject(devices []govmmQemu.Device, firmware, firmwareVolume string, config sevKbs.GuestPreAttestationConfig) ([]govmmQemu.Device, string, error) @@ -915,6 +927,88 @@ func (q *qemuArchBase) appendProtectionDevice(devices []govmmQemu.Device, firmwa return devices, firmware, nil } +// Query QMP to find the PCI slot of a device, given its QOM path or ID +func (q *qemuArchBase) qomGetSlot(qomPath string, qmpCh *qmpChannel) (types.PciSlot, error) { + addr, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, qomPath, "addr") + if err != nil { + return types.PciSlot{}, err + } + addrf, ok := addr.(float64) + // XXX going via float makes no real sense, but that's how + // JSON works, and we'll get away with it for the small values + // we have here + if !ok { + return types.PciSlot{}, fmt.Errorf("addr QOM property of %q is %T not a number", qomPath, addr) + } + addri := int(addrf) + + slotNum, funcNum := addri>>3, addri&0x7 + if funcNum != 0 { + return types.PciSlot{}, fmt.Errorf("Unexpected non-zero PCI function (%02x.%1x) on %q", + slotNum, funcNum, qomPath) + } + + return types.PciSlotFromInt(slotNum) +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemuArchBase) qomGetPciPath(qemuID 
string, qmpCh *qmpChannel) (types.PciPath, error) { + + var slots []types.PciSlot + + devSlot, err := q.qomGetSlot(qemuID, qmpCh) + if err != nil { + return types.PciPath{}, err + } + slots = append(slots, devSlot) + + // This only works for Q35 and Virt + r, _ := regexp.Compile(`^/machine/.*/pcie.0`) + + var parentPath = qemuID + // We do not want to use a forever loop here, a deeper PCIe topology + // than 5 is already not advisable just for the sake of having enough + // buffer we limit ourselves to 10 and leave the loop early if we hit + // the root bus. + for i := 1; i <= maxPCIeTopoDepth; i++ { + parenBusQOM, err := qmpCh.qmp.ExecQomGet(qmpCh.ctx, parentPath, "parent_bus") + if err != nil { + return types.PciPath{}, err + } + + busQOM, ok := parenBusQOM.(string) + if !ok { + return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, parenBusQOM) + } + + // If we hit /machine/q35/pcie.0 we're done this is the root bus + // we climbed the complete hierarchy + if r.Match([]byte(busQOM)) { + break + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI parent_bus which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(busQOM, "/") + if idx == -1 { + return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) + } + parentBus := busQOM[:idx] + + parentSlot, err := q.qomGetSlot(parentBus, qmpCh) + if err != nil { + return types.PciPath{}, err + } + + // Prepend the slots, since we're climbing the hierarchy + slots = append([]types.PciSlot{parentSlot}, slots...) + parentPath = parentBus + } + return types.PciPathFromSlots(slots...) 
+} + // AMD SEV methods func (q *qemuArchBase) appendSEVObject(devices []govmmQemu.Device, firmware, firmwareVolume string, config sevKbs.GuestPreAttestationConfig) ([]govmmQemu.Device, string, error) { hvLogger.WithField("arch", runtime.GOARCH).Warnf("Confidential Computing has not been implemented for this architecture") diff --git a/src/runtime/virtcontainers/qemu_s390x.go b/src/runtime/virtcontainers/qemu_s390x.go index b0c1ede54..29eaafe5b 100644 --- a/src/runtime/virtcontainers/qemu_s390x.go +++ b/src/runtime/virtcontainers/qemu_s390x.go @@ -351,3 +351,32 @@ func (q *qemuS390x) appendProtectionDevice(devices []govmmQemu.Device, firmware, return devices, firmware, fmt.Errorf("Unsupported guest protection technology: %v", q.protection) } } + +func (q *qemuS390x) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device { + if vfioDev.SysfsDev == "" { + return devices + } + + if len(vfioDev.APDevices) > 0 { + devices = append(devices, + govmmQemu.VFIODevice{ + SysfsDev: vfioDev.SysfsDev, + Transport: govmmQemu.TransportAP, + }, + ) + return devices + + } + devices = append(devices, + govmmQemu.VFIODevice{ + SysfsDev: vfioDev.SysfsDev, + }, + ) + return devices +} + +// Query QMP to find a device's PCI path given its QOM path or ID +func (q *qemuS390x) qomGetPciPath(qemuID string, qmpCh *qmpChannel) (types.PciPath, error) { + hvLogger.Warnf("qomGetPciPath not implemented for s390x") + return types.PciPath{}, nil +} diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index bdd6f296c..d10286a06 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -30,8 +30,6 @@ import ( "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" - cri "github.com/containerd/containerd/pkg/cri/annotations" - crio "github.com/containers/podman/v4/pkg/annotations" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/api" 
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config" "github.com/kata-containers/kata-containers/src/runtime/pkg/device/drivers" @@ -611,67 +609,11 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return nil, err } - if len(sandboxConfig.Containers) > 0 { - // These values are required by remote hypervisor - for _, a := range []string{cri.SandboxName, crio.SandboxName} { - if value, ok := sandboxConfig.Containers[0].Annotations[a]; ok { - sandboxConfig.HypervisorConfig.SandboxName = value - } - } - - for _, a := range []string{cri.SandboxNamespace, crio.Namespace} { - if value, ok := sandboxConfig.Containers[0].Annotations[a]; ok { - sandboxConfig.HypervisorConfig.SandboxNamespace = value - } - } + coldPlugVFIO, err := s.coldOrHotPlugVFIO(&sandboxConfig) + if err != nil { + return nil, err } - // If we have a confidential guest we need to cold-plug the PCIe VFIO devices - // until we have TDISP/IDE PCIe support. - coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) - // Aggregate all the containner devices for hot-plug and use them to dedcue - // the correct amount of ports to reserve for the hypervisor. - hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) - - var vfioDevices []config.DeviceInfo - // vhost-user-block device is a PCIe device in Virt, keep track of it - // for correct number of PCIe root ports. 
- var vhostUserBlkDevices []config.DeviceInfo - - for cnt, containers := range sandboxConfig.Containers { - for dev, device := range containers.DeviceInfos { - - if deviceManager.IsVhostUserBlk(device) { - vhostUserBlkDevices = append(vhostUserBlkDevices, device) - continue - } - isVFIO := deviceManager.IsVFIO(device.ContainerPath) - if hotPlugVFIO && isVFIO { - vfioDevices = append(vfioDevices, device) - sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO - } - if coldPlugVFIO && isVFIO { - device.ColdPlug = true - device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO - vfioDevices = append(vfioDevices, device) - // We need to remove the devices marked for cold-plug - // otherwise at the container level the kata-agent - // will try to hot-plug them. - sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" - } - } - var filteredDevices []config.DeviceInfo - for _, device := range containers.DeviceInfos { - if device.ID != "remove-we-are-cold-plugging" { - filteredDevices = append(filteredDevices, device) - } - } - sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices - - } - sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices - sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices - // store doesn't require hypervisor to be stored immediately if err = s.hypervisor.CreateVM(ctx, s.id, s.network, &sandboxConfig.HypervisorConfig); err != nil { return nil, err @@ -685,7 +627,8 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } - for _, dev := range vfioDevices { + for _, dev := range sandboxConfig.HypervisorConfig.VFIODevices { + s.Logger().Info("cold-plug device: ", dev) _, err := s.AddDevice(ctx, dev) if err != nil { s.Logger().WithError(err).Debug("Cannot cold-plug add device") @@ -695,6 +638,70 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor return s, nil } +func (s *Sandbox) 
coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) { + // If we have a confidential guest we need to cold-plug the PCIe VFIO devices + // until we have TDISP/IDE PCIe support. + coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort) + // Aggregate all the containner devices for hot-plug and use them to dedcue + // the correct amount of ports to reserve for the hypervisor. + hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort) + + modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel) + modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO) + + var vfioDevices []config.DeviceInfo + // vhost-user-block device is a PCIe device in Virt, keep track of it + // for correct number of PCIe root ports. + var vhostUserBlkDevices []config.DeviceInfo + + for cnt, containers := range sandboxConfig.Containers { + for dev, device := range containers.DeviceInfos { + + if deviceManager.IsVhostUserBlk(device) { + vhostUserBlkDevices = append(vhostUserBlkDevices, device) + continue + } + isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath) + isVFIOControlDevice := deviceManager.IsVFIOControlDevice(device.ContainerPath) + // vfio_mode=vfio needs the VFIO control device add it to the list + // of devices to be added to the VM. + if modeIsVFIO && isVFIOControlDevice && !hotPlugVFIO { + vfioDevices = append(vfioDevices, device) + } + + if hotPlugVFIO && isVFIODevice { + device.ColdPlug = false + device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO + vfioDevices = append(vfioDevices, device) + sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO + } + if coldPlugVFIO && isVFIODevice { + device.ColdPlug = true + device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO + vfioDevices = append(vfioDevices, device) + // We need to remove the devices marked for cold-plug + // otherwise at the container level the kata-agent + // will try to hot-plug them. 
+ if modeIsGK { + sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging" + } + } + } + var filteredDevices []config.DeviceInfo + for _, device := range containers.DeviceInfos { + if device.ID != "remove-we-are-cold-plugging" { + filteredDevices = append(filteredDevices, device) + } + } + sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices + } + + sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices + sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices + + return coldPlugVFIO, nil +} + func (s *Sandbox) createResourceController() error { var err error cgroupPath := "" @@ -2069,26 +2076,26 @@ func (s *Sandbox) AddDevice(ctx context.Context, info config.DeviceInfo) (api.De } var err error - b, err := s.devManager.NewDevice(info) + add, err := s.devManager.NewDevice(info) if err != nil { return nil, err } defer func() { if err != nil { - s.devManager.RemoveDevice(b.DeviceID()) + s.devManager.RemoveDevice(add.DeviceID()) } }() - if err = s.devManager.AttachDevice(ctx, b.DeviceID(), s); err != nil { + if err = s.devManager.AttachDevice(ctx, add.DeviceID(), s); err != nil { return nil, err } defer func() { if err != nil { - s.devManager.DetachDevice(ctx, b.DeviceID(), s) + s.devManager.DetachDevice(ctx, add.DeviceID(), s) } }() - return b, nil + return add, nil } // updateResources will: diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index 90a2af7ee..b735c5511 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -606,6 +606,7 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) { HostPath: path, ContainerPath: path, DevType: "c", + Port: config.RootPort, } dev, err := dm.NewDevice(deviceInfo) assert.Nil(t, err, "deviceManager.NewDevice return error: %v", err) diff --git a/src/tools/kata-ctl/Cargo.lock b/src/tools/kata-ctl/Cargo.lock index c7586b6d4..c37d17b5e 100644 --- a/src/tools/kata-ctl/Cargo.lock +++ 
b/src/tools/kata-ctl/Cargo.lock @@ -33,6 +33,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.2.6" @@ -187,6 +202,34 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cgroups-rs" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b098e7c3a70d03c288fa0a96ccf13e770eb3d78c4cc0e1549b3c13215d5f965" +dependencies = [ + "libc", + "log", + "nix 0.25.1", + "regex", + "thiserror", +] + +[[package]] +name = "chrono" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "time 0.1.45", + "wasm-bindgen", + "winapi", +] + [[package]] name = "clap" version = "4.2.1" @@ -230,6 +273,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" +[[package]] +name = "common-path" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2382f75942f4b3be3690fe4f86365e9c853c1587d6ee58212cebf6e2a9ccd101" + [[package]] name = "concolor-override" version = "1.0.0" @@ -358,6 +407,17 @@ dependencies = [ "libc", ] +[[package]] +name = "fail" +version = "0.5.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" +dependencies = [ + "log", + "once_cell", + "rand", +] + [[package]] name = "fastrand" version = "1.8.0" @@ -500,7 +560,7 @@ checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", ] [[package]] @@ -676,6 +736,29 @@ dependencies = [ "tokio", ] +[[package]] +name = "iana-time-zone" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "idna" version = "0.3.0" @@ -778,6 +861,7 @@ dependencies = [ "epoll", "futures", "hyper", + "kata-sys-util", "kata-types", "libc", "logging", @@ -805,6 +889,30 @@ dependencies = [ "vmm-sys-util", ] +[[package]] +name = "kata-sys-util" +version = "0.1.0" +dependencies = [ + "anyhow", + "byteorder", + "cgroups-rs", + "chrono", + "common-path", + "fail", + "kata-types", + "lazy_static", + "libc", + "nix 0.24.3", + "oci", + "once_cell", + "rand", + "serde_json", + "slog", + "slog-scope", + "subprocess", + "thiserror", +] + [[package]] name = "kata-types" version = "0.1.0" @@ -923,7 +1031,7 @@ checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.42.0", ] @@ -1004,6 +1112,15 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "num-traits" +version = "0.2.15" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +dependencies = [ + "autocfg", +] + [[package]] name = "num_cpus" version = "1.15.0" @@ -1163,6 +1280,12 @@ version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + [[package]] name = "privdrop" version = "0.5.3" @@ -1319,6 +1442,36 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -1641,7 +1794,7 @@ dependencies = [ "serde", "serde_json", "slog", - "time", + "time 0.3.17", ] [[package]] @@ -1665,7 +1818,7 @@ dependencies = [ "slog", "term", "thread_local", - "time", + "time 0.3.17", ] [[package]] @@ -1721,6 +1874,16 @@ dependencies = [ "syn 1.0.107", ] +[[package]] +name = "subprocess" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2e86926081dda636c546d8c5e641661049d7562a68f5488be4a1f7f66f6086" 
+dependencies = [ + "libc", + "winapi", +] + [[package]] name = "syn" version = "1.0.107" @@ -1821,6 +1984,17 @@ dependencies = [ "once_cell", ] +[[package]] +name = "time" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi", +] + [[package]] name = "time" version = "0.3.17" @@ -2114,6 +2288,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2248,6 +2428,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.0", +] + [[package]] name = "windows-sys" version = "0.42.0" diff --git a/src/tools/kata-ctl/Cargo.toml b/src/tools/kata-ctl/Cargo.toml index a0ba95adb..2c4582432 100644 --- a/src/tools/kata-ctl/Cargo.toml +++ b/src/tools/kata-ctl/Cargo.toml @@ -30,6 +30,7 @@ sys-info = "0.9.1" shim-interface = { path = "../../libs/shim-interface"} kata-types = { path = "../../libs/kata-types" } +kata-sys-util = { path = "../../../src/libs/kata-sys-util/" } safe-path = { path = "../../libs/safe-path" } agent = { path = "../../runtime-rs/crates/agent"} serial_test = "0.5.1" diff --git a/src/tools/kata-ctl/src/arch/aarch64/mod.rs b/src/tools/kata-ctl/src/arch/aarch64/mod.rs index 41d28f8db..01cd4828a 100644 --- a/src/tools/kata-ctl/src/arch/aarch64/mod.rs +++ 
b/src/tools/kata-ctl/src/arch/aarch64/mod.rs @@ -84,10 +84,4 @@ mod arch_specific { // TODO: Not implemented Ok(true) } - - #[allow(dead_code)] - // Guest protection is not supported on ARM64. - pub fn available_guest_protection() -> Result { - Ok(check::GuestProtection::NoProtection) - } } diff --git a/src/tools/kata-ctl/src/arch/powerpc64le/mod.rs b/src/tools/kata-ctl/src/arch/powerpc64le/mod.rs index 436d5a4d2..bf5822c04 100644 --- a/src/tools/kata-ctl/src/arch/powerpc64le/mod.rs +++ b/src/tools/kata-ctl/src/arch/powerpc64le/mod.rs @@ -32,22 +32,4 @@ mod arch_specific { // to get cpu details specific to powerpc architecture similar // to the goloang implementation of function getCPUDetails() } - - pub fn host_is_vmcontainer_capable() -> Result { - // TODO: Not implemented - Ok(true) - } - - pub fn available_guest_protection() -> Result { - if !Uid::effective().is_root() { - return Err(check::ProtectionError::NoPerms); - } - - let metadata = fs::metadata(PEF_SYS_FIRMWARE_DIR); - if metadata.is_ok() && metadata.unwrap().is_dir() { - Ok(check::GuestProtection::Pef) - } - - Ok(check::GuestProtection::NoProtection) - } } diff --git a/src/tools/kata-ctl/src/arch/s390x/mod.rs b/src/tools/kata-ctl/src/arch/s390x/mod.rs index 991d676d1..6f3ee1f4e 100644 --- a/src/tools/kata-ctl/src/arch/s390x/mod.rs +++ b/src/tools/kata-ctl/src/arch/s390x/mod.rs @@ -12,9 +12,7 @@ mod arch_specific { use crate::types::*; use crate::utils; use anyhow::{anyhow, Result}; - use nix::unistd::Uid; use slog::{info, o, warn}; - use std::collections::HashMap; use std::io::BufRead; use std::io::BufReader; @@ -37,15 +35,17 @@ mod arch_specific { fn check_cpu() -> Result<()> { info!(sl!(), "check CPU: s390x"); - let cpu_info = check::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER)?; + let cpu_info = + kata_sys_util::cpu::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER)?; - let cpu_features = check::get_cpu_flags(&cpu_info, CPUINFO_FEATURES_TAG).map_err(|e| { - anyhow!( - "Error 
parsing CPU features, file {:?}, {:?}", - check::PROC_CPUINFO, - e - ) - })?; + let cpu_features = kata_sys_util::cpu::get_cpu_flags(&cpu_info, CPUINFO_FEATURES_TAG) + .map_err(|e| { + anyhow!( + "Error parsing CPU features, file {:?}, {:?}", + check::PROC_CPUINFO, + e + ) + })?; let missing_cpu_features = check::check_cpu_flags(&cpu_features, CPU_FEATURES_REQ)?; if !missing_cpu_features.is_empty() { @@ -93,41 +93,6 @@ mod arch_specific { Err(anyhow!("System is not capable of running a VM")) } - #[allow(dead_code)] - fn retrieve_cpu_facilities() -> Result> { - let f = std::fs::File::open(check::PROC_CPUINFO)?; - let mut reader = BufReader::new(f); - let mut contents = String::new(); - let facilities_field = "facilities"; - let mut facilities = HashMap::new(); - - while reader.read_line(&mut contents)? > 0 { - let fields: Vec<&str> = contents.split_whitespace().collect(); - if fields.len() < 2 { - contents.clear(); - continue; - } - - if !fields[0].starts_with(facilities_field) { - contents.clear(); - continue; - } - - let mut start = 1; - if fields[1] == ":" { - start = 2; - } - - for field in fields.iter().skip(start) { - let bit = field.parse::()?; - facilities.insert(bit, true); - } - return Ok(facilities); - } - - Ok(facilities) - } - #[allow(dead_code)] pub fn check_cmd_line( kernel_cmdline_path: &str, @@ -174,41 +139,4 @@ mod arch_specific { // to get cpu details specific to s390x architecture similar // to the goloang implementation of function getS390xCPUDetails() } - - #[allow(dead_code)] - // Guest protection is not supported on ARM64. 
- pub fn available_guest_protection() -> Result { - if !Uid::effective().is_root() { - return Err(check::ProtectionError::NoPerms)?; - } - - let facilities = retrieve_cpu_facilities().map_err(|err| { - check::ProtectionError::CheckFailed(format!( - "Error retrieving cpu facilities file : {}", - err.to_string() - )) - })?; - - // Secure Execution - // https://www.kernel.org/doc/html/latest/virt/kvm/s390-pv.html - let se_cpu_facility_bit: i32 = 158; - if !facilities.contains_key(&se_cpu_facility_bit) { - return Ok(check::GuestProtection::NoProtection); - } - - let cmd_line_values = vec!["1", "on", "y", "yes"]; - let se_cmdline_param = "prot_virt"; - - let se_cmdline_present = - check_cmd_line("/proc/cmdline", se_cmdline_param, &cmd_line_values) - .map_err(|err| check::ProtectionError::CheckFailed(err.to_string()))?; - - if !se_cmdline_present { - return Err(check::ProtectionError::InvalidValue(String::from( - "Protected Virtualization is not enabled on kernel command line!", - ))); - } - - Ok(check::GuestProtection::Se) - } } diff --git a/src/tools/kata-ctl/src/arch/x86_64/mod.rs b/src/tools/kata-ctl/src/arch/x86_64/mod.rs index 67fd4ddc8..3d6df8ed7 100644 --- a/src/tools/kata-ctl/src/arch/x86_64/mod.rs +++ b/src/tools/kata-ctl/src/arch/x86_64/mod.rs @@ -10,14 +10,10 @@ pub use arch_specific::*; mod arch_specific { use crate::check; - use crate::check::{GuestProtection, ProtectionError}; use crate::types::*; use crate::utils; use anyhow::{anyhow, Context, Result}; - use nix::unistd::Uid; use slog::{info, o, warn}; - use std::fs; - use std::path::Path; const CPUINFO_DELIMITER: &str = "\nprocessor"; const CPUINFO_FLAGS_TAG: &str = "flags"; @@ -93,15 +89,17 @@ mod arch_specific { fn check_cpu(_args: &str) -> Result<()> { info!(sl!(), "check CPU: x86_64"); - let cpu_info = check::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER)?; + let cpu_info = + kata_sys_util::cpu::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER)?; - let cpu_flags = 
check::get_cpu_flags(&cpu_info, CPUINFO_FLAGS_TAG).map_err(|e| { - anyhow!( - "Error parsing CPU flags, file {:?}, {:?}", - check::PROC_CPUINFO, - e - ) - })?; + let cpu_flags = + kata_sys_util::cpu::get_cpu_flags(&cpu_info, CPUINFO_FLAGS_TAG).map_err(|e| { + anyhow!( + "Error parsing CPU flags, file {:?}, {:?}", + check::PROC_CPUINFO, + e + ) + })?; // perform checks // TODO: Perform checks based on hypervisor type @@ -118,20 +116,6 @@ mod arch_specific { Ok(()) } - fn retrieve_cpu_flags() -> Result { - let cpu_info = check::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER)?; - - let cpu_flags = check::get_cpu_flags(&cpu_info, CPUINFO_FLAGS_TAG).map_err(|e| { - anyhow!( - "Error parsing CPU flags, file {:?}, {:?}", - check::PROC_CPUINFO, - e - ) - })?; - - Ok(cpu_flags) - } - pub fn get_cpu_details() -> Result<(String, String)> { utils::get_generic_cpu_details(check::PROC_CPUINFO) } @@ -145,68 +129,8 @@ mod arch_specific { result.context("KVM check failed") } - pub const TDX_SYS_FIRMWARE_DIR: &str = "/sys/firmware/tdx_seam/"; - pub const TDX_CPU_FLAG: &str = "tdx"; - pub const SEV_KVM_PARAMETER_PATH: &str = "/sys/module/kvm_amd/parameters/sev"; - pub const SNP_KVM_PARAMETER_PATH: &str = "/sys/module/kvm_amd/parameters/sev_snp"; - - pub fn available_guest_protection() -> Result { - if !Uid::effective().is_root() { - return Err(ProtectionError::NoPerms); - } - - arch_guest_protection( - TDX_SYS_FIRMWARE_DIR, - TDX_CPU_FLAG, - SEV_KVM_PARAMETER_PATH, - SNP_KVM_PARAMETER_PATH, - ) - } - - pub fn arch_guest_protection( - tdx_path: &str, - tdx_flag: &str, - sev_path: &str, - snp_path: &str, - ) -> Result { - let flags = - retrieve_cpu_flags().map_err(|err| ProtectionError::CheckFailed(err.to_string()))?; - - let metadata = fs::metadata(tdx_path); - - if metadata.is_ok() && metadata.unwrap().is_dir() && flags.contains(tdx_flag) { - return Ok(GuestProtection::Tdx); - } - - let check_contents = |file_name: &str| -> Result { - let file_path = Path::new(file_name); 
- if !file_path.exists() { - return Ok(false); - } - - let contents = fs::read_to_string(file_name).map_err(|err| { - ProtectionError::CheckFailed(format!("Error reading file {} : {}", file_name, err)) - })?; - - if contents == "Y" { - return Ok(true); - } - Ok(false) - }; - - if check_contents(snp_path)? { - return Ok(GuestProtection::Snp); - } - - if check_contents(sev_path)? { - return Ok(GuestProtection::Sev); - } - - Ok(GuestProtection::NoProtection) - } - fn running_on_vmm() -> Result { - match check::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER) { + match kata_sys_util::cpu::get_single_cpu_info(check::PROC_CPUINFO, CPUINFO_DELIMITER) { Ok(cpu_info) => { // check if the 'hypervisor' flag exist in the cpu features let missing_hypervisor_flag = check::check_cpu_attribs(&cpu_info, VMM_FLAGS)?; @@ -357,74 +281,3 @@ mod arch_specific { Err(anyhow!("System is not capable of running a VM")) } } - -#[cfg(target_arch = "x86_64")] -#[cfg(test)] -mod tests { - use super::*; - use crate::check; - use nix::unistd::Uid; - use std::fs; - use std::io::Write; - use tempfile::tempdir; - - #[test] - fn test_available_guest_protection_no_privileges() { - if !Uid::effective().is_root() { - let res = available_guest_protection(); - assert!(res.is_err()); - assert_eq!( - "No permission to check guest protection", - res.unwrap_err().to_string() - ); - } - } - - fn test_arch_guest_protection_snp() { - // Test snp - let dir = tempdir().unwrap(); - let snp_file_path = dir.path().join("sev_snp"); - let path = snp_file_path.clone(); - let mut snp_file = fs::File::create(snp_file_path).unwrap(); - writeln!(snp_file, "Y").unwrap(); - - let actual = - arch_guest_protection("/xyz/tmp", TDX_CPU_FLAG, "/xyz/tmp", path.to_str().unwrap()); - assert!(actual.is_ok()); - assert_eq!(actual.unwrap(), check::GuestProtection::Snp); - - writeln!(snp_file, "N").unwrap(); - let actual = - arch_guest_protection("/xyz/tmp", TDX_CPU_FLAG, "/xyz/tmp", path.to_str().unwrap()); - 
assert!(actual.is_ok()); - assert_eq!(actual.unwrap(), check::GuestProtection::NoProtection); - } - - fn test_arch_guest_protection_sev() { - // Test sev - let dir = tempdir().unwrap(); - let sev_file_path = dir.path().join("sev"); - let sev_path = sev_file_path.clone(); - let mut sev_file = fs::File::create(sev_file_path).unwrap(); - writeln!(sev_file, "Y").unwrap(); - - let actual = arch_guest_protection( - "/xyz/tmp", - TDX_CPU_FLAG, - sev_path.to_str().unwrap(), - "/xyz/tmp", - ); - assert!(actual.is_ok()); - assert_eq!(actual.unwrap(), check::GuestProtection::Sev); - - writeln!(sev_file, "N").unwrap(); - let actual = arch_guest_protection( - "/xyz/tmp", - TDX_CPU_FLAG, - sev_path.to_str().unwrap(), - "/xyz/tmp", - ); - assert!(actual.is_ok()); - assert_eq!(actual.unwrap(), check::GuestProtection::NoProtection); - } -} diff --git a/src/tools/kata-ctl/src/check.rs b/src/tools/kata-ctl/src/check.rs index 78e30a93a..3bf899ad2 100644 --- a/src/tools/kata-ctl/src/check.rs +++ b/src/tools/kata-ctl/src/check.rs @@ -16,8 +16,6 @@ use nix::{ioctl_write_int_bad, request_code_none}; use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use serde::{Deserialize, Serialize}; use slog::{info, o}; -use std::fmt; -use thiserror::Error; #[cfg(any(target_arch = "x86_64"))] use std::process::{Command, Stdio}; @@ -61,57 +59,6 @@ macro_rules! 
sl { }; } -fn read_file_contents(file_path: &str) -> Result { - let contents = std::fs::read_to_string(file_path)?; - Ok(contents) -} - -// get_single_cpu_info returns the contents of the first cpu from -// the specified cpuinfo file by parsing based on a specified delimiter -pub fn get_single_cpu_info(cpu_info_file: &str, substring: &str) -> Result { - let contents = read_file_contents(cpu_info_file)?; - - if contents.is_empty() { - return Err(anyhow!(ERR_NO_CPUINFO)); - } - - let subcontents: Vec<&str> = contents.split(substring).collect(); - let result = subcontents - .first() - .ok_or("error splitting contents of cpuinfo") - .map_err(|e| anyhow!(e))? - .to_string(); - Ok(result) -} - -// get_cpu_flags returns a string of cpu flags from cpuinfo, passed in -// as a string -#[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] -pub fn get_cpu_flags(cpu_info: &str, cpu_flags_tag: &str) -> Result { - if cpu_info.is_empty() { - return Err(anyhow!(ERR_NO_CPUINFO)); - } - - if cpu_flags_tag.is_empty() { - return Err(anyhow!("cpu flags delimiter string is empty"))?; - } - - let subcontents: Vec<&str> = cpu_info.split('\n').collect(); - for line in subcontents { - if line.starts_with(cpu_flags_tag) { - let line_data: Vec<&str> = line.split(':').collect(); - let flags = line_data - .last() - .ok_or("error splitting flags in cpuinfo") - .map_err(|e| anyhow!(e))? 
- .to_string(); - return Ok(flags); - } - } - - Ok("".to_string()) -} - // get_missing_strings searches for required (strings) in data and returns // a vector containing the missing strings #[cfg(any(target_arch = "s390x", target_arch = "x86_64"))] @@ -149,43 +96,6 @@ pub fn check_cpu_attribs( Ok(missing_attribs) } -#[allow(dead_code)] -#[derive(Debug, PartialEq)] -pub enum GuestProtection { - NoProtection, - Tdx, - Sev, - Snp, - Pef, - Se, -} - -impl fmt::Display for GuestProtection { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - GuestProtection::Tdx => write!(f, "tdx"), - GuestProtection::Sev => write!(f, "sev"), - GuestProtection::Snp => write!(f, "snp"), - GuestProtection::Pef => write!(f, "pef"), - GuestProtection::Se => write!(f, "se"), - GuestProtection::NoProtection => write!(f, "none"), - } - } -} - -#[allow(dead_code)] -#[derive(Error, Debug)] -pub enum ProtectionError { - #[error("No permission to check guest protection")] - NoPerms, - - #[error("Failed to check guest protection: {0}")] - CheckFailed(String), - - #[error("Invalid guest protection value: {0}")] - InvalidValue(String), -} - pub fn run_network_checks() -> Result<()> { Ok(()) } @@ -397,6 +307,7 @@ mod tests { use super::*; #[cfg(any(target_arch = "x86_64"))] use crate::types::{KernelModule, KernelParam, KernelParamType}; + use kata_sys_util::cpu::{get_cpu_flags, get_single_cpu_info}; use semver::Version; use slog::warn; use std::fs; diff --git a/src/tools/kata-ctl/src/ops/env_ops.rs b/src/tools/kata-ctl/src/ops/env_ops.rs index d687f9617..d28d569da 100644 --- a/src/tools/kata-ctl/src/ops/env_ops.rs +++ b/src/tools/kata-ctl/src/ops/env_ops.rs @@ -9,6 +9,7 @@ use crate::arch::arch_specific; use crate::args::EnvArgument; use crate::ops::version; use crate::utils; +use kata_sys_util::protection; use kata_types::config::TomlConfig; use anyhow::{anyhow, Context, Result}; @@ -251,7 +252,7 @@ fn get_host_info() -> Result { let memory_info = get_memory_info()?; let 
guest_protection = - arch_specific::available_guest_protection().map_err(|e| anyhow!(e.to_string()))?; + protection::available_guest_protection().map_err(|e| anyhow!(e.to_string()))?; let guest_protection = guest_protection.to_string(); diff --git a/src/tools/kata-ctl/src/utils.rs b/src/tools/kata-ctl/src/utils.rs index a292d3b78..3b6e4daee 100644 --- a/src/tools/kata-ctl/src/utils.rs +++ b/src/tools/kata-ctl/src/utils.rs @@ -6,7 +6,6 @@ #![allow(dead_code)] use crate::arch::arch_specific; -use crate::check::get_single_cpu_info; use anyhow::{anyhow, Context, Result}; use std::fs; @@ -106,7 +105,7 @@ pub fn get_distro_details(os_release: &str, os_release_clr: &str) -> Result<(Str #[cfg(any(target_arch = "s390x", target_arch = "x86_64", target_arch = "aarch64"))] pub fn get_generic_cpu_details(cpu_info_file: &str) -> Result<(String, String)> { - let cpu_info = get_single_cpu_info(cpu_info_file, "\n\n")?; + let cpu_info = kata_sys_util::cpu::get_single_cpu_info(cpu_info_file, "\n\n")?; let lines = cpu_info.lines(); let mut vendor = String::new(); let mut model = String::new(); diff --git a/src/tools/runk/README.md b/src/tools/runk/README.md index 7ead13ab1..eddfc73fc 100644 --- a/src/tools/runk/README.md +++ b/src/tools/runk/README.md @@ -175,6 +175,32 @@ $ sudo runk state test $ sudo runk delete test ``` +## Using `runk` from `Docker` + +`runk` can run containers using [`Docker`](https://github.com/docker). + +First, install `Docker` from package by following the +[`Docker` installation instructions](https://docs.docker.com/engine/install/). + +### Running a container with `Docker` command line + +Start the docker daemon: + +```bash +$ sudo dockerd --experimental --add-runtime="runk=/usr/local/bin/runk" +``` + +> **Note:** +> Before starting the `dockerd`, you need to stop the normal docker daemon +> running on your environment (i.e., `systemctl stop docker`). 
+ +Launch a container in a different terminal: + +```bash +$ sudo docker run -it --rm --runtime runk busybox sh +/ # +``` + ## Using `runk` from `Podman` `runk` can run containers using [`Podman`](https://github.com/containers/podman). @@ -192,7 +218,7 @@ $ sudo podman --runtime /usr/local/bin/runk run -it --rm busybox sh > **Note:** > `runk` does not support some commands except > [OCI standard operations](https://github.com/opencontainers/runtime-spec/blob/main/runtime.md#operations) -> yet, so those commands do not work in `Podman`. Regarding commands currently +> yet, so those commands do not work in `Docker/Podman`. Regarding commands currently > implemented in `runk`, see the [Status of `runk`](#status-of-runk) section. ## Using `runk` from `containerd` diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 000000000..122d16071 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +integration/kubernetes/runtimeclass_workloads_work/ diff --git a/tests/common.bash b/tests/common.bash index 090bd7fd9..d71b3f43c 100644 --- a/tests/common.bash +++ b/tests/common.bash @@ -7,6 +7,9 @@ # This file contains common functions that # are being used by our metrics and integration tests +this_script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +export repo_root_dir="$(cd "${this_script_dir}/../" && pwd)" + # Kata tests directory used for storing various test-related artifacts. 
KATA_TESTS_BASEDIR="${KATA_TESTS_BASEDIR:-/var/log/kata-tests}" @@ -23,23 +26,23 @@ KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" RUNTIME="${RUNTIME:-containerd-shim-kata-v2}" -die() { +function die() { local msg="$*" echo -e "[$(basename $0):${BASH_LINENO[0]}] ERROR: $msg" >&2 exit 1 } -warn() { +function warn() { local msg="$*" echo -e "[$(basename $0):${BASH_LINENO[0]}] WARNING: $msg" } -info() { +function info() { local msg="$*" echo -e "[$(basename $0):${BASH_LINENO[0]}] INFO: $msg" } -handle_error() { +function handle_error() { local exit_code="${?}" local line_number="${1:-}" echo -e "[$(basename $0):$line_number] ERROR: $(eval echo "$BASH_COMMAND")" @@ -47,7 +50,7 @@ handle_error() { } trap 'handle_error $LINENO' ERR -waitForProcess() { +function waitForProcess() { wait_time="$1" sleep_time="$2" cmd="$3" @@ -66,7 +69,7 @@ waitForProcess() { # Kata runtime. Of course, the end user can choose any name they # want in reality, but this function knows the names of the default # and recommended Kata docker runtime install names. -is_a_kata_runtime() { +function is_a_kata_runtime() { if [ "$1" = "containerd-shim-kata-v2" ] || [ "$1" = "io.containerd.kata.v2" ]; then echo "1" else @@ -76,7 +79,7 @@ is_a_kata_runtime() { # Gets versions and paths of all the components # list in kata-env -extract_kata_env() { +function extract_kata_env() { RUNTIME_CONFIG_PATH=$(kata-runtime kata-env --json | jq -r .Runtime.Config.Path) RUNTIME_VERSION=$(kata-runtime kata-env --json | jq -r .Runtime.Version | grep Semver | cut -d'"' -f4) RUNTIME_COMMIT=$(kata-runtime kata-env --json | jq -r .Runtime.Version | grep Commit | cut -d'"' -f4) @@ -97,7 +100,7 @@ extract_kata_env() { } # Checks that processes are not running -check_processes() { +function check_processes() { extract_kata_env # Only check the kata-env if we have managed to find the kata executable... @@ -120,7 +123,7 @@ check_processes() { # Clean environment, this function will try to remove all # stopped/running containers. 
-clean_env() +function clean_env() { # If the timeout has not been set, default it to 30s # Docker has a built in 10s default timeout, so make ours @@ -139,7 +142,7 @@ clean_env() fi } -clean_env_ctr() +function clean_env_ctr() { local count_running="$(sudo ctr c list -q | wc -l)" local remaining_attempts=10 @@ -181,7 +184,32 @@ clean_env_ctr() count_tasks="$(sudo ctr t list -q | wc -l)" if (( count_tasks > 0 )); then - die "Can't remove running contaienrs." + die "Can't remove running containers." + fi + + kill_kata_components +} + +# Kills running shim and hypervisor components +function kill_kata_components() { + local kata_bin_dir="/opt/kata/bin" + local shim_path="${kata_bin_dir}/containerd-shim-kata-v2" + local hypervisor_path="${kata_bin_dir}/qemu-system-x86_64" + local pid_shim_count="$(pgrep -fc ${shim_path} || exit 0)" + + [ ${pid_shim_count} -gt "0" ] && sudo kill -SIGKILL "$(pgrep -f ${shim_path})" > /dev/null 2>&1 + + if [ "${KATA_HYPERVISOR}" = 'clh' ]; then + hypervisor_path="${kata_bin_dir}/cloud-hypervisor" + elif [ "${KATA_HYPERVISOR}" != 'qemu' ]; then + echo "Failed to stop the hypervisor: '${KATA_HYPERVISOR}' as it is not recognized" + return + fi + + local pid_hypervisor_count="$(pgrep -fc ${hypervisor_path} || exit 0)" + + if [ ${pid_hypervisor_count} -gt "0" ]; then + sudo kill -SIGKILL "$(pgrep -f ${hypervisor_path})" > /dev/null 2>&1 fi } @@ -189,7 +217,7 @@ clean_env_ctr() # Outputs warnings to stdio if something has gone wrong. 
# # Returns 0 on success, 1 otherwise -restart_systemd_service_with_no_burst_limit() { +function restart_systemd_service_with_no_burst_limit() { local service=$1 info "restart $service service" @@ -224,7 +252,7 @@ restart_systemd_service_with_no_burst_limit() { return 0 } -restart_containerd_service() { +function restart_containerd_service() { restart_systemd_service_with_no_burst_limit containerd || return 1 local retries=5 @@ -241,16 +269,147 @@ restart_containerd_service() { return 0 } -# @path_results: path to the input metric-results folder -# @tarball_fname: path and filename to the output tarball -function compress_metrics_results_dir() -{ - local path_results="${1:-results}" - local tarball_fname="${2:-}" +# Configures containerd +function overwrite_containerd_config() { + containerd_config="/etc/containerd/config.toml" + sudo rm -f "${containerd_config}" + sudo tee "${containerd_config}" << EOF +version = 2 +[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + SystemdCgroup = true - [ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect." - [ ! -d "${path_results}" ] && die "Missing path to the results folder." 
- - cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd - - info "tarball generated: ${tarball_fname}" +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "kata" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata] + runtime_type = "io.containerd.kata.v2" +EOF +} + +function install_kata() { + local kata_tarball="kata-static.tar.xz" + declare -r katadir="/opt/kata" + declare -r destdir="/" + declare -r local_bin_dir="/usr/local/bin/" + + # Removing previous kata installation + sudo rm -rf "${katadir}" + + pushd "${kata_tarball_dir}" + sudo tar -xvf "${kata_tarball}" -C "${destdir}" + popd + + # create symbolic links to kata components + for b in "${katadir}"/bin/* ; do + sudo ln -sf "${b}" "${local_bin_dir}/$(basename $b)" + done + + if [[ ${KATA_HYPERVISOR} == "dragonball" ]]; then + sudo ln -sf "${katadir}/runtime-rs/bin/containerd-shim-kata-v2" "${local_bin_dir}/containerd-shim-kata-${KATA_HYPERVISOR}-v2" + else + sudo ln -sf "${katadir}/bin/containerd-shim-kata-v2" "${local_bin_dir}/containerd-shim-kata-${KATA_HYPERVISOR}-v2" + fi + + sudo ln -sf ${katadir}/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml ${katadir}/share/defaults/kata-containers/configuration.toml + + check_containerd_config_for_kata + restart_containerd_service +} + +function check_containerd_config_for_kata() { + # check containerd config + declare -r line1="default_runtime_name = \"kata\"" + declare -r line2="runtime_type = \"io.containerd.kata.v2\"" + declare -r num_lines_containerd=2 + declare -r containerd_path="/etc/containerd/config.toml" + local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}") + + if [ "${count_matches}" = "${num_lines_containerd}" ]; then + info "containerd ok" + else + info "overwriting containerd configuration w/ a valid one" + overwrite_containerd_config + fi +} + 
+function ensure_yq() { + : "${GOPATH:=${GITHUB_WORKSPACE:-$HOME/go}}" + export GOPATH + export PATH="${GOPATH}/bin:${PATH}" + INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh" +} + +# dependency: What we want to get the version from the versions.yaml file +function get_from_kata_deps() { + local dependency="$1" + versions_file="${repo_root_dir}/versions.yaml" + + command -v yq &>/dev/null || die 'yq command is not in your $PATH' + result=$("yq" read -X "$versions_file" "$dependency") + [ "$result" = "null" ] && result="" + echo "$result" +} + +# project: org/repo format +# base_version: ${major}.${minor} +function get_latest_patch_release_from_a_github_project() { + project="${1}" + base_version="${2}" + + curl --silent https://api.github.com/repos/${project}/releases | jq -r .[].tag_name | grep "^${base_version}.[0-9]*$" -m1 +} + +# base_version: The version to be intalled in the ${major}.${minor} format +function clone_cri_containerd() { + base_version="${1}" + + project="containerd/containerd" + version=$(get_latest_patch_release_from_a_github_project "${project}" "${base_version}") + + rm -rf containerd + git clone -b ${version} https://github.com/${project} +} + +# project: org/repo format +# version: the version of the tarball that will be downloaded +# tarball-name: the name of the tarball that will be downloaded +function download_github_project_tarball() { + project="${1}" + version="${2}" + tarball_name="${3}" + + wget https://github.com/${project}/releases/download/${version}/${tarball_name} +} + +# base_version: The version to be intalled in the ${major}.${minor} format +function install_cri_containerd() { + base_version="${1}" + + project="containerd/containerd" + version=$(get_latest_patch_release_from_a_github_project "${project}" "${base_version}") + + tarball_name="cri-containerd-cni-${version//v}-linux-$(${repo_root_dir}/tests/kata-arch.sh -g).tar.gz" + + download_github_project_tarball "${project}" "${version}" "${tarball_name}" + sudo 
tar -xvf "${tarball_name}" -C / + rm -f "${tarball_name}" + + sudo mkdir -p /etc/containerd + containerd config default | sudo tee /etc/containerd/config.toml +} + +# base_version: The version to be intalled in the ${major}.${minor} format +function install_cri_tools() { + base_version="${1}" + + project="kubernetes-sigs/cri-tools" + version=$(get_latest_patch_release_from_a_github_project "${project}" "${base_version}") + + tarball_name="crictl-${version}-linux-$(${repo_root_dir}/tests/kata-arch.sh -g).tar.gz" + + download_github_project_tarball "${project}" "${version}" "${tarball_name}" + sudo tar -xvf "${tarball_name}" -C /usr/local/bin + rm -f "${tarball_name}" } diff --git a/tests/functional/vfio/gha-run.sh b/tests/functional/vfio/gha-run.sh new file mode 100755 index 000000000..f4cb608de --- /dev/null +++ b/tests/functional/vfio/gha-run.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# Copyright (c) Microsoft Corporation. +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + +kata_tarball_dir="${2:-kata-artifacts}" +vfio_dir="$(dirname "$(readlink -f "$0")")" +source "${vfio_dir}/../../common.bash" + +function install_dependencies() { + info "Installing the dependencies needed for running the vfio tests" +} + +function run() { + info "Running cri-containerd tests using ${KATA_HYPERVISOR} hypervisor" +} + +function main() { + action="${1:-}" + case "${action}" in + install-dependencies) install_dependencies ;; + run) run ;; + *) >&2 die "Invalid argument" ;; + esac +} + +main "$@" diff --git a/tests/install_go.sh b/tests/install_go.sh new file mode 100755 index 000000000..3827bb7b3 --- /dev/null +++ b/tests/install_go.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# Copyright (c) 2018-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + +tmp_dir=$(mktemp -d -t install-go-tmp.XXXXXXXXXX) +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +script_name="$(basename 
"${BASH_SOURCE[0]}")" +force="" +USE_VERSIONS_FILE="" +PROJECT="Kata Containers" + +source "${script_dir}/common.bash" + +install_dest="/usr/local/" + +function finish() { + rm -rf "$tmp_dir" +} + +function usage(){ + exit_code="$1" + cat < + +Args: + : Install a specific go version. + +Example: +${script_name} 1.10 + +Options +-d : destination path, path where go will be installed. +-f : Force remove old go version and install the specified one. +-h : Show this help +-p : Install go defined in ${PROJECT} versions file. + +EOF + + exit "$exit_code" +} + +trap finish EXIT + +pushd "${tmp_dir}" + +while getopts "d:fhp" opt +do + case $opt in + d) install_dest="${OPTARG}" ;; + f) force="true" ;; + h) usage 0 ;; + p) USE_VERSIONS_FILE="true" ;; + esac +done + +shift $(( $OPTIND - 1 )) + +go_version="${1:-""}" + +if [ -z "$go_version" ] && [ "${USE_VERSIONS_FILE}" = "true" ] ;then + go_version=$(get_from_kata_deps "languages.golang.meta.newest-version") +fi + +if [ -z "$go_version" ];then + echo "Missing go version or -p option" + usage 0 +fi + +if command -v go; then + [[ "$(go version)" == *"go${go_version}"* ]] && \ + info "Go ${go_version} already installed" && \ + exit + if [ "${force}" = "true" ]; then + info "removing $(go version)" + sudo rm -rf "${install_dest}/go" + else + die "$(go version) is installed, use -f or remove it before install go ${go_version}" + fi +fi + +goarch=$("${repo_root_dir}/tests/kata-arch.sh" --golang) + +info "Download go version ${go_version}" +kernel_name=$(uname -s) +curl -OL "https://storage.googleapis.com/golang/go${go_version}.${kernel_name,,}-${goarch}.tar.gz" +info "Install go" +mkdir -p "${install_dest}" +sudo tar -C "${install_dest}" -xzf "go${go_version}.${kernel_name,,}-${goarch}.tar.gz" +popd diff --git a/tests/integration/cri-containerd/gha-run.sh b/tests/integration/cri-containerd/gha-run.sh new file mode 100755 index 000000000..c36c96605 --- /dev/null +++ b/tests/integration/cri-containerd/gha-run.sh @@ -0,0 +1,75 @@ 
+#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + +kata_tarball_dir="${2:-kata-artifacts}" +cri_containerd_dir="$(dirname "$(readlink -f "$0")")" +source "${cri_containerd_dir}/../../common.bash" + +function install_dependencies() { + info "Installing the dependencies needed for running the cri-containerd tests" + + # Dependency list of projects that we can rely on the system packages + # - build-essential + # - Theoretically we only need `make`, but doesn't hurt to install + # the whole build-essential group + # - jq + # - podman-docker + # - one of the tests rely on docker to pull an image. + # we've decided to go for podman, instead, as it does *not* bring + # containerd as a dependency + declare -a system_deps=( + build-essential + jq + podman-docker + ) + + sudo apt-get update + sudo apt-get -y install "${system_deps[@]}" + + ensure_yq + ${repo_root_dir}/tests/install_go.sh -p + + # Dependency list of projects that we can install them + # directly from their releases on GitHub: + # - containerd + # - cri-container-cni release tarball already includes CNI plugins + # - cri-tools + declare -a github_deps + github_deps[0]="cri_containerd:$(get_from_kata_deps "externals.containerd.${CONTAINERD_VERSION}")" + github_deps[1]="cri_tools:$(get_from_kata_deps "externals.critools.latest")" + + for github_dep in "${github_deps[@]}"; do + IFS=":" read -r -a dep <<< "${github_dep}" + install_${dep[0]} "${dep[1]}" + done + + # Clone containerd as we'll need to build it in order to run the tests + # base_version: The version to be intalled in the ${major}.${minor} format + clone_cri_containerd $(get_from_kata_deps "externals.containerd.${CONTAINERD_VERSION}") +} + +function run() { + info "Running cri-containerd tests using ${KATA_HYPERVISOR} hypervisor" + + return 0 +} + +function main() { + action="${1:-}" + case "${action}" in + install-dependencies) install_dependencies 
;; + install-kata) install_kata ;; + run) run ;; + *) >&2 die "Invalid argument" ;; + esac +} + +main "$@" diff --git a/tests/integration/cri-containerd/integration-tests.sh b/tests/integration/cri-containerd/integration-tests.sh new file mode 100755 index 000000000..0e4df5578 --- /dev/null +++ b/tests/integration/cri-containerd/integration-tests.sh @@ -0,0 +1,493 @@ +#!/bin/bash +# +# Copyright (c) 2017-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +[[ "${DEBUG}" != "" ]] && set -o xtrace +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../../common.bash" + +# runc is installed in /usr/local/sbin/ add that path +export PATH="$PATH:/usr/local/sbin" + +# golang is installed in /usr/local/go/bin/ add that path +export PATH="$PATH:/usr/local/go/bin" + +# Runtime to be used for testing +RUNTIME=${RUNTIME:-containerd-shim-kata-v2} +FACTORY_TEST=${FACTORY_TEST:-""} +KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" +USE_DEVMAPPER="${USE_DEVMAPPER:-false}" +ARCH=$(uname -m) + +containerd_runtime_type="io.containerd.kata-${KATA_HYPERVISOR}.v2" + +containerd_shim_path="$(command -v containerd-shim)" + +#containerd config file +readonly tmp_dir=$(mktemp -t -d test-cri-containerd.XXXX) +export REPORT_DIR="${tmp_dir}" +readonly CONTAINERD_CONFIG_FILE="${tmp_dir}/test-containerd-config" +readonly CONTAINERD_CONFIG_FILE_TEMP="${CONTAINERD_CONFIG_FILE}.temp" +readonly default_containerd_config="/etc/containerd/config.toml" +readonly default_containerd_config_backup="$CONTAINERD_CONFIG_FILE.backup" +readonly kata_config="/etc/kata-containers/configuration.toml" +readonly kata_config_backup="$kata_config.backup" +readonly default_kata_config="/opt/kata/share/defaults/kata-containers/configuration.toml" + +function ci_config() { + sudo mkdir -p $(dirname "${kata_config}") + [ -f "$kata_config" ] && sudo cp "$kata_config" "$kata_config_backup" || \ + sudo cp 
"$default_kata_config" "$kata_config" + + source /etc/os-release || source /usr/lib/os-release + ID=${ID:-""} + if [ "$ID" == ubuntu ]; then + # https://github.com/kata-containers/tests/issues/352 + if [ -n "${FACTORY_TEST}" ]; then + sudo sed -i -e 's/^#enable_template.*$/enable_template = true/g' "${kata_config}" + echo "init vm template" + sudo -E PATH=$PATH "$RUNTIME" factory init + fi + fi + + echo "enable debug for kata-runtime" + sudo sed -i 's/^#enable_debug =/enable_debug =/g' ${kata_config} +} + +function ci_cleanup() { + source /etc/os-release || source /usr/lib/os-release + + if [ -n "${FACTORY_TEST}" ]; then + echo "destroy vm template" + sudo -E PATH=$PATH "$RUNTIME" factory destroy + fi + + if [ -e "$default_containerd_config_backup" ]; then + echo "restore containerd config" + sudo systemctl stop containerd + sudo cp "$default_containerd_config_backup" "$default_containerd_config" + fi + + [ -f "$kata_config_backup" ] && sudo mv "$kata_config_backup" "$kata_config" || \ + sudo rm "$kata_config" +} + +function create_containerd_config() { + local runtime="$1" + # kata_annotations is set to 1 if caller want containerd setup with + # kata annotations support. 
+ local kata_annotations=${2-0} + [ -n "${runtime}" ] || die "need runtime to create config" + + local runtime_type="${containerd_runtime_type}" + if [ "${runtime}" == "runc" ]; then + runtime_type="io.containerd.runc.v2" + fi + local containerd_runtime=$(command -v "containerd-shim-${runtime}-v2") + +cat << EOF | sudo tee "${CONTAINERD_CONFIG_FILE}" +[debug] + level = "debug" +[plugins] + [plugins.cri] + [plugins.cri.containerd] + default_runtime_name = "$runtime" + [plugins.cri.containerd.runtimes.${runtime}] + runtime_type = "${runtime_type}" + $( [ $kata_annotations -eq 1 ] && \ + echo 'pod_annotations = ["io.katacontainers.*"]' && \ + echo ' container_annotations = ["io.katacontainers.*"]' + ) + [plugins.cri.containerd.runtimes.${runtime}.options] + Runtime = "${containerd_runtime}" +[plugins.linux] + shim = "${containerd_shim_path}" +EOF + +if [ "$USE_DEVMAPPER" == "true" ]; then + sudo sed -i 's|^\(\[plugins\]\).*|\1\n \[plugins.devmapper\]\n pool_name = \"contd-thin-pool\"\n base_image_size = \"4096MB\"|' ${CONTAINERD_CONFIG_FILE} + echo "Devicemapper configured" + cat "${CONTAINERD_CONFIG_FILE}" +fi + +} + +function cleanup() { + ci_cleanup + [ -d "$tmp_dir" ] && rm -rf "${tmp_dir}" +} + +trap cleanup EXIT + +function err_report() { + local log_file="${REPORT_DIR}/containerd.log" + if [ -f "$log_file" ]; then + echo "ERROR: containerd log :" + echo "-------------------------------------" + cat "${log_file}" + echo "-------------------------------------" + fi +} + + +function check_daemon_setup() { + info "containerd(cri): Check daemon works with runc" + create_containerd_config "runc" + + # containerd cri-integration will modify the passed in config file. Let's + # give it a temp one. + cp $CONTAINERD_CONFIG_FILE $CONTAINERD_CONFIG_FILE_TEMP + # in some distros(AlibabaCloud), there is no btrfs-devel package available, + # so pass GO_BUILDTAGS="no_btrfs" to make to not use btrfs. 
+ sudo -E PATH="${PATH}:/usr/local/bin" \ + REPORT_DIR="${REPORT_DIR}" \ + FOCUS="TestImageLoad" \ + RUNTIME="" \ + CONTAINERD_CONFIG_FILE="$CONTAINERD_CONFIG_FILE_TEMP" \ + make GO_BUILDTAGS="no_btrfs" -e cri-integration +} + +function testContainerStart() { + # no_container_yaml set to 1 will not create container_yaml + # because caller has created its own container_yaml. + no_container_yaml=${1:-0} + + local pod_yaml=${REPORT_DIR}/pod.yaml + local container_yaml=${REPORT_DIR}/container.yaml + local image="busybox:latest" + + cat << EOF > "${pod_yaml}" +metadata: + name: busybox-sandbox1 + namespace: default + uid: busybox-sandbox1-uid +EOF + + #TestContainerSwap has created its own container_yaml. + if [ $no_container_yaml -ne 1 ]; then + cat << EOF > "${container_yaml}" +metadata: + name: busybox-killed-vmm + namespace: default + uid: busybox-killed-vmm-uid +image: + image: "$image" +command: +- top +EOF + fi + + sudo cp "$default_containerd_config" "$default_containerd_config_backup" + sudo cp $CONTAINERD_CONFIG_FILE "$default_containerd_config" + + restart_containerd_service + + sudo crictl pull $image + podid=$(sudo crictl runp $pod_yaml) + cid=$(sudo crictl create $podid $container_yaml $pod_yaml) + sudo crictl start $cid +} + +function testContainerStop() { + info "stop pod $podid" + sudo crictl stopp $podid + info "remove pod $podid" + sudo crictl rmp $podid + + sudo cp "$default_containerd_config_backup" "$default_containerd_config" + restart_containerd_service +} + +function TestKilledVmmCleanup() { + if [[ "${KATA_HYPERVISOR}" != "qemu" ]]; then + info "TestKilledVmmCleanup is skipped for ${KATA_HYPERVISOR}, only QEMU is currently tested" + return 0 + fi + + info "test killed vmm cleanup" + + testContainerStart + + qemu_pid=$(ps aux|grep qemu|grep -v grep|awk '{print $2}') + info "kill qemu $qemu_pid" + sudo kill -SIGKILL $qemu_pid + # sleep to let shimv2 exit + sleep 1 + remained=$(ps aux|grep shimv2|grep -v grep || true) + [ -z $remained ] || die 
"found remaining shimv2 process $remained" + + testContainerStop + + info "stop containerd" +} + +function TestContainerMemoryUpdate() { + if [[ "${KATA_HYPERVISOR}" != "qemu" ]] || [[ "${ARCH}" == "ppc64le" ]] || [[ "${ARCH}" == "s390x" ]]; then + return + fi + + test_virtio_mem=$1 + + if [ $test_virtio_mem -eq 1 ]; then + if [[ "$ARCH" != "x86_64" ]]; then + return + fi + info "Test container memory update with virtio-mem" + + sudo sed -i -e 's/^#enable_virtio_mem.*$/enable_virtio_mem = true/g' "${kata_config}" + else + info "Test container memory update without virtio-mem" + + sudo sed -i -e 's/^enable_virtio_mem.*$/#enable_virtio_mem = true/g' "${kata_config}" + fi + + testContainerStart + + vm_size=$(($(sudo crictl exec $cid cat /proc/meminfo | grep "MemTotal:" | awk '{print $2}')*1024)) + if [ $vm_size -gt $((2*1024*1024*1024)) ] || [ $vm_size -lt $((2*1024*1024*1024-128*1024*1024)) ]; then + testContainerStop + die "The VM memory size $vm_size before update is not right" + fi + + sudo crictl update --memory $((2*1024*1024*1024)) $cid + sleep 1 + + vm_size=$(($(sudo crictl exec $cid cat /proc/meminfo | grep "MemTotal:" | awk '{print $2}')*1024)) + if [ $vm_size -gt $((4*1024*1024*1024)) ] || [ $vm_size -lt $((4*1024*1024*1024-128*1024*1024)) ]; then + testContainerStop + die "The VM memory size $vm_size after increase is not right" + fi + + if [ $test_virtio_mem -eq 1 ]; then + sudo crictl update --memory $((1*1024*1024*1024)) $cid + sleep 1 + + vm_size=$(($(sudo crictl exec $cid cat /proc/meminfo | grep "MemTotal:" | awk '{print $2}')*1024)) + if [ $vm_size -gt $((3*1024*1024*1024)) ] || [ $vm_size -lt $((3*1024*1024*1024-128*1024*1024)) ]; then + testContainerStop + die "The VM memory size $vm_size after decrease is not right" + fi + fi + + testContainerStop +} + +function getContainerSwapInfo() { + swap_size=$(($(sudo crictl exec $cid cat /proc/meminfo | grep "SwapTotal:" | awk '{print $2}')*1024)) + # NOTE: these below two checks only works on cgroup v1 + 
swappiness=$(sudo crictl exec $cid cat /sys/fs/cgroup/memory/memory.swappiness) + swap_in_bytes=$(sudo crictl exec $cid cat /sys/fs/cgroup/memory/memory.memsw.limit_in_bytes) +} + +function TestContainerSwap() { + if [[ "${KATA_HYPERVISOR}" != "qemu" ]] || [[ "${ARCH}" != "x86_64" ]]; then + return + fi + + local container_yaml=${REPORT_DIR}/container.yaml + local image="busybox:latest" + + info "Test container with guest swap" + + create_containerd_config "kata-${KATA_HYPERVISOR}" 1 + sudo sed -i -e 's/^#enable_guest_swap.*$/enable_guest_swap = true/g' "${kata_config}" + + # Test without swap device + testContainerStart + getContainerSwapInfo + # Current default swappiness is 60 + if [ $swappiness -ne 60 ]; then + testContainerStop + die "The VM swappiness $swappiness without swap device is not right" + fi + if [ $swap_in_bytes -lt 1125899906842624 ]; then + testContainerStop + die "The VM swap_in_bytes $swap_in_bytes without swap device is not right" + fi + if [ $swap_size -ne 0 ]; then + testContainerStop + die "The VM swap size $swap_size without swap device is not right" + fi + testContainerStop + + # Test with swap device + cat << EOF > "${container_yaml}" +metadata: + name: busybox-swap + namespace: default + uid: busybox-swap-uid +annotations: + io.katacontainers.container.resource.swappiness: "100" + io.katacontainers.container.resource.swap_in_bytes: "1610612736" +linux: + resources: + memory_limit_in_bytes: 1073741824 +image: + image: "$image" +command: +- top +EOF + + testContainerStart 1 + getContainerSwapInfo + testContainerStop + + if [ $swappiness -ne 100 ]; then + die "The VM swappiness $swappiness with swap device is not right" + fi + if [ $swap_in_bytes -ne 1610612736 ]; then + die "The VM swap_in_bytes $swap_in_bytes with swap device is not right" + fi + if [ $swap_size -ne 536870912 ]; then + die "The VM swap size $swap_size with swap device is not right" + fi + + # Test without swap_in_bytes + cat << EOF > "${container_yaml}" +metadata: + 
name: busybox-swap + namespace: default + uid: busybox-swap-uid +annotations: + io.katacontainers.container.resource.swappiness: "100" +linux: + resources: + memory_limit_in_bytes: 1073741824 +image: + image: "$image" +command: +- top +EOF + + testContainerStart 1 + getContainerSwapInfo + testContainerStop + + if [ $swappiness -ne 100 ]; then + die "The VM swappiness $swappiness without swap_in_bytes is not right" + fi + # swap_in_bytes is not set, it should be a value that bigger than 1125899906842624 + if [ $swap_in_bytes -lt 1125899906842624 ]; then + die "The VM swap_in_bytes $swap_in_bytes without swap_in_bytes is not right" + fi + if [ $swap_size -ne 1073741824 ]; then + die "The VM swap size $swap_size without swap_in_bytes is not right" + fi + + # Test without memory_limit_in_bytes + cat << EOF > "${container_yaml}" +metadata: + name: busybox-swap + namespace: default + uid: busybox-swap-uid +annotations: + io.katacontainers.container.resource.swappiness: "100" +image: + image: "$image" +command: +- top +EOF + + testContainerStart 1 + getContainerSwapInfo + testContainerStop + + if [ $swappiness -ne 100 ]; then + die "The VM swappiness $swappiness without memory_limit_in_bytes is not right" + fi + # swap_in_bytes is not set, it should be a value that bigger than 1125899906842624 + if [ $swap_in_bytes -lt 1125899906842624 ]; then + die "The VM swap_in_bytes $swap_in_bytes without memory_limit_in_bytes is not right" + fi + if [ $swap_size -ne 2147483648 ]; then + die "The VM swap size $swap_size without memory_limit_in_bytes is not right" + fi + + create_containerd_config "kata-${KATA_HYPERVISOR}" +} + +# k8s may restart docker which will impact on containerd stop +function stop_containerd() { + local tmp=$(pgrep kubelet || true) + [ -n "$tmp" ] && sudo kubeadm reset -f + + sudo systemctl stop containerd +} + +function main() { + + info "Stop crio service" + systemctl is-active --quiet crio && sudo systemctl stop crio + + info "Stop containerd service" + 
systemctl is-active --quiet containerd && stop_containerd + + # Configure enviroment if running in CI + ci_config + + pushd "containerd" + + # Make sure the right artifacts are going to be built + make clean + + check_daemon_setup + + info "containerd(cri): testing using runtime: ${containerd_runtime_type}" + + create_containerd_config "kata-${KATA_HYPERVISOR}" + + info "containerd(cri): Running cri-integration" + + + passing_test="TestContainerStats|TestContainerRestart|TestContainerListStatsWithIdFilter|TestContainerListStatsWithIdSandboxIdFilter|TestDuplicateName|TestImageLoad|TestImageFSInfo|TestSandboxCleanRemove" + + if [[ "${KATA_HYPERVISOR}" == "cloud-hypervisor" || \ + "${KATA_HYPERVISOR}" == "qemu" ]]; then + issue="https://github.com/kata-containers/tests/issues/2318" + info "${KATA_HYPERVISOR} fails with TestContainerListStatsWithSandboxIdFilter }" + info "see ${issue}" + else + passing_test="${passing_test}|TestContainerListStatsWithSandboxIdFilter" + fi + + # in some distros(AlibabaCloud), there is no btrfs-devel package available, + # so pass GO_BUILDTAGS="no_btrfs" to make to not use btrfs. + # containerd cri-integration will modify the passed in config file. Let's + # give it a temp one. + cp $CONTAINERD_CONFIG_FILE $CONTAINERD_CONFIG_FILE_TEMP + sudo -E PATH="${PATH}:/usr/local/bin" \ + REPORT_DIR="${REPORT_DIR}" \ + FOCUS="^(${passing_test})$" \ + RUNTIME="" \ + CONTAINERD_CONFIG_FILE="$CONTAINERD_CONFIG_FILE_TEMP" \ + make GO_BUILDTAGS="no_btrfs" -e cri-integration + + # trap error for print containerd log, + # containerd's `cri-integration` will print the log itself. + trap err_report ERR + + # TestContainerSwap is currently failing with GHA. + # Let's re-enable it as soon as we get it to work. 
+ # Reference: https://github.com/kata-containers/kata-containers/issues/7410 + # TestContainerSwap + + # TODO: runtime-rs doesn't support memory update currently + if [ "$KATA_HYPERVISOR" != "dragonball" ]; then + TestContainerMemoryUpdate 1 + TestContainerMemoryUpdate 0 + fi + + TestKilledVmmCleanup + + popd +} + +main diff --git a/tests/integration/gha-run.sh b/tests/integration/kubernetes/gha-run.sh similarity index 73% rename from tests/integration/gha-run.sh rename to tests/integration/kubernetes/gha-run.sh index c5cd573d6..0d02e7082 100755 --- a/tests/integration/gha-run.sh +++ b/tests/integration/kubernetes/gha-run.sh @@ -8,8 +8,11 @@ set -o errexit set -o nounset set -o pipefail -integration_dir="$(dirname "$(readlink -f "$0")")" -tools_dir="${integration_dir}/../../tools" +kubernetes_dir="$(dirname "$(readlink -f "$0")")" +source "${kubernetes_dir}/../../common.bash" +tools_dir="${repo_root_dir}/tools" + +AZ_RG="${AZ_RG:-kataCI}" function _print_cluster_name() { short_sha="$(git rev-parse --short=12 HEAD)" @@ -35,7 +38,7 @@ function create_cluster() { delete_cluster || true az aks create \ - -g "kataCI" \ + -g "${AZ_RG}" \ -n "$(_print_cluster_name)" \ -s "Standard_D4s_v5" \ --node-count 1 \ @@ -54,34 +57,33 @@ function install_kubectl() { function get_cluster_credentials() { az aks get-credentials \ - -g "kataCI" \ + -g "${AZ_RG}" \ -n "$(_print_cluster_name)" } -function ensure_yq() { - : "${GOPATH:=${GITHUB_WORKSPACE}}" - export GOPATH - export PATH="${GOPATH}/bin:${PATH}" - INSTALL_IN_GOPATH=true "${repo_root_dir}/ci/install_yq.sh" -} - -function run_tests() { +function deploy_kata() { platform="${1}" ensure_yq # Emsure we're in the default namespace kubectl config set-context --current --namespace=default - # Delete any spurious tests namespace that was left behind - kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true - sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" 
"${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" + + # Enable debug for Kata Containers + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[1].value' --tag '!!str' "true" + # Let the `kata-deploy` script take care of the runtime class creation / removal + yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[4].value' --tag '!!str' "true" + if [ "${KATA_HOST_OS}" = "cbl-mariner" ]; then yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[+].name' "HOST_OS" yq write -i "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" 'spec.template.spec.containers[0].env[-1].value' "${KATA_HOST_OS}" fi + + echo "::group::Final kata-deploy.yaml that is used in the test" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image" + echo "::endgroup::" kubectl apply -f "${tools_dir}/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml" if [ "${platform}" = "tdx" ]; then @@ -90,7 +92,6 @@ function run_tests() { kubectl apply -f "${tools_dir}/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml" fi kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod - kubectl apply -f "${tools_dir}/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml" # This is needed as the kata-deploy pod will be set to "Ready" when it starts running, # which may cause issues like not having the node properly labeled or the artefacts @@ -101,11 +102,24 @@ function run_tests() { sleep 60s fi + echo "::group::kata-deploy logs" + kubectl -n kube-system logs -l name=kata-deploy + echo "::endgroup::" + + echo "::group::Runtime classes" + kubectl 
get runtimeclass + echo "::endgroup::" +} + +function run_tests() { + # Delete any spurious tests namespace that was left behind + kubectl delete namespace kata-containers-k8s-tests &> /dev/null || true + # Create a new namespace for the tests and switch to it - kubectl apply -f ${integration_dir}/kubernetes/runtimeclass_workloads/tests-namespace.yaml + kubectl apply -f ${kubernetes_dir}/runtimeclass_workloads/tests-namespace.yaml kubectl config set-context --current --namespace=kata-containers-k8s-tests - pushd "${integration_dir}/kubernetes" + pushd "${kubernetes_dir}" bash setup.sh bash run_kubernetes_tests.sh popd @@ -113,6 +127,15 @@ function run_tests() { function cleanup() { platform="${1}" + ensure_yq + + echo "Gather information about the nodes and pods before cleaning up the node" + get_nodes_and_pods_info + + if [ "${platform}" = "aks" ]; then + delete_cluster + return + fi # Switch back to the default namespace and delete the tests one kubectl config set-context --current --namespace=default @@ -129,6 +152,9 @@ function cleanup() { kubectl delete ${deploy_spec} kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod + # Let the `kata-deploy` script take care of the runtime class creation / removal + yq write -i "${tools_dir}/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml" 'spec.template.spec.containers[0].env[4].value' --tag '!!str' "true" + sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}|g" "${tools_dir}/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml" cat "${tools_dir}/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml" | grep "${DOCKER_REGISTRY}/${DOCKER_REPO}:${DOCKER_TAG}" || die "Failed to setup the tests image" @@ -137,16 +163,20 @@ function cleanup() { kubectl delete ${cleanup_spec} kubectl delete -f 
"${tools_dir}/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml" - kubectl delete -f "${tools_dir}/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml" } function delete_cluster() { az aks delete \ - -g "kataCI" \ + -g "${AZ_RG}" \ -n "$(_print_cluster_name)" \ --yes } +function get_nodes_and_pods_info() { + kubectl debug $(kubectl get nodes -o name) -it --image=quay.io/kata-containers/kata-debug:latest + kubectl get pods -o name | grep node-debugger | xargs kubectl delete +} + function main() { export KATA_HOST_OS="${KATA_HOST_OS:-}" @@ -159,14 +189,15 @@ function main() { install-bats) install_bats ;; install-kubectl) install_kubectl ;; get-cluster-credentials) get_cluster_credentials ;; - run-tests-aks) run_tests "aks" ;; - run-tests-sev) run_tests "sev" ;; - run-tests-snp) run_tests "snp" ;; - run-tests-tdx) run_tests "tdx" ;; + deploy-kata-aks) deploy_kata "aks" ;; + deploy-kata-sev) deploy_kata "sev" ;; + deploy-kata-snp) deploy_kata "snp" ;; + deploy-kata-tdx) deploy_kata "tdx" ;; + run-tests) run_tests ;; cleanup-sev) cleanup "sev" ;; cleanup-snp) cleanup "snp" ;; cleanup-tdx) cleanup "tdx" ;; - delete-cluster) delete_cluster ;; + delete-cluster) cleanup "aks" ;; *) >&2 echo "Invalid argument"; exit 2 ;; esac } diff --git a/tests/integration/kubernetes/k8s-file-volume.bats b/tests/integration/kubernetes/k8s-file-volume.bats new file mode 100644 index 000000000..fd09d7886 --- /dev/null +++ b/tests/integration/kubernetes/k8s-file-volume.bats @@ -0,0 +1,47 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# + +load "${BATS_TEST_DIRNAME}/../../common.bash" +load "${BATS_TEST_DIRNAME}/tests_common.sh" +TEST_INITRD="${TEST_INITRD:-no}" + +setup() { + [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" + pod_name="test-file-volume" + container_name="busybox-file-volume-container" + tmp_file=$(exec_host mktemp /tmp/file-volume-test-foo.XXXXX) + 
mount_path="/tmp/foo.txt" + file_body="test" + get_pod_config_dir +} + +@test "Test readonly volume for pods" { + [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" + # Write test body to temp file + exec_host "echo "$file_body" > $tmp_file" + + # Create test yaml + sed -e "s|HOST_FILE|$tmp_file|" ${pod_config_dir}/pod-file-volume.yaml > ${pod_config_dir}/test-pod-file-volume.yaml + sed -i "s|MOUNT_PATH|$mount_path|" ${pod_config_dir}/test-pod-file-volume.yaml + + # Create pod + kubectl create -f "${pod_config_dir}/test-pod-file-volume.yaml" + + # Check pod creation + kubectl wait --for=condition=Ready --timeout=$timeout pod "$pod_name" + + # Validate file volume body inside the pod + file_in_container=$(kubectl exec $pod_name -- cat $mount_path) + [ "$file_body" == "$file_in_container" ] +} + +teardown() { + [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" + kubectl delete pod "$pod_name" + exec_host rm -f $tmp_file + rm -f ${pod_config_dir}/test-pod-file-volume.yaml +} diff --git a/tests/integration/kubernetes/k8s-volume.bats b/tests/integration/kubernetes/k8s-volume.bats new file mode 100644 index 000000000..0489ff461 --- /dev/null +++ b/tests/integration/kubernetes/k8s-volume.bats @@ -0,0 +1,67 @@ +#!/usr/bin/env bats +# +# Copyright (c) 2018 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +load "${BATS_TEST_DIRNAME}/../../common.bash" +load "${BATS_TEST_DIRNAME}/tests_common.sh" +TEST_INITRD="${TEST_INITRD:-no}" + +setup() { + [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" + + get_pod_config_dir + + tmp_file=$(exec_host mktemp -d /tmp/data.XXXX) + pod_yaml=$(mktemp --tmpdir pod_config.XXXXXX.yaml) + msg="Hello from Kubernetes" + exec_host "echo $msg > $tmp_file/index.html" + pod_name="pv-pod" + # Define temporary file at yaml + sed -e "s|tmp_data|${tmp_file}|g" ${pod_config_dir}/pv-volume.yaml > "$pod_yaml" +} + 
+@test "Create Persistent Volume" { + [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" + + volume_name="pv-volume" + volume_claim="pv-claim" + + # Create the persistent volume + kubectl create -f "$pod_yaml" + + # Check the persistent volume is Available + cmd="kubectl get pv $volume_name | grep Available" + waitForProcess "$wait_time" "$sleep_time" "$cmd" + + # Create the persistent volume claim + kubectl create -f "${pod_config_dir}/volume-claim.yaml" + + # Check the persistent volume claim is Bound. + cmd="kubectl get pvc $volume_claim | grep Bound" + waitForProcess "$wait_time" "$sleep_time" "$cmd" + + # Create pod + kubectl create -f "${pod_config_dir}/pv-pod.yaml" + + # Check pod creation + kubectl wait --for=condition=Ready --timeout=$timeout pod "$pod_name" + + cmd="cat /mnt/index.html" + kubectl exec $pod_name -- sh -c "$cmd" | grep "$msg" +} + +teardown() { + [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" + + # Debugging information + kubectl describe "pod/$pod_name" + + kubectl delete pod "$pod_name" + kubectl delete pvc "$volume_claim" + kubectl delete pv "$volume_name" + rm -f "$pod_yaml" + exec_host rm -rf "$tmp_file" +} diff --git a/tests/integration/kubernetes/run_kubernetes_tests.sh b/tests/integration/kubernetes/run_kubernetes_tests.sh index db1e16633..f8b635d22 100644 --- a/tests/integration/kubernetes/run_kubernetes_tests.sh +++ b/tests/integration/kubernetes/run_kubernetes_tests.sh @@ -8,6 +8,7 @@ set -e kubernetes_dir=$(dirname "$(readlink -f "$0")") +source "${kubernetes_dir}/../../common.bash" TARGET_ARCH="${TARGET_ARCH:-x86_64}" KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" @@ -27,6 +28,7 @@ else "k8s-empty-dirs.bats" \ "k8s-env.bats" \ "k8s-exec.bats" \ + "k8s-file-volume.bats" \ "k8s-inotify.bats" \ "k8s-job.bats" \ "k8s-kill-all-process-in-container.bats" \ @@ -50,6 +52,7 @@ else "k8s-sysctls.bats" \ "k8s-security-context.bats" \ "k8s-shared-volume.bats" 
\ + "k8s-volume.bats" \ "k8s-nginx-connectivity.bats" \ ) fi diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pv-pod.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pv-pod.yaml new file mode 100644 index 000000000..6a165b971 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/pv-pod.yaml @@ -0,0 +1,26 @@ +# +# Copyright (c) 2018 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +kind: Pod +apiVersion: v1 +metadata: + name: pv-pod +spec: + terminationGracePeriodSeconds: 0 + runtimeClassName: kata + volumes: + - name: pv-storage + persistentVolumeClaim: + claimName: pv-claim + containers: + - name: pv-container + image: quay.io/prometheus/busybox:latest + ports: + command: + - sleep + - "120" + volumeMounts: + - mountPath: "/mnt/" + name: pv-storage diff --git a/tests/integration/kubernetes/runtimeclass_workloads/pv-volume.yaml b/tests/integration/kubernetes/runtimeclass_workloads/pv-volume.yaml new file mode 100644 index 000000000..e677d5af4 --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/pv-volume.yaml @@ -0,0 +1,19 @@ +# +# Copyright (c) 2018 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +kind: PersistentVolume +apiVersion: v1 +metadata: + name: pv-volume + labels: + type: local +spec: + storageClassName: manual + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + hostPath: + path: "tmp_data" diff --git a/tests/integration/kubernetes/runtimeclass_workloads/volume-claim.yaml b/tests/integration/kubernetes/runtimeclass_workloads/volume-claim.yaml new file mode 100644 index 000000000..e523e29de --- /dev/null +++ b/tests/integration/kubernetes/runtimeclass_workloads/volume-claim.yaml @@ -0,0 +1,16 @@ +# +# Copyright (c) 2018 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: pv-claim +spec: + storageClassName: manual + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 
3Gi diff --git a/tests/integration/kubernetes/setup.sh b/tests/integration/kubernetes/setup.sh index 614f38827..fb68e47cb 100755 --- a/tests/integration/kubernetes/setup.sh +++ b/tests/integration/kubernetes/setup.sh @@ -8,26 +8,33 @@ set -o nounset set -o pipefail kubernetes_dir=$(dirname "$(readlink -f "$0")") +source "${kubernetes_dir}/../../common.bash" + +reset_workloads_work_dir() { + rm -rf ${kubernetes_dir}/runtimeclass_workloads_work + cp -R ${kubernetes_dir}/runtimeclass_workloads ${kubernetes_dir}/runtimeclass_workloads_work +} set_runtime_class() { - sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads/*.yaml + sed -i -e "s|runtimeClassName: kata|runtimeClassName: kata-${KATA_HYPERVISOR}|" ${kubernetes_dir}/runtimeclass_workloads_work/*.yaml } set_kernel_path() { if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then mariner_kernel_path="/usr/share/cloud-hypervisor/vmlinux.bin" - find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \; + find ${kubernetes_dir}/runtimeclass_workloads_work/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.kernel]' "${mariner_kernel_path}" \; fi } set_initrd_path() { if [[ "${KATA_HOST_OS}" = "cbl-mariner" ]]; then - initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-cbl-mariner.img" - find ${kubernetes_dir}/runtimeclass_workloads/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \; + initrd_path="/opt/kata/share/kata-containers/kata-containers-initrd-mariner.img" + find ${kubernetes_dir}/runtimeclass_workloads_work/*.yaml -exec yq write -i {} 'metadata.annotations[io.katacontainers.config.hypervisor.initrd]' "${initrd_path}" \; fi } main() { + reset_workloads_work_dir set_runtime_class set_kernel_path set_initrd_path diff --git 
a/tests/integration/kubernetes/tests_common.sh b/tests/integration/kubernetes/tests_common.sh index 660a429c8..88f45557c 100644 --- a/tests/integration/kubernetes/tests_common.sh +++ b/tests/integration/kubernetes/tests_common.sh @@ -34,6 +34,19 @@ dragonball_limitations="https://github.com/kata-containers/kata-containers/issue export KUBECONFIG="${KUBECONFIG:-$HOME/.kube/config}" get_pod_config_dir() { - pod_config_dir="${BATS_TEST_DIRNAME}/runtimeclass_workloads" + pod_config_dir="${BATS_TEST_DIRNAME}/runtimeclass_workloads_work" info "k8s configured to use runtimeclass" } + +# Runs a command in the host filesystem. +exec_host() { + node="$(kubectl get node -o name)" + # `kubectl debug` always returns 0, so we hack it to return the right exit code. + command="$@" + command+='; echo -en \\n$?' + output="$(kubectl debug -qit "${node}" --image=alpine:latest -- chroot /host bash -c "${command}")" + kubectl get pods -o name | grep node-debugger | xargs kubectl delete > /dev/null + exit_code="$(echo "${output}" | tail -1)" + echo "$(echo "${output}" | head -n -1)" + return ${exit_code} +} diff --git a/tests/integration/nydus/gha-run.sh b/tests/integration/nydus/gha-run.sh new file mode 100755 index 000000000..86707d504 --- /dev/null +++ b/tests/integration/nydus/gha-run.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + +kata_tarball_dir="${2:-kata-artifacts}" +nydus_dir="$(dirname "$(readlink -f "$0")")" +source "${nydus_dir}/../../common.bash" + +function install_dependencies() { + info "Installing the dependencies needed for running the nydus tests" + + return 0 +} + +function run() { + info "Running nydus tests using ${KATA_HYPERVISOR} hypervisor" + + return 0 +} + +function main() { + action="${1:-}" + case "${action}" in + install-dependencies) install_dependencies ;; + install-kata) return 0 ;; + run) run ;; + *) >&2 die "Invalid argument" 
;; + esac +} + +main "$@" diff --git a/tests/integration/nydus/nydus-container.yaml b/tests/integration/nydus/nydus-container.yaml new file mode 100644 index 000000000..a30327ade --- /dev/null +++ b/tests/integration/nydus/nydus-container.yaml @@ -0,0 +1,12 @@ +metadata: + name: nydus-container +image: + image: ghcr.io/dragonflyoss/image-service/alpine:nydus-latest +command: +- tail +- -f +- /dev/null +linux: + resources: + memory_limit_in_bytes: 524288000 +log_path: nydus.0.log diff --git a/tests/integration/nydus/nydus-sandbox.yaml b/tests/integration/nydus/nydus-sandbox.yaml new file mode 100644 index 000000000..9f039d726 --- /dev/null +++ b/tests/integration/nydus/nydus-sandbox.yaml @@ -0,0 +1,5 @@ +metadata: + attempt: 1 + name: nydus-sandbox + namespace: default +log_directory: /tmp diff --git a/tests/integration/nydus/nydus_tests.sh b/tests/integration/nydus/nydus_tests.sh new file mode 100755 index 000000000..fe49580da --- /dev/null +++ b/tests/integration/nydus/nydus_tests.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# +# Copyright (c) 2022 Ant Group +# +# SPDX-License-Identifier: Apache-2.0 +# +# This will test the nydus feature is working properly + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +dir_path=$(dirname "$0") +source "${dir_path}/../../lib/common.bash" +source "${dir_path}/../../.ci/lib.sh" +source "/etc/os-release" || source "/usr/lib/os-release" +KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" + +need_restore_kata_config=false +kata_config_backup="/tmp/kata-configuration.toml" +SYSCONFIG_FILE="/etc/kata-containers/configuration.toml" +DEFAULT_CONFIG_FILE="/opt/kata/share/defaults/kata-containers/configuration-qemu.toml" +CLH_CONFIG_FILE="/opt/kata/share/defaults/kata-containers/configuration-clh.toml" +DB_CONFIG_FILE="/opt/kata/share/defaults/kata-containers/configuration-dragonball.toml" +need_restore_containerd_config=false +containerd_config="/etc/containerd/config.toml" +containerd_config_backup="/tmp/containerd.config.toml" + +# 
test image for container +IMAGE="${IMAGE:-ghcr.io/dragonflyoss/image-service/alpine:nydus-latest}" + +if [ "$KATA_HYPERVISOR" != "qemu" ] && [ "$KATA_HYPERVISOR" != "cloud-hypervisor" ] && [ "$KATA_HYPERVISOR" != "dragonball" ]; then + echo "Skip nydus test for $KATA_HYPERVISOR, it only works for QEMU/CLH/DB now." + exit 0 +fi + +arch="$(uname -m)" +if [ "$arch" != "x86_64" ]; then + echo "Skip nydus test for $arch, it only works for x86_64 now. See https://github.com/kata-containers/tests/issues/4445" + exit 0 +fi + +function install_from_tarball() { + local package_name="$1" + local binary_name="$2" + [ -n "$package_name" ] || die "need package_name" + [ -n "$binary_name" ] || die "need package release binary_name" + + local url=$(get_version "externals.${package_name}.url") + local version=$(get_version "externals.${package_name}.version") + local tarball_url="${url}/releases/download/${version}/${binary_name}-${version}-$arch.tgz" + if [ "${package_name}" == "nydus" ]; then + local goarch="$(${dir_path}/../../.ci/kata-arch.sh --golang)" + tarball_url="${url}/releases/download/${version}/${binary_name}-${version}-linux-$goarch.tgz" + fi + echo "Download tarball from ${tarball_url}" + curl -Ls "$tarball_url" | sudo tar xfz - -C /usr/local/bin --strip-components=1 +} + +function setup_nydus() { + # install nydus + install_from_tarball "nydus" "nydus-static" + + # install nydus-snapshotter + install_from_tarball "nydus-snapshotter" "nydus-snapshotter" + + # Config nydus snapshotter + sudo -E cp "$dir_path/nydusd-config.json" /etc/ + + # start nydus-snapshotter + nohup /usr/local/bin/containerd-nydus-grpc \ + --config-path /etc/nydusd-config.json \ + --shared-daemon \ + --log-level debug \ + --root /var/lib/containerd/io.containerd.snapshotter.v1.nydus \ + --cache-dir /var/lib/nydus/cache \ + --nydusd-path /usr/local/bin/nydusd \ + --nydusimg-path /usr/local/bin/nydus-image \ + --disable-cache-manager true \ + --enable-nydus-overlayfs true \ + --log-to-stdout 
>/dev/null 2>&1 & +} + +function config_kata() { + sudo mkdir -p /etc/kata-containers + if [ -f "$SYSCONFIG_FILE" ]; then + need_restore_kata_config=true + sudo cp -a "${SYSCONFIG_FILE}" "${kata_config_backup}" + elif [ "$KATA_HYPERVISOR" == "qemu" ]; then + sudo cp -a "${DEFAULT_CONFIG_FILE}" "${SYSCONFIG_FILE}" + elif [ "$KATA_HYPERVISOR" == "dragonball" ]; then + sudo cp -a "${DB_CONFIG_FILE}" "${SYSCONFIG_FILE}" + else + sudo cp -a "${CLH_CONFIG_FILE}" "${SYSCONFIG_FILE}" + fi + + echo "Enabling all debug options in file ${SYSCONFIG_FILE}" + sudo sed -i -e 's/^#\(enable_debug\).*=.*$/\1 = true/g' "${SYSCONFIG_FILE}" + sudo sed -i -e 's/^kernel_params = "\(.*\)"/kernel_params = "\1 agent.log=debug"/g' "${SYSCONFIG_FILE}" + + if [ "$KATA_HYPERVISOR" != "dragonball" ]; then + sudo sed -i 's|^shared_fs.*|shared_fs = "virtio-fs-nydus"|g' "${SYSCONFIG_FILE}" + sudo sed -i 's|^virtio_fs_daemon.*|virtio_fs_daemon = "/usr/local/bin/nydusd"|g' "${SYSCONFIG_FILE}" + fi + + sudo sed -i 's|^virtio_fs_extra_args.*|virtio_fs_extra_args = []|g' "${SYSCONFIG_FILE}" +} + +function config_containerd() { + readonly runc_path=$(command -v runc) + sudo mkdir -p /etc/containerd/ + if [ -f "$containerd_config" ]; then + need_restore_containerd_config=true + sudo cp -a "${containerd_config}" "${containerd_config_backup}" + else + sudo rm "${containerd_config}" + fi + + cat <&2; exit 1; } + + while [ $# -gt 1 ] + do + case "$1" in + -d|--default) ;; + + -g|--golang) type="golang";; + + -r|--rust) type="rust";; + + -h|--help) + usage + exit 0 + ;; + + -k|--kernel) type="kernel";; + + --) + shift + break + ;; + esac + shift + done + + local -r arch=$(uname -m) + + case "$type" in + default) echo "$arch";; + golang) arch_to_golang "$arch";; + rust) arch_to_rust "${arch}";; + kernel) arch_to_kernel "$arch";; + esac +} + +main "$@" diff --git a/tests/metrics/README.md b/tests/metrics/README.md index d017ed3fc..825018b01 100644 --- a/tests/metrics/README.md +++ b/tests/metrics/README.md @@ 
-70,6 +70,8 @@ Tests relating to networking. General items could include: Tests relating to the storage (graph, volume) drivers. +For further details see the [storage tests documentation](storage). + ### Disk Test relating to measure reading and writing against clusters. @@ -79,6 +81,14 @@ Test relating to measure reading and writing against clusters. Tests relating with TensorFlow and Pytorch implementations of several popular convolutional models. +For further details see the [machine learning tests documentation](machine_learning). + +### `CPU` + +Tests related with `CPU` performance. + +For further details see the [`cpu` tests documentation](cpu). + ## Saving Results In order to ensure continuity, and thus testing and historical tracking of results, diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml index 562b2c83b..9569f0397 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-clh-kata-metric8.toml @@ -16,9 +16,9 @@ description = "measure container lifecycle timings" # within (inclusive) checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" checktype = "mean" -midval = 0.42 -minpercent = 20.0 -maxpercent = 20.0 +midval = 0.69 +minpercent = 30.0 +maxpercent = 30.0 [[metric]] name = "memory-footprint" @@ -32,3 +32,68 @@ checktype = "mean" midval = 2518364.00 minpercent = 20.0 maxpercent = 20.0 + +[[metric]] +name = "memory-footprint-inside-container" +type = "json" +description = "measure memory inside the container" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"memory-footprint-inside-container\".Results | .[] | .memtotal.Result" +checktype = "mean" +midval = 4135244.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = 
"blogbench" +type = "json" +description = "measure container average of blogbench write" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"blogbench\".Results | .[] | .write.Result" +checktype = "mean" +midval = 1623.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "blogbench" +type = "json" +description = "measure container average of blogbench read" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"blogbench\".Results | .[] | .read.Result" +checktype = "mean" +midval = 96939.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow resnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .resnet.Result" +checktype = "mean" +midval = 3566.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow alexnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .alexnet.Result" +checktype = "mean" +midval = 98.0 +minpercent = 20.0 +maxpercent = 20.0 diff --git a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml index c6bc85147..af9622418 100644 --- a/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml +++ b/tests/metrics/cmd/checkmetrics/ci_worker/checkmetrics-json-qemu-kata-metric8.toml @@ -16,9 +16,9 @@ description = "measure container lifecycle timings" # within (inclusive) checkvar = ".\"boot-times\".Results | .[] | .\"to-workload\".Result" checktype = "mean" -midval = 0.61 -minpercent = 20.0 -maxpercent = 20.0 
+midval = 0.71 +minpercent = 30.0 +maxpercent = 30.0 [[metric]] name = "memory-footprint" @@ -32,3 +32,68 @@ checktype = "mean" midval = 2435844.00 minpercent = 20.0 maxpercent = 20.0 + +[[metric]] +name = "memory-footprint-inside-container" +type = "json" +description = "measure memory inside the container" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"memory-footprint-inside-container\".Results | .[] | .memtotal.Result" +checktype = "mean" +midval = 3677280.0 +minpercent = 25.0 +maxpercent = 25.0 + +[[metric]] +name = "blogbench" +type = "json" +description = "measure container average of blogbench write" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"blogbench\".Results | .[] | .write.Result" +checktype = "mean" +midval = 1639.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "blogbench" +type = "json" +description = "measure container average of blogbench read" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"blogbench\".Results | .[] | .read.Result" +checktype = "mean" +midval = 98687.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow resnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .resnet.Result" +checktype = "mean" +midval = 3546.0 +minpercent = 20.0 +maxpercent = 20.0 + +[[metric]] +name = "tensorflow" +type = "json" +description = "tensorflow alexnet model" +# Min and Max values to set a 'range' that +# the median of the CSV Results data must fall +# within (inclusive) +checkvar = ".\"tensorflow\".Results | .[] | .alexnet.Result" +checktype = "mean" +midval = 98.0 +minpercent = 20.0 +maxpercent = 20.0 diff --git 
a/tests/metrics/cpu/README.md b/tests/metrics/cpu/README.md new file mode 100644 index 000000000..3dd3a2928 --- /dev/null +++ b/tests/metrics/cpu/README.md @@ -0,0 +1,9 @@ +# Kata Containers C-Ray Metrics +This is a test of C-Ray which is a simple raytracer designed to test the floating-point CPU performance. + +## Running the C-Ray test +Individual test can be run by hand, for example: + +``` +$ cd metrics/cpu/c-ray $ ./cray.sh +``` diff --git a/tests/metrics/cpu/c-ray/Dockerfile b/tests/metrics/cpu/c-ray/Dockerfile new file mode 100644 index 000000000..41db08b7a --- /dev/null +++ b/tests/metrics/cpu/c-ray/Dockerfile @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Usage: FROM [image name] +FROM ubuntu:20.04 + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +ENV DEBIAN_FRONTEND=noninteractive + +# URL for c-ray benchmark +ENV CRAY_URL "http://www.phoronix-test-suite.com/benchmark-files/c-ray-1.1.tar.gz" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential gcc curl && \ + apt-get remove -y unattended-upgrades && \ + curl -OkL "${CRAY_URL}" && \ + tar -zxvf c-ray-1.1.tar.gz && \ + cd c-ray-1.1 && \ + cc -o c-ray-mt c-ray-mt.c -lm -lpthread && \ + make && \ + make install + +CMD ["/bin/bash"] diff --git a/tests/metrics/cpu/c-ray/cray.sh b/tests/metrics/cpu/c-ray/cray.sh new file mode 100755 index 000000000..f84502c41 --- /dev/null +++ b/tests/metrics/cpu/c-ray/cray.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +set -e + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../../lib/common.bash" + +TEST_NAME="cray" +IMAGE="docker.io/library/cray:latest" +DOCKERFILE="${SCRIPT_PATH}/Dockerfile" +CMD="cd c-ray-1.1 && ./c-ray-mt -t 32 -s 1024x768 -r 8 -i sphfract -o output.ppm 2>&1 | tee -a output.txt && cat output.txt" +cray_file=$(mktemp crayresults.XXXXXXXXXX) 
+ +function remove_tmp_file() { + rm -rf "${cray_file}" +} + +trap remove_tmp_file EXIT + +function main() { + # Check tools/commands dependencies + cmds=("awk" "docker") + init_env + check_cmds "${cmds[@]}" + check_ctr_images "$IMAGE" "$DOCKERFILE" + + sudo -E "${CTR_EXE}" run --rm --runtime="${CTR_RUNTIME}" "${IMAGE}" test sh -c "${CMD}" > "${cray_file}" + metrics_json_init + results=$(cat "${cray_file}" | grep seconds | awk '{print $3}' | head -n 1) + metrics_json_start_array + + local json="$(cat << EOF + { + "rendering": { + "Result": ${results}, + "Units": "s" + } + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" + metrics_json_save + + clean_env_ctr +} + +main "$@" diff --git a/tests/metrics/density/memory_usage.sh b/tests/metrics/density/memory_usage.sh index 57d2ce3cd..f0505aaff 100755 --- a/tests/metrics/density/memory_usage.sh +++ b/tests/metrics/density/memory_usage.sh @@ -320,8 +320,6 @@ EOF metrics_json_add_array_element "$json" metrics_json_end_array "Results" - - clean_env_ctr } function save_config(){ @@ -344,6 +342,9 @@ EOF } function main(){ + # Collect kata-env data + common_init + # Verify enough arguments if [ $# != 2 ] && [ $# != 3 ];then echo >&2 "error: Not enough arguments [$@]" @@ -378,6 +379,7 @@ function main(){ fi metrics_json_save + clean_env_ctr } main "$@" diff --git a/tests/metrics/density/memory_usage_inside_container.sh b/tests/metrics/density/memory_usage_inside_container.sh index 071ded175..71abf49d0 100755 --- a/tests/metrics/density/memory_usage_inside_container.sh +++ b/tests/metrics/density/memory_usage_inside_container.sh @@ -20,7 +20,7 @@ IMAGE='quay.io/prometheus/busybox:latest' CMD="sleep 10; cat /proc/meminfo" # We specify here in 'k', as that then matches the results we get from the meminfo, # which makes later direct comparison easier. 
-MEMSIZE=${MEMSIZE:-$((2048*1024))} +MEMSIZE="${MEMSIZE:-$((2048*1024))}" # this variable determines the number of attempts when a test # result is considered not valid (a zero value or a negative value) @@ -38,7 +38,7 @@ count_iters=0 # valid_result: if value stored is '1' the result is valid, '0' otherwise valid_result=0 -parse_results() { +function parse_results() { local raw_results="${1}" # Variables used for sum cummulative values in the case of two or more reps. @@ -47,20 +47,20 @@ parse_results() { local memfree_acu="${3:-0}" local memavailable_acu="${4:-0}" - local memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $2}') - units_memtotal=$(echo "$raw_results" | awk '/MemTotal/ {print $3}') + local memtotal=$(echo "${raw_results}" | awk '/MemTotal/ {print $2}') + units_memtotal=$(echo "${raw_results}" | awk '/MemTotal/ {print $3}') - local memfree=$(echo "$raw_results" | awk '/MemFree/ {print $2}') - units_memfree=$(echo "$raw_results" | awk '/MemFree/ {print $3}') + local memfree=$(echo "${raw_results}" | awk '/MemFree/ {print $2}') + units_memfree=$(echo "${raw_results}" | awk '/MemFree/ {print $3}') - local memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $2}') - units_memavailable=$(echo "$raw_results" | awk '/MemAvailable/ {print $3}') + local memavailable=$(echo "${raw_results}" | awk '/MemAvailable/ {print $2}') + units_memavailable=$(echo "${raw_results}" | awk '/MemAvailable/ {print $3}') # check results: if any result is zero or negative, it is considered as invalid, and the test will be repeated. 
- if (( $(echo "$memtotal <= 0" | bc -l) )) || (( $(echo "$memfree <= 0" | bc -l) )) || (( $(echo "$memavailable <= 0" | bc -l) )); then + if (( $(echo "${memtotal} <= 0" | bc -l) )) || (( $(echo "${memfree} <= 0" | bc -l) )) || (( $(echo "${memavailable} <= 0" | bc -l) )); then MAX_FAILED_ATTEMPTS=$((MAX_FAILED_ATTEMPTS-1)) valid_result=0 - info "Skipping invalid result: memtotal: $memtotal memfree: $memfree memavailable: $memavailable" + info "Skipping invalid result: memtotal: ${memtotal} memfree: ${memfree} memavailable: ${memavailable}" return 0 fi @@ -68,14 +68,14 @@ parse_results() { memfreeAvg=$((memfree+memfree_acu)) memavailableAvg=$((memavailable+memavailable_acu)) valid_result=1 - info "Iteration# $count_iters memtotal: $memtotal memfree: $memfree memavailable: $memavailable" + info "Iteration# ${count_iters} memtotal: ${memtotal} memfree: ${memfree} memavailable: ${memavailable}" } -store_results_json() { +function store_results_json() { metrics_json_start_array - memtotalAvg=$(echo "scale=2; $memtotalAvg / $count_iters" | bc) - memfreeAvg=$(echo "scale=2; $memfreeAvg / $count_iters" | bc) - memavailableAvg=$(echo "scale=2; $memavailableAvg / $count_iters" | bc) + memtotalAvg=$(echo "scale=2; ${memtotalAvg} / ${count_iters}" | bc) + memfreeAvg=$(echo "scale=2; ${memfreeAvg} / ${count_iters}" | bc) + memavailableAvg=$(echo "scale=2; ${memavailableAvg} / ${count_iters}" | bc) local json="$(cat << EOF { @@ -109,7 +109,7 @@ EOF function main() { # switch to select output format local num_iterations=${1:-1} - info "Iterations: $num_iterations" + info "Iterations: ${num_iterations}" # Check tools/commands dependencies cmds=("awk" "ctr") @@ -117,13 +117,13 @@ function main() { check_cmds "${cmds[@]}" check_images "${IMAGE}" metrics_json_init - while [ $count_iters -lt $num_iterations ]; do - local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime=$CTR_RUNTIME $IMAGE busybox sh -c "$CMD" 2>&1) + while [ "${count_iters}" -lt 
"${num_iterations}" ]; do + local output=$(sudo -E "${CTR_EXE}" run --memory-limit $((MEMSIZE*1024)) --rm --runtime="${CTR_RUNTIME}" "${IMAGE}" busybox sh -c "${CMD}" 2>&1) parse_results "${output}" "${memtotalAvg}" "${memfreeAvg}" "${memavailableAvg}" # quit if number of attempts exceeds the allowed value. - [ ${MAX_FAILED_ATTEMPTS} -eq 0 ] && die "Max number of attempts exceeded." - [ ${valid_result} -eq 1 ] && count_iters=$((count_iters+1)) + [ "${MAX_FAILED_ATTEMPTS}" -eq 0 ] && die "Max number of attempts exceeded." + [ "${valid_result}" -eq 1 ] && count_iters=$((count_iters+1)) done store_results_json clean_env_ctr diff --git a/tests/metrics/gha-run.sh b/tests/metrics/gha-run.sh index 5f8bfbf08..850cca98f 100755 --- a/tests/metrics/gha-run.sh +++ b/tests/metrics/gha-run.sh @@ -18,59 +18,6 @@ declare -r results_dir="${metrics_dir}/results" declare -r checkmetrics_dir="${metrics_dir}/cmd/checkmetrics" declare -r checkmetrics_config_dir="${checkmetrics_dir}/ci_worker" -function create_symbolic_links() { - local link_configuration_file="/opt/kata/share/defaults/kata-containers/configuration.toml" - local source_configuration_file="/opt/kata/share/defaults/kata-containers/configuration-${KATA_HYPERVISOR}.toml" - - if [ "${KATA_HYPERVISOR}" != 'qemu' ] && [ "${KATA_HYPERVISOR}" != 'clh' ]; then - die "Failed to set the configuration.toml: '${KATA_HYPERVISOR}' is not recognized as a valid hypervisor name." 
- fi - - sudo ln -sf "${source_configuration_file}" "${link_configuration_file}" -} - -# Configures containerd -function overwrite_containerd_config() { - containerd_config="/etc/containerd/config.toml" - sudo rm "${containerd_config}" - sudo tee "${containerd_config}" << EOF -version = 2 -[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] - SystemdCgroup = true - -[plugins] - [plugins."io.containerd.grpc.v1.cri"] - [plugins."io.containerd.grpc.v1.cri".containerd] - default_runtime_name = "kata" - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata] - runtime_type = "io.containerd.kata.v2" -EOF -} - -function install_kata() { - local kata_tarball="kata-static.tar.xz" - declare -r katadir="/opt/kata" - declare -r destdir="/" - declare -r local_bin_dir="/usr/local/bin/" - - # Removing previous kata installation - sudo rm -rf "${katadir}" - - pushd "${kata_tarball_dir}" - sudo tar -xvf "${kata_tarball}" -C "${destdir}" - popd - - # create symbolic links to kata components - for b in "${katadir}/bin/*" ; do - sudo ln -sf "${b}" "${local_bin_dir}/$(basename $b)" - done - - check_containerd_config_for_kata - restart_containerd_service - install_checkmetrics -} - function install_checkmetrics() { # Ensure we have the latest checkmetrics pushd "${checkmetrics_dir}" @@ -79,20 +26,18 @@ function install_checkmetrics() { popd } -function check_containerd_config_for_kata() { - # check containerd config - declare -r line1="default_runtime_name = \"kata\"" - declare -r line2="runtime_type = \"io.containerd.kata.v2\"" - declare -r num_lines_containerd=2 - declare -r containerd_path="/etc/containerd/config.toml" - local count_matches=$(grep -ic "$line1\|$line2" "${containerd_path}") +# @path_results: path to the input metric-results folder +# @tarball_fname: path and filename to the output tarball +function compress_metrics_results_dir() +{ + local path_results="${1:-results}" + local 
tarball_fname="${2:-}" - if [ "${count_matches}" = "${num_lines_containerd}" ]; then - info "containerd ok" - else - info "overwriting containerd configuration w/ a valid one" - overwrite_containerd_config - fi + [ -z "${tarball_fname}" ] && die "Missing the tarball filename or the path to save the tarball results is incorrect." + [ ! -d "${path_results}" ] && die "Missing path to the results folder." + + cd "${path_results}" && tar -czf "${tarball_fname}" *.json && cd - + info "tarball generated: ${tarball_fname}" } function check_metrics() { @@ -111,46 +56,45 @@ function make_tarball_results() { function run_test_launchtimes() { info "Running Launch Time test using ${KATA_HYPERVISOR} hypervisor" - create_symbolic_links bash tests/metrics/time/launch_times.sh -i public.ecr.aws/ubuntu/ubuntu:latest -n 20 } function run_test_memory_usage() { info "Running memory-usage test using ${KATA_HYPERVISOR} hypervisor" - create_symbolic_links bash tests/metrics/density/memory_usage.sh 20 5 - - check_metrics } function run_test_memory_usage_inside_container() { info "Running memory-usage inside the container test using ${KATA_HYPERVISOR} hypervisor" - # ToDo: remove the exit once the metrics workflow is stable - exit 0 - create_symbolic_links bash tests/metrics/density/memory_usage_inside_container.sh 5 } function run_test_blogbench() { info "Running Blogbench test using ${KATA_HYPERVISOR} hypervisor" - # ToDo: remove the exit once the metrics workflow is stable - exit 0 - create_symbolic_links bash tests/metrics/storage/blogbench.sh } +function run_test_tensorflow() { + info "Running TensorFlow test using ${KATA_HYPERVISOR} hypervisor" + + bash tests/metrics/machine_learning/tensorflow.sh 1 20 + + check_metrics +} + function main() { action="${1:-}" case "${action}" in - install-kata) install_kata ;; + install-kata) install_kata && install_checkmetrics ;; make-tarball-results) make_tarball_results ;; run-test-launchtimes) run_test_launchtimes ;; run-test-memory-usage) 
run_test_memory_usage ;; run-test-memory-usage-inside-container) run_test_memory_usage_inside_container ;; run-test-blogbench) run_test_blogbench ;; + run-test-tensorflow) run_test_tensorflow ;; *) >&2 die "Invalid argument" ;; esac } diff --git a/tests/metrics/lib/common.bash b/tests/metrics/lib/common.bash index 0bb31030d..c43019a70 100755 --- a/tests/metrics/lib/common.bash +++ b/tests/metrics/lib/common.bash @@ -18,7 +18,6 @@ DOCKER_EXE="${DOCKER_EXE:-docker}" CTR_RUNTIME="${CTR_RUNTIME:-io.containerd.kata.v2}" RUNTIME="${RUNTIME:-containerd-shim-kata-v2}" KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" -TEST_REPO="${TEST_REPO:-github.com/kata-containers/tests}" JSON_HOST="${JSON_HOST:-}" KSM_BASE="/sys/kernel/mm/ksm" @@ -179,6 +178,7 @@ function init_env() # This clean up is more aggressive, this is in order to # decrease the factors that could affect the metrics results. kill_processes_before_start + info "init environment complete" } # This function checks if there are containers or @@ -220,11 +220,11 @@ function show_system_ctr_state() function common_init() { - if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ] || [ "$RUNTIME" == "containerd-shim-kata-v2" ]; then + if [ "${CTR_RUNTIME}" = "io.containerd.kata.v2" ] || [ "${RUNTIME}" = "containerd-shim-kata-v2" ]; then extract_kata_env else # We know we have nothing to do for runc or shimv2 - if [ "$CTR_RUNTIME" != "io.containerd.runc.v2" ] || [ "$RUNTIME" != "runc" ]; then + if [ "${CTR_RUNTIME}" != "io.containerd.runc.v2" ] && [ "${RUNTIME}" != "runc" ]; then warn "Unrecognised runtime" fi fi @@ -256,7 +256,7 @@ function set_ksm_aggressive() fi } -restore_virtio_fs(){ +function restore_virtio_fs(){ # Re-enable virtio-fs if it was enabled previously [ -n "${was_virtio_fs}" ] && sudo -E PATH="$PATH" "${LIB_DIR}/../../.ci/set_kata_config.sh" shared_fs virtio-fs || \ info "Not restoring virtio-fs since it wasn't enabled previously" @@ -359,5 +359,3 @@ function wait_ksm_settle() done info "Timed out after ${1}s 
waiting for KSM to settle" } - -common_init diff --git a/tests/metrics/lib/json.bash b/tests/metrics/lib/json.bash index 79f6d235d..017d6494a 100755 --- a/tests/metrics/lib/json.bash +++ b/tests/metrics/lib/json.bash @@ -13,7 +13,7 @@ JSON_TX_ONELINE="${JSON_TX_ONELINE:-}" JSON_URL="${JSON_URL:-}" # Generate a timestamp in nanoseconds since 1st Jan 1970 -timestamp_ns() { +function timestamp_ns() { local t local s local n @@ -22,18 +22,21 @@ timestamp_ns() { t="$(date +%-s:%-N)" s=$(echo $t | awk -F ':' '{print $1}') n=$(echo $t | awk -F ':' '{print $2}') - ns=$(( (s * 1000000000) + n )) + ns=$(echo "$s * 1000000000 + $n" | bc) echo $ns } # Generate a timestamp in milliseconds since 1st Jan 1970 -timestamp_ms() { +function timestamp_ms() { echo $(($(date +%s%N)/1000000)) } -# Intialise the json subsystem -metrics_json_init() { +# Initialise the json subsystem +function metrics_json_init() { + # collect kata-env data + common_init + # Clear out any previous results json_result_array=() @@ -45,18 +48,18 @@ metrics_json_init() { EOF )" - if [ "$CTR_RUNTIME" == "io.containerd.kata.v2" ]; then + if [ "${CTR_RUNTIME}" == "io.containerd.kata.v2" ]; then metrics_json_add_fragment "$json" local json="$(cat << EOF "env" : { - "RuntimeVersion": "$RUNTIME_VERSION", - "RuntimeCommit": "$RUNTIME_COMMIT", - "RuntimeConfig": "$RUNTIME_CONFIG_PATH", - "Hypervisor": "$HYPERVISOR_PATH", - "HypervisorVersion": "$HYPERVISOR_VERSION", - "Shim": "$SHIM_PATH", - "ShimVersion": "$SHIM_VERSION", + "RuntimeVersion": "${RUNTIME_VERSION}", + "RuntimeCommit": "${RUNTIME_COMMIT}", + "RuntimeConfig": "${RUNTIME_CONFIG_PATH}", + "Hypervisor": "${HYPERVISOR_PATH}", + "HypervisorVersion": "${HYPERVISOR_VERSION}", + "Shim": "${SHIM_PATH}", + "ShimVersion": "${SHIM_VERSION}", "machinename": "$(uname -n)" } EOF @@ -86,7 +89,7 @@ EOF metrics_json_add_fragment "$json" # Now add a runtime specific environment section if we can - local iskata=$(is_a_kata_runtime "$RUNTIME") + local 
iskata=$(is_a_kata_runtime "${RUNTIME}") if [ "$iskata" == "1" ]; then local rpath="$(command -v kata-runtime)" local json="$(cat << EOF @@ -97,7 +100,7 @@ EOF fi fi - if [ "$CTR_RUNTIME" == "io.containerd.runc.v2" ]; then + if [ "${CTR_RUNTIME}" == "io.containerd.runc.v2" ]; then metrics_json_add_fragment "$json" local output=$(runc -v) local runcversion=$(grep version <<< "$output" | sed 's/runc version //') @@ -106,8 +109,8 @@ EOF "runc-env" : { "Version": { - "Semver": "$runcversion", - "Commit": "$runccommit" + "Semver": "${runcversion}", + "Commit": "${runccommit}" } } EOF @@ -118,10 +121,10 @@ EOF } # Save out the final JSON file -metrics_json_save() { +function metrics_json_save() { - if [ ! -d ${RESULT_DIR} ];then - mkdir -p ${RESULT_DIR} + if [ ! -d "${RESULT_DIR}" ];then + mkdir -p "${RESULT_DIR}" fi local maxelem=$(( ${#json_result_array[@]} - 1 )) @@ -163,12 +166,12 @@ EOF fi } -metrics_json_end_of_system() { +function metrics_json_end_of_system() { system_index=$(( ${#json_result_array[@]})) } # Add a top level (complete) JSON fragment to the data -metrics_json_add_fragment() { +function metrics_json_add_fragment() { local data=$1 # Place on end of array @@ -176,12 +179,12 @@ metrics_json_add_fragment() { } # Prepare to collect up array elements -metrics_json_start_array() { +function metrics_json_start_array() { json_array_array=() } # Add a (complete) element to the current array -metrics_json_add_array_element() { +function metrics_json_add_array_element() { local data=$1 # Place on end of array @@ -189,7 +192,7 @@ metrics_json_add_array_element() { } # Add a fragment to the current array element -metrics_json_add_array_fragment() { +function metrics_json_add_array_fragment() { local data=$1 # Place on end of array @@ -197,7 +200,7 @@ metrics_json_add_array_fragment() { } # Turn the currently registered array fragments into an array element -metrics_json_close_array_element() { +function metrics_json_close_array_element() { local maxelem=$(( 
${#json_array_fragments[@]} - 1 )) local json="$(cat << EOF @@ -221,7 +224,7 @@ EOF } # Close the current array -metrics_json_end_array() { +function metrics_json_end_array() { local name=$1 local maxelem=$(( ${#json_array_array[@]} - 1 )) diff --git a/tests/metrics/machine_learning/README.md b/tests/metrics/machine_learning/README.md new file mode 100644 index 000000000..f55adba20 --- /dev/null +++ b/tests/metrics/machine_learning/README.md @@ -0,0 +1,49 @@ +# Kata Containers Tensorflow Metrics + +Kata Containers provides a series of performance tests using the +TensorFlow reference benchmarks (tf_cnn_benchmarks). +The tf_cnn_benchmarks containers TensorFlow implementations of several +popular convolutional models https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks. + +Currently the TensorFlow benchmark on Kata Containers includes test for +the `AxelNet` and `ResNet50` models. + +## Running the test + +Individual tests can be run by hand, for example: + +``` +$ cd metrics/machine_learning +$ ./tensorflow.sh 25 60 +``` +# Kata Containers Pytorch Metrics + +Based on a suite of Python high performance computing benchmarks that +uses various popular Python HPC libraries using Python + https://github.com/dionhaefner/pyhpc-benchmarks. + +## Running the Pytorch test + +Individual tests can be run by hand, for example: + +``` +$ cd metrics/machine_learning +$ ./tensorflow.sh 40 100 +``` +# Kata Containers Tensorflow `MobileNet` Metrics + +`MobileNets` are small, low-latency, low-power models parameterized to meet the resource +constraints of a variety of use cases. They can be built upon for classification, detection, +embeddings and segmentation similar to how other popular large scale models, such as Inception, are used. +`MobileNets` can be run efficiently on mobile devices with `Tensorflow` Lite. + +Kata Containers provides a test for running `MobileNet V1` inference using Intel-Optimized `Tensorflow`. 
+ +## Running the `Tensorflow` `MobileNet` test +Individual test can be run by hand, for example: + +``` +$ cd metrics/machine_learning +$ ./tensorflow_mobilenet_benchmark.sh 25 60 +``` + diff --git a/tests/metrics/machine_learning/pytorch.sh b/tests/metrics/machine_learning/pytorch.sh new file mode 100755 index 000000000..9958c1690 --- /dev/null +++ b/tests/metrics/machine_learning/pytorch.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +set -e + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +IMAGE="docker.io/library/pytorch:latest" +DOCKERFILE="${SCRIPT_PATH}/pytorch_dockerfile/Dockerfile" +equation_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX) +isoneural_pytorch_file=$(mktemp pytorchresults.XXXXXXXXXX) +NUM_CONTAINERS="$1" +TIMEOUT="$2" +TEST_NAME="pytorch" +CMD_RUN="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/equation_of_state --burnin 20 --device cpu -b pytorch -s 524288 > LOG" +CMD_RUN_ISONEURAL="cd pyhpc-benchmarks-3.0 && python run.py benchmarks/isoneutral_mixing --burnin 20 --device cpu -b pytorch -s 524288 > LOG" +CMD_RESULT="cd pyhpc-benchmarks-3.0 && cat LOG" +CMD_FILE="cat pyhpc-benchmarks-3.0/LOG | grep 'seconds' | wc -l" +PAYLOAD_ARGS="tail -f /dev/null" + +function remove_tmp_file() { + rm -rf "${equation_pytorch_file}" "${isoneural_pytorch_file}" +} + +trap remove_tmp_file EXIT + +function check_containers_are_up() { + local containers_launched=0 + for i in $(seq "${TIMEOUT}") ; do + info "Verify that the containers are running" + containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")" + [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break + sleep 1 + [ "${i}" == "${TIMEOUT}" ] && return 1 + done +} + +function equation_of_state_pytorch_test() { + info "Running Equation of State Pytorch test" + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" 
"${i}" sh -c "${CMD_RUN}" + done + + for i in "${containers[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") + retries="200" + for j in $(seq 1 "${retries}"); do + [ "${check_file}" -eq 1 ] && break + sleep 1 + done + done + + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${equation_pytorch_file}" + done + + local equation_pytorch_results=$(cat "${equation_pytorch_file}" | grep pytorch | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//') + local equation_average_pytorch=$(echo "${equation_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + + local json="$(cat << EOF + { + "Pytorch Equation of State": { + "Result": "${equation_pytorch_results}", + "Average": "${equation_average_pytorch}", + "Units": "s" + } + } +EOF +)" + metrics_json_add_array_element "$json" + +} + +function isoneural_pytorch_test() { + info "Running Isoneural Pytorch test" + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RUN_ISONEURAL}" + done + + for i in "${containers[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") + retries="200" + for j in $(seq 1 "${retries}"); do + [ "${check_file}" -eq 1 ] && break + sleep 1 + done + done + + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULT}" >> "${isoneural_pytorch_file}" + done + + local isoneural_pytorch_results=$(cat "${isoneural_pytorch_file}" | grep pytorch | sed '/Using pytorch version/d' | awk '{print $4}' | tr '\n' ',' | sed 's/.$//') + local isoneural_average_pytorch=$(echo "${isoneural_pytorch_results}" | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + + local json="$(cat << EOF + { + "Pytorch Isoneural": { + "Result": "${isoneural_pytorch_results}", + "Average": 
"${isoneural_average_pytorch}", + "Units": "s" + } + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" + +} + + +function main() { + # Verify enough arguments + if [ $# != 2 ]; then + echo >&2 "error: Not enough arguments [$@]" + help + exit 1 + fi + + local i=0 + local containers=() + local not_started_count="${NUM_CONTAINERS}" + + # Check tools/commands dependencies + cmds=("awk" "docker" "bc") + check_cmds "${cmds[@]}" + check_ctr_images "${IMAGE}" "${DOCKERFILE}" + + init_env + info "Creating ${NUM_CONTAINERS} containers" + + for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" + ((not_started_count--)) + info "$not_started_count remaining containers" + done + + metrics_json_init + metrics_json_start_array + + + # Check that the requested number of containers are running + check_containers_are_up + + equation_of_state_pytorch_test + + isoneural_pytorch_test + + metrics_json_save + + clean_env_ctr + +} +main "$@" diff --git a/tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile b/tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile new file mode 100644 index 000000000..7acdd9280 --- /dev/null +++ b/tests/metrics/machine_learning/pytorch_dockerfile/Dockerfile @@ -0,0 +1,19 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Usage: FROM [image name] +FROM intel/intel-optimized-pytorch:1.12.100 + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential curl git && \ + apt-get remove -y unattended-upgrades && \ + curl -OkL https://github.com/dionhaefner/pyhpc-benchmarks/archive/refs/tags/v3.0.tar.gz && \ + tar -xf v3.0.tar.gz && \ + pip install --no-cache-dir click==8.1.3 && \ + cd pyhpc-benchmarks-3.0 && pip3 install --no-cache-dir --user torch==1.10.0 
+ +CMD ["/bin/bash"] diff --git a/tests/metrics/machine_learning/tensorflow.sh b/tests/metrics/machine_learning/tensorflow.sh new file mode 100755 index 000000000..fc6c1f8c9 --- /dev/null +++ b/tests/metrics/machine_learning/tensorflow.sh @@ -0,0 +1,236 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +set -o pipefail + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +IMAGE="docker.io/library/tensorflow:latest" +DOCKERFILE="${SCRIPT_PATH}/tensorflow_dockerfile/Dockerfile" +BATCH_SIZE="100" +NUM_BATCHES="100" +resnet_tensorflow_file=$(mktemp resnettensorflowresults.XXXXXXXXXX) +alexnet_tensorflow_file=$(mktemp alexnettensorflowresults.XXXXXXXXXX) +NUM_CONTAINERS="$1" +TIMEOUT="$2" +TEST_NAME="tensorflow" +PAYLOAD_ARGS="tail -f /dev/null" +# Options to control the start of the workload using a trigger-file +dst_dir="/host" +src_dir=$(mktemp --tmpdir -d tensorflow.XXXXXXXXXX) +MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro" +# CMD points to the script that starts the workload +alexnet_start_script="alexnet_start.sh" +resnet_start_script="resnet_start.sh" +CMD_RESNET="$dst_dir/$resnet_start_script" +CMD_ALEXNET="$dst_dir/$alexnet_start_script" +timeout=600 +INITIAL_NUM_PIDS=1 +CMD_FILE="cat alexnet_results | grep 'total images' | wc -l" +RESNET_CMD_FILE="cat resnet_results | grep 'total images' | wc -l" + +function remove_tmp_file() { + rm -rf "${resnet_tensorflow_file}" "${alexnet_tensorflow_file}" +} + +trap remove_tmp_file EXIT + +function help() { +cat << EOF +Usage: $0 + Description: + This script launches n number of containers + to run the tf cnn benchmarks using a Tensorflow + container. + Options: + : Number of containers to run. + : Timeout to launch the containers. 
+EOF +} + +function create_resnet_start_script() { + local script="${src_dir}/${resnet_start_script}" + rm -rf "${script}" + +cat <>"${script}" +#!/bin/bash +python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py -data_format=NHWC --device cpu --batch_size=${BATCH_SIZE} --num_batches=${NUM_BATCHES} > resnet_results +EOF + chmod +x "${script}" +} + +function create_alexnet_start_script() { + local script="${src_dir}/${alexnet_start_script}" + rm -rf "${script}" + +cat <>"${script}" +#!/bin/bash +python benchmarks/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --num_batches=${NUM_BATCHES} --device=cpu --batch_size=${BATCH_SIZE} --forward_only=true --model=alexnet --data_format=NHWC > alexnet_results +EOF + chmod +x "${script}" +} + +function tensorflow_test() { + info "Copy Resnet Tensorflow test" + local pids=() + local j=0 + for i in "${containers[@]}"; do + $(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESNET}")& + pids["${j}"]=$! + ((j++)) + done + + # wait for all pids + for pid in ${pids[*]}; do + wait "${pid}" + done + + info "All containers are running the workload..." + + for i in "${containers[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${RESNET_CMD_FILE}") + retries="300" + for j in $(seq 1 "${retries}"); do + [ "${check_file}" -eq "1" ] && break + sleep 1 + done + done + + info "Copy Alexnet Tensorflow test" + local pids=() + local j=0 + for i in "${containers[@]}"; do + $(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_ALEXNET}")& + pids["${j}"]=$! 
+ ((j++)) + done + + # wait for all pids + for pid in ${pids[*]}; do + wait "${pid}" + done + + for i in "${containers[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") + retries="300" + for j in $(seq 1 "${retries}"); do + [ "${check_file}" -eq "1" ] && break + sleep 1 + done + done + + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat resnet_results" >> "${resnet_tensorflow_file}" + done + + local res_results=$(cat "${resnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') + local resnet_results=$(printf "%.0f\n" "${res_results}") + local res_average=$(echo "${resnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) + local average_resnet=$(printf "%.0f\n" "${res_average}") + + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "cat alexnet_results" >> "${alexnet_tensorflow_file}" + done + + local alex_results=$(cat "${alexnet_tensorflow_file}" | grep "total images/sec" | cut -d ":" -f2 | sed -e 's/^[ \t]*//' | tr '\n' ',' | sed 's/.$//') + local alexnet_results=$(printf "%.0f\n" "${alex_results}") + local alex_average=$(echo "${alexnet_results}" | sed "s/,/+/g;s/.*/(&)\/${NUM_CONTAINERS}/g" | bc -l) + local average_alexnet=$(printf "%.0f\n" "${alex_average}") + + local json="$(cat << EOF + { + "resnet": { + "Result": ${resnet_results}, + "Average": ${average_resnet}, + "Units": "images/s" + }, + "alexnet": { + "Result": ${alexnet_results}, + "Average": ${average_alexnet}, + "Units": "images/s" + } + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" +} + +function check_containers_are_up() { + local containers_launched=0 + for i in $(seq "${TIMEOUT}") ; do + info "Verify that the containers are running" + containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")" + [ 
"${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break + sleep 1 + [ "${i}" == "${TIMEOUT}" ] && return 1 + done +} + +function main() { + # Verify enough arguments + if [ $# != 2 ]; then + echo >&2 "error: Not enough arguments [$@]" + help + exit 1 + fi + + local i=0 + local containers=() + local not_started_count="${NUM_CONTAINERS}" + + # Check tools/commands dependencies + cmds=("awk" "docker" "bc") + check_cmds "${cmds[@]}" + check_ctr_images "${IMAGE}" "${DOCKERFILE}" + + init_env + create_resnet_start_script + create_alexnet_start_script + + info "Creating ${NUM_CONTAINERS} containers" + + for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" + ((not_started_count--)) + info "$not_started_count remaining containers" + done + + metrics_json_init + metrics_json_start_array + + # Check that the requested number of containers are running + check_containers_are_up + + # Check that the requested number of containers are running + local timeout_launch="10" + check_containers_are_up & pid=$! + (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! 
+ + if wait "${pid}" 2>/dev/null; then + pkill -HUP -P "${pid_tout}" + wait "${pid_tout}" + else + warn "Time out exceeded" + return 1 + fi + + # Get the initial number of pids in a single container before the workload starts + INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | grep pids.current | xargs | cut -d ' ' -f 2) + ((INITIAL_NUM_PIDS++)) + + tensorflow_test + + metrics_json_save + + rm -rf "${src_dir}" + + clean_env_ctr +} +main "$@" diff --git a/tests/metrics/machine_learning/tensorflow_dockerfile/Dockerfile b/tests/metrics/machine_learning/tensorflow_dockerfile/Dockerfile new file mode 100644 index 000000000..a8c73d5f6 --- /dev/null +++ b/tests/metrics/machine_learning/tensorflow_dockerfile/Dockerfile @@ -0,0 +1,18 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Usage: FROM [image name] +FROM intel/intel-optimized-tensorflow:2.9.1 + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends build-essential git && \ + apt-get remove -y unattended-upgrades && \ + git clone https://github.com/tensorflow/benchmarks + +CMD ["/bin/bash"] diff --git a/tests/metrics/machine_learning/tensorflow_mobilenet_benchmark.sh b/tests/metrics/machine_learning/tensorflow_mobilenet_benchmark.sh new file mode 100755 index 000000000..63b42c1e7 --- /dev/null +++ b/tests/metrics/machine_learning/tensorflow_mobilenet_benchmark.sh @@ -0,0 +1,190 @@ +#!/bin/bash +# +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +set -o pipefail + +# General env +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +IMAGE="docker.io/library/tensorflowmobilenet:latest" +DOCKERFILE="${SCRIPT_PATH}/tensorflow_mobilenet_dockerfile/Dockerfile" +tensorflow_file=$(mktemp tensorflowresults.XXXXXXXXXX) +NUM_CONTAINERS="$1" +TIMEOUT="$2" 
+TEST_NAME="tensorflow-intelai" +PAYLOAD_ARGS="tail -f /dev/null" +TESTDIR="${TESTDIR:-/testdir}" +# Options to control the start of the workload using a trigger-file +dst_dir="/host" +src_dir=$(mktemp --tmpdir -d tensorflowai.XXXXXXXXXX) +MOUNT_OPTIONS="type=bind,src=$src_dir,dst=$dst_dir,options=rbind:ro" +start_script="mobilenet_start.sh" +# CMD points to the script that starts the workload +CMD="$dst_dir/$start_script" +guest_trigger_file="$dst_dir/$trigger_file" +host_trigger_file="$src_dir/$trigger_file" +timeout=600 +INITIAL_NUM_PIDS=1 +CMD_FILE="cat results | grep 'Average Throughput' | wc -l" +CMD_RESULTS="cat results | grep 'Average Throughput' | cut -d':' -f2 | cut -d' ' -f2 | tr '\n' ','" + +function remove_tmp_file() { + rm -rf "${tensorflow_file}" +} + +trap remove_tmp_file EXIT + +function help() { +cat << EOF +Usage: $0 + Description: + This script launches n number of containers + to run the tf cnn benchmarks using a Tensorflow + container. + Options: + : Number of containers to run. + : Timeout to launch the containers. 
+EOF +} + +function create_start_script() { + local script="${src_dir}/${start_script}" + rm -rf "${script}" + +cat <>"${script}" +#!/bin/bash +python3.8 models/benchmarks/launch_benchmark.py --benchmark-only --framework tensorflow --model-name mobilenet_v1 --mode inference --precision bfloat16 --batch-size 100 --in-graph /mobilenet_v1_1.0_224_frozen.pb --num-intra-threads 16 --num-inter-threads 1 --verbose --\ input_height=224 input_width=224 warmup_steps=20 steps=20 \ input_layer=input output_layer=MobilenetV1/Predictions/Reshape_1 > results +EOF + chmod +x "${script}" +} + +function mobilenet_test() { + local CMD_EXPORT_VAR="export KMP_AFFINITY=granularity=fine,verbose,compact && export OMP_NUM_THREADS=16" + + info "Export environment variables" + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_EXPORT_VAR}" + done + + info "Running Mobilenet Tensorflow test" + local pids=() + local j=0 + for i in "${containers[@]}"; do + $(sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD}")& + pids["${j}"]=$! + ((j++)) + done + + # wait for all pids + for pid in ${pids[*]}; do + wait "${pid}" + done + + touch "${host_trigger_file}" + info "All containers are running the workload..." 
+ + for i in "${containers[@]}"; do + check_file=$(sudo -E "${CTR_EXE}" t exec -d --exec-id "$(random_name)" "${i}" sh -c "${CMD_FILE}") + retries="30" + for j in $(seq 1 "${retries}"); do + [ "${check_file}" -eq "1" ] && break + sleep 1 + done + done + + for i in "${containers[@]}"; do + sudo -E "${CTR_EXE}" t exec --exec-id "$(random_name)" "${i}" sh -c "${CMD_RESULTS}" >> "${tensorflow_file}" + done + + local mobilenet_results=$(cat "${tensorflow_file}" | sed 's/.$//') + local average_mobilenet=$(echo "${mobilenet_results}" | sed 's/.$//' | sed "s/,/+/g;s/.*/(&)\/$NUM_CONTAINERS/g" | bc -l) + local json="$(cat << EOF + { + "Mobilenet": { + "Result": "${mobilenet_results}", + "Average": "${average_mobilenet}", + "Units": "images/s" + } + } +EOF +)" + metrics_json_add_array_element "$json" + metrics_json_end_array "Results" +} + +function check_containers_are_up() { + local containers_launched=0 + for i in $(seq "${TIMEOUT}") ; do + info "Verify that the containers are running" + containers_launched="$(sudo ${CTR_EXE} t list | grep -c "RUNNING")" + [ "${containers_launched}" -eq "${NUM_CONTAINERS}" ] && break + sleep 1 + [ "${i}" == "${TIMEOUT}" ] && return 1 + done +} + +function main() { + # Verify enough arguments + if [ $# != 2 ]; then + echo >&2 "error: Not enough arguments [$@]" + help + exit 1 + fi + + local i=0 + local containers=() + local not_started_count="${NUM_CONTAINERS}" + + # Check tools/commands dependencies + cmds=("awk" "docker" "bc") + check_cmds "${cmds[@]}" + check_ctr_images "${IMAGE}" "${DOCKERFILE}" + + init_env + create_start_script + + info "Creating ${NUM_CONTAINERS} containers" + + for ((i=1; i<= "${NUM_CONTAINERS}"; i++)); do + containers+=($(random_name)) + sudo -E "${CTR_EXE}" run -d --runtime "${CTR_RUNTIME}" --mount="${MOUNT_OPTIONS}" "${IMAGE}" "${containers[-1]}" sh -c "${PAYLOAD_ARGS}" + ((not_started_count--)) + info "${not_started_count} remaining containers" + done + + metrics_json_init + metrics_json_start_array + + # Check 
that the requested number of containers are running + check_containers_are_up + + # Check that the requested number of containers are running + local timeout_launch="10" + check_containers_are_up & pid=$! + (sleep "${timeout_launch}" && kill -HUP "${pid}") 2>/dev/null & pid_tout=$! + + if wait "${pid}" 2>/dev/null; then + pkill -HUP -P "${pid_tout}" + wait "${pid_tout}" + else + warn "Time out exceeded" + return 1 + fi + + # Get the initial number of pids in a single container before the workload starts + INITIAL_NUM_PIDS=$(sudo -E "${CTR_EXE}" t metrics "${containers[-1]}" | grep pids.current | xargs | cut -d ' ' -f 2) + ((INITIAL_NUM_PIDS++)) + + mobilenet_test + + metrics_json_save + + sudo rm -rf "${src_dir}" + + clean_env_ctr +} +main "$@" diff --git a/tests/metrics/machine_learning/tensorflow_mobilenet_dockerfile/Dockerfile b/tests/metrics/machine_learning/tensorflow_mobilenet_dockerfile/Dockerfile new file mode 100644 index 000000000..fd1bd31ec --- /dev/null +++ b/tests/metrics/machine_learning/tensorflow_mobilenet_dockerfile/Dockerfile @@ -0,0 +1,21 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# Usage: FROM [image name] +FROM ubuntu:20.04 + +ENV DEBIAN_FRONTEND=noninteractive + +# Version of the Dockerfile +LABEL DOCKERFILE_VERSION="1.0" + +RUN apt-get update && \ + apt-get install -y wget nano curl build-essential git && \ + apt-get install -y python3.8 python3-pip && \ + pip install --no-cache-dir intel-tensorflow-avx512==2.8.0 && \ + pip install --no-cache-dir protobuf==3.20.* && \ + wget -q https://storage.googleapis.com/intel-optimized-tensorflow/models/v1_8/mobilenet_v1_1.0_224_frozen.pb && \ + git clone https://github.com/IntelAI/models.git + +CMD ["/bin/bash"] diff --git a/tests/metrics/storage/README.md b/tests/metrics/storage/README.md new file mode 100644 index 000000000..b9d1cc68a --- /dev/null +++ b/tests/metrics/storage/README.md @@ -0,0 +1,11 @@ +# Kata Containers storage I/O 
tests +The metrics tests in this directory are designed to be used to assess storage IO. +## `Blogbench` test +The `blogbench` script is based on the `blogbench` program which is designed to emulate a busy blog server with a number of concurrent +threads performing a mixture of reads, writes and rewrites. +### Running the `blogbench` test +The `blogbench` test can be run by hand, for example: +``` +$ cd metrics +$ bash storage/blogbench.sh +``` diff --git a/tests/metrics/storage/blogbench.sh b/tests/metrics/storage/blogbench.sh index 19a960103..c5913f931 100755 --- a/tests/metrics/storage/blogbench.sh +++ b/tests/metrics/storage/blogbench.sh @@ -36,8 +36,10 @@ function main() { init_env check_cmds "${cmds[@]}" check_ctr_images "${IMAGE}" "${DOCKERFILE}" + sudo systemctl restart containerd metrics_json_init + info "Running Blogbench test" local output=$(sudo -E ${CTR_EXE} run --rm --runtime=${CTR_RUNTIME} ${IMAGE} test ${CMD}) # Save configuration @@ -65,6 +67,7 @@ EOF metrics_json_end_array "Config" # Save results + info "Saving Blogbench results" metrics_json_start_array local writes=$(tail -2 <<< "${output}" | head -1 | awk '{print $5}') @@ -83,11 +86,11 @@ EOF local json="$(cat << EOF { "write": { - "Result" : "${writes}", + "Result" : ${writes}, "Units" : "items" }, "read": { - "Result" : "${reads}", + "Result" : ${reads}, "Units" : "items" }, "Nb blogs": { diff --git a/tests/metrics/storage/blogbench_dockerfile/Dockerfile b/tests/metrics/storage/blogbench_dockerfile/Dockerfile index 593063798..08d06cec5 100644 --- a/tests/metrics/storage/blogbench_dockerfile/Dockerfile +++ b/tests/metrics/storage/blogbench_dockerfile/Dockerfile @@ -11,6 +11,8 @@ FROM docker.io/library/ubuntu:latest # Version of the Dockerfile LABEL DOCKERFILE_VERSION="1.0" +ENV DEBIAN_FRONTEND=noninteractive + # URL for blogbench test and blogbench version ENV BLOGBENCH_URL "https://download.pureftpd.org/pub/blogbench" ENV BLOGBENCH_VERSION 1.1 diff --git 
a/tests/metrics/storage/fio-k8s/.gitignore b/tests/metrics/storage/fio-k8s/.gitignore new file mode 100644 index 000000000..1caa97b15 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/.gitignore @@ -0,0 +1 @@ +./cmd/fiotest/fio-k8s diff --git a/tests/metrics/storage/fio-k8s/Makefile b/tests/metrics/storage/fio-k8s/Makefile new file mode 100644 index 000000000..ba96203ba --- /dev/null +++ b/tests/metrics/storage/fio-k8s/Makefile @@ -0,0 +1,28 @@ +# +# Copyright (c) 2021-2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +MKFILE_DIR := $(dir $(MKFILE_PATH)) + +build: + make -C $(MKFILE_DIR)/cmd/fiotest/ gomod + make -C $(MKFILE_DIR)/cmd/fiotest/ build + +test-report: + $(MKFILE_DIR)/scripts/dax-compare-test/report/gen-html-fio-report.sh $(MKFILE_DIR)/cmd/fiotest/test-results/ + +test-report-interactive: + $(MKFILE_DIR)/scripts/dax-compare-test/report/run-docker-jupyter-server.sh $(MKFILE_DIR)/cmd/fiotest/test-results/ + +test: build + make -C $(MKFILE_DIR)/cmd/fiotest/ run + make test-report + +run: build + make -C $(MKFILE_DIR)/scripts/dax-compare-test/ run + +test-ci: build + make -C $(MKFILE_DIR)/cmd/fiotest/ runci diff --git a/tests/metrics/storage/fio-k8s/cmd/fiotest/Makefile b/tests/metrics/storage/fio-k8s/cmd/fiotest/Makefile new file mode 100644 index 000000000..4c8a27c69 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/cmd/fiotest/Makefile @@ -0,0 +1,24 @@ +# +# Copyright (c) 2021-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +MKFILE_DIR := $(dir $(MKFILE_PATH)) + +build: + GO111MODULE=on go build + +run: build + $(MKFILE_DIR)/fio-k8s --debug --fio.size 10M --output-dir test-results --test-name kata $(MKFILE_DIR)/../../configs/example-config/ + $(MKFILE_DIR)/fio-k8s --debug --fio.size 10M --output-dir test-results --test-name runc --container-runtime runc $(MKFILE_DIR)/../../configs/example-config/ + 
+gomod: + go mod edit -replace=github.com/kata-containers/kata-containers/tests/metrics/k8s=../../pkg/k8s + go mod edit -replace=github.com/kata-containers/kata-containers/tests/metrics/exec=../../pkg/exec + go mod edit -replace=github.com/kata-containers/kata-containers/tests/metrics/env=../../pkg/env + go mod tidy + +runci: build + $(MKFILE_DIR)/fio-k8s --debug --fio.size 10M --output-dir test-results --test-name kata $(MKFILE_DIR)/../../configs/example-config/ diff --git a/tests/metrics/storage/fio-k8s/cmd/fiotest/go.mod b/tests/metrics/storage/fio-k8s/cmd/fiotest/go.mod new file mode 100644 index 000000000..4d6f8fb45 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/cmd/fiotest/go.mod @@ -0,0 +1,24 @@ +module github.com/kata-containers/kata-containers/tests/metrics/storage/fio-k8s + +go 1.19 + +replace github.com/kata-containers/kata-containers/tests/metrics/exec => ../../pkg/exec + +replace github.com/kata-containers/kata-containers/tests/metrics/k8s => ../../pkg/k8s + +replace github.com/kata-containers/kata-containers/tests/metrics/env => ../../pkg/env + +require ( + github.com/kata-containers/kata-containers/tests/metrics/env v0.0.0-00010101000000-000000000000 + github.com/kata-containers/kata-containers/tests/metrics/exec v0.0.0-00010101000000-000000000000 + github.com/kata-containers/kata-containers/tests/metrics/k8s v0.0.0-00010101000000-000000000000 + github.com/pkg/errors v0.9.1 + github.com/sirupsen/logrus v1.9.3 + github.com/urfave/cli v1.22.14 +) + +require ( + github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect +) diff --git a/tests/metrics/storage/fio-k8s/cmd/fiotest/go.sum b/tests/metrics/storage/fio-k8s/cmd/fiotest/go.sum new file mode 100644 index 000000000..45fbeb4a0 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/cmd/fiotest/go.sum @@ -0,0 +1,31 @@ +github.com/BurntSushi/toml v1.3.2/go.mod 
h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= +github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/urfave/cli v1.22.14 h1:ebbhrRiGK2i4naQJr+1Xj92HXZCrK7MsyTS/ob3HnAk= 
+github.com/urfave/cli v1.22.14/go.mod h1:X0eDS6pD6Exaclxm99NJ3FiCDRED7vIHpx2mDOHLvkA= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 h1:0A+M6Uqn+Eje4kHMK80dtF3JCXC4ykBgQG4Fe06QRhQ= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/tests/metrics/storage/fio-k8s/cmd/fiotest/main.go b/tests/metrics/storage/fio-k8s/cmd/fiotest/main.go new file mode 100644 index 000000000..de7e7b2ff --- /dev/null +++ b/tests/metrics/storage/fio-k8s/cmd/fiotest/main.go @@ -0,0 +1,373 @@ +// Copyright (c) 2021-2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "encoding/csv" + "encoding/json" + "fmt" + "os" + "path" + "path/filepath" + "strings" + "time" + + env "github.com/kata-containers/kata-containers/tests/metrics/env" + exec "github.com/kata-containers/kata-containers/tests/metrics/exec" + "github.com/kata-containers/kata-containers/tests/metrics/k8s" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var log = logrus.New() + +var ( + optContainerRuntime = "container-runtime" + optDebug = "debug" + optOutputDir = "output-dir" + optTestName = "test-name" + // fio options + optFioBlockSize = "fio.block-size" + optFioDirect = "fio.direct" + optFioIoDepth = "fio.iodepth" + optFioSize = "fio.size" + optFioNumJobs = "fio.numjobs" +) + +type RwFioOp struct { + BandwidthKb int `json:"bw"` + IOPS float64 `json:"iops"` +} + +type fioResult struct { + GlobalOptions struct { + IOEngine string 
`json:"ioengine"` + RW string `json:"rw"` + } `json:"global options"` + Jobs []struct { + JobName string `json:"jobname"` + Read RwFioOp `json:"read"` + Write RwFioOp `json:"write"` + } `json:"jobs"` +} + +// Run fio in k8s metrics test in K8s +func (c fioTestConfig) run() (result fioResult, err error) { + log.Infof("Running fio config: %s", c.jobFile) + + pod := k8s.Pod{YamlPath: c.k8sYaml} + + log.Infof("Delete pod if already created") + err = pod.Delete() + if err != nil { + return result, err + } + + log.Infof("Create pod: %s", pod.YamlPath) + err = pod.Run() + if err != nil { + return result, err + } + + defer func() { + log.Info("Deleting pod") + delErr := pod.Delete() + if delErr != nil { + log.Error(delErr) + if err != nil { + err = errors.Wrapf(err, "Could not delete pod after: %s", delErr) + } + } + }() + + destDir := "/home/fio-jobs" + _, err = pod.Exec("mkdir " + destDir) + if err != nil { + return result, err + } + + dstJobFile := path.Join(destDir, "jobFile") + err = pod.CopyFromHost(c.jobFile, dstJobFile) + if err != nil { + return result, err + } + + _, err = pod.Exec("apt update") + if err != nil { + return result, err + } + _, err = pod.Exec("apt install -y fio") + if err != nil { + return result, err + } + + err = env.DropCaches() + if err != nil { + return result, err + } + + var directStr string + if c.direct { + directStr = "1" + } else { + directStr = "0" + } + + cmdFio := "fio" + cmdFio += " --append-terse " + cmdFio += " --blocksize=" + c.blocksize + cmdFio += " --direct=" + directStr + cmdFio += " --directory=" + c.directory + cmdFio += " --iodepth=" + c.iodepth + cmdFio += " --numjobs=" + c.numjobs + cmdFio += " --runtime=" + c.runtime + cmdFio += " --size=" + c.size + cmdFio += " --output-format=json" + cmdFio += " " + dstJobFile + + log.Infof("Exec fio") + output, err := pod.Exec(cmdFio, k8s.ExecOptShowStdOut()) + if err != nil { + return result, err + } + err = json.Unmarshal([]byte(output), &result) + if err != nil { + return result, 
errors.Wrapf(err, "failed to unmarshall output : %s", output) + } + + log.Infof("ioengine:%s", result.GlobalOptions.IOEngine) + log.Infof("rw:%s", result.GlobalOptions.RW) + if len(result.Jobs) == 0 { + return result, errors.New("No jobs found after parsing fio results") + } + + testDir := path.Join(c.outputDir, filepath.Base(c.jobFile)) + err = os.MkdirAll(testDir, 0775) + if err != nil { + return result, errors.Wrapf(err, "failed to create test directory for :%s", c.jobFile) + } + outputFile := path.Join(testDir, "output.json") + log.Infof("Store results output in : %s", outputFile) + + err = os.WriteFile(outputFile, []byte(output), 0644) + if err != nil { + return result, err + } + + return result, nil +} + +type fioTestConfig struct { + //test options + k8sYaml string + containerRuntime string + outputDir string + + //fio options + blocksize string + directory string + iodepth string + numjobs string + jobFile string + loops string + runtime string + size string + + direct bool +} + +func runFioJobs(testDirPath string, cfg fioTestConfig) (results []fioResult, err error) { + fioJobsDir, err := filepath.Abs(path.Join(testDirPath, "fio-jobs")) + if err != nil { + return results, err + } + + files, err := os.ReadDir(fioJobsDir) + if err != nil { + log.Fatal(err) + return results, err + } + + if cfg.containerRuntime == "" { + return results, errors.New("containerRuntime is empty") + } + + podYAMLName := cfg.containerRuntime + ".yaml" + cfg.k8sYaml = path.Join(testDirPath, podYAMLName) + + if len(files) == 0 { + return results, errors.New("No fio configs found") + } + + for _, file := range files { + cfg.jobFile = path.Join(fioJobsDir, file.Name()) + r, err := cfg.run() + if err != nil { + return results, err + } + results = append(results, r) + + log.Infof("workload:%s", r.Jobs[0].JobName) + log.Infof("bw_r:%d", r.Jobs[0].Read.BandwidthKb) + log.Infof("IOPS_r:%f", r.Jobs[0].Read.IOPS) + log.Infof("bw_w:%d", r.Jobs[0].Write.BandwidthKb) + log.Infof("IOPS_w:%f", 
r.Jobs[0].Write.IOPS) + + waitTime := 5 + log.Debugf("Sleep %d seconds(if not wait sometimes create another pod timesout)", waitTime) + time.Sleep(time.Duration(waitTime) * time.Second) + } + return results, err + +} + +func generateResultsView(testName string, results []fioResult, outputDir string) error { + outputFile := path.Join(outputDir, "results.csv") + f, err := os.Create(outputFile) + if err != nil { + return err + } + defer f.Close() + + log.Infof("Creating results output in %s", outputFile) + + w := csv.NewWriter(f) + + headers := []string{"NAME", "WORKLOAD", "bw_r", "bw_w", "IOPS_r", "IOPS_w"} + err = w.Write(headers) + if err != nil { + return err + } + + for _, r := range results { + if len(r.Jobs) == 0 { + return errors.Errorf("fio result has no jobs: %v", r) + } + row := []string{testName} + row = append(row, r.Jobs[0].JobName) + row = append(row, fmt.Sprintf("%d", r.Jobs[0].Read.BandwidthKb)) + row = append(row, fmt.Sprintf("%d", r.Jobs[0].Write.BandwidthKb)) + row = append(row, fmt.Sprintf("%f", r.Jobs[0].Read.IOPS)) + row = append(row, fmt.Sprintf("%f", r.Jobs[0].Write.IOPS)) + if err := w.Write(row); err != nil { + return err + } + } + + w.Flush() + + return w.Error() +} + +func main() { + + app := &cli.App{ + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: optDebug, + Usage: "Logs in debug level", + }, + &cli.StringFlag{ + Name: optTestName, + Value: "kata-fio-test", + Usage: "Change the fio test name for reports", + }, + &cli.StringFlag{ + Name: optOutputDir, + Value: ".", + Usage: "Use a file to store results", + }, + &cli.StringFlag{ + Name: optContainerRuntime, + Value: "kata", + Usage: "Choose the runtime to use", + }, + //fio options + &cli.StringFlag{ + Name: optFioSize, + Value: "200M", + Usage: "File size to use for tests", + }, + &cli.StringFlag{ + Name: optFioBlockSize, + Value: "4K", + Usage: "Block size for fio tests", + }, + &cli.BoolFlag{ + Name: optFioDirect, + Usage: "Use direct io", + }, + &cli.StringFlag{ + Name: optFioIoDepth, + 
Value: "16", + Usage: "Number of I/O units to keep in flight against the file", + }, + &cli.StringFlag{ + Name: optFioNumJobs, + Value: "1", + Usage: "Number of clones (processes/threads performing the same workload) of this job", + }, + }, + Action: func(c *cli.Context) error { + jobsDir := c.Args().First() + + if jobsDir == "" { + cli.SubcommandHelpTemplate = strings.Replace(cli.SubcommandHelpTemplate, "[arguments...]", "", -1) + cli.ShowCommandHelp(c, "") + return errors.New("Missing ") + } + + if c.Bool(optDebug) { + log.SetLevel(logrus.DebugLevel) + k8s.Debug = true + env.Debug = true + } + + exec.SetLogger(log) + k8s.SetLogger(log) + env.SetLogger(log) + + testName := c.String(optTestName) + + outputDir, err := filepath.Abs(path.Join(c.String(optOutputDir), testName)) + if err != nil { + return err + } + + cfg := fioTestConfig{ + blocksize: c.String(optFioBlockSize), + direct: c.Bool(optFioDirect), + directory: ".", + iodepth: c.String(optFioIoDepth), + loops: "3", + numjobs: c.String(optFioNumJobs), + runtime: "20", + size: c.String(optFioSize), + containerRuntime: c.String(optContainerRuntime), + outputDir: outputDir, + } + + log.Infof("Results will be created in %s", cfg.outputDir) + + err = os.MkdirAll(cfg.outputDir, 0775) + if err != nil { + return err + } + + results, err := runFioJobs(jobsDir, cfg) + if err != nil { + return err + } + + return generateResultsView(c.String(optTestName), results, outputDir) + }, + } + + err := app.Run(os.Args) + if err != nil { + log.Fatal(err) + } + +} diff --git a/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-async.job b/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-async.job new file mode 100644 index 000000000..9e0edb96c --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-async.job @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2022 Intel Corporation +[global] +name=io_uring +filename=fio-file +rw=randrw 
+rwmixread=75 +ioengine=io_uring + +[randrw-io_uring] diff --git a/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-libaio.job b/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-libaio.job new file mode 100644 index 000000000..327852e44 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-libaio.job @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randrw-libaio +filename=fio-file +rw=randrw +rwmixread=75 +ioengine=libaio + +[randrw-libaio] diff --git a/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-sync.job b/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-sync.job new file mode 100644 index 000000000..3f7f2b6ed --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/example-config/fio-jobs/randrw-sync.job @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2022 Intel Corporation +[global] +name=sync +filename=fio-file +rw=randrw +rwmixread=75 +ioengine=sync + +[randrw-sync] diff --git a/tests/metrics/storage/fio-k8s/configs/example-config/kata.yaml b/tests/metrics/storage/fio-k8s/configs/example-config/kata.yaml new file mode 100644 index 000000000..08f397dea --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/example-config/kata.yaml @@ -0,0 +1,16 @@ +## Copyright (c) 2021 Intel Corporation +# +## SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: iometrics +spec: + runtimeClassName: kata + containers: + - name: iometrics + image: ubuntu:latest + # Just spin & wait forever + command: [ "/bin/bash", "-c", "--" ] + args: [ "sleep infinity" ] diff --git a/tests/metrics/storage/fio-k8s/configs/example-config/runc.yaml b/tests/metrics/storage/fio-k8s/configs/example-config/runc.yaml new file mode 100644 index 000000000..4fd96d3c1 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/example-config/runc.yaml @@ -0,0 +1,15 @@ 
+## Copyright (c) 2021 Intel Corporation +# +## SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: iometrics +spec: + containers: + - name: iometrics + image: ubuntu:latest + # Just spin & wait forever + command: [ "/bin/bash", "-c", "--" ] + args: [ "sleep infinity" ] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randread-libaio.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randread-libaio.job new file mode 100644 index 000000000..5ab226061 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randread-libaio.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randread-libaio +filename=fio-file +rw=randread +ioengine=libaio + +[randread-libaio] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randread-mmpap.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randread-mmpap.job new file mode 100644 index 000000000..cd79854f2 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randread-mmpap.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randread-mmap +rw=randread +ioengine=mmap + +[randread-mmap] +filename=fio-file diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randrw-libaio.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randrw-libaio.job new file mode 100644 index 000000000..327852e44 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randrw-libaio.job @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randrw-libaio +filename=fio-file +rw=randrw +rwmixread=75 +ioengine=libaio + +[randrw-libaio] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randrw-mmap.job 
b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randrw-mmap.job new file mode 100644 index 000000000..cf0c1288a --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randrw-mmap.job @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randrw-mmap +rw=randrw +rwmixread=75 +ioengine=mmap + +[randrw-mmap] +filename=fio-file diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randwrite-libaio.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randwrite-libaio.job new file mode 100644 index 000000000..ef3bfc5a5 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randwrite-libaio.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randwrite-libaio +filename=fio-file +rw=randwrite +ioengine=libaio + +[randwrite-libaio] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randwrite-mmap.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randwrite-mmap.job new file mode 100644 index 000000000..0d4f6ef73 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/randwrite-mmap.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=randwrite-mmap +rw=randwrite +ioengine=mmap + +[randwrite-mmap] +filename=fio-file diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-libaio.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-libaio.job new file mode 100644 index 000000000..6f8a16a3f --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-libaio.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=seqread-libaio +filename=fio-file +rw=read +ioengine=libaio + +[seqread-libaio] diff --git 
a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-mmap.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-mmap.job new file mode 100644 index 000000000..9829726d3 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-mmap.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=seqread-mmap +rw=read +ioengine=mmap + +[seqread-mmap] +filename=fio-file diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-psync.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-psync.job new file mode 100644 index 000000000..a1f54ca7c --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqread-psync.job @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=seqread-psync +filename=fio-file +rw=read + +[seqread-psync] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqwrite-libaio.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqwrite-libaio.job new file mode 100644 index 000000000..9927e05c9 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqwrite-libaio.job @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=seqwrite-libaio +filename=fio-file +rw=write +ioengine=libaio + +[seqwrite-libaio] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqwrite-mmap.job b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqwrite-mmap.job new file mode 100644 index 000000000..e3485db4f --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/fio-jobs/seqwrite-mmap.job @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2021 Intel Corporation +[global] +name=seqwrite-mmap +filename=fio-file +rw=write +ioengine=mmap + +[seqwrite-mmap] 
+filename=fio-file diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/kata.yaml b/tests/metrics/storage/fio-k8s/configs/test-config/kata.yaml new file mode 100644 index 000000000..08f397dea --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/kata.yaml @@ -0,0 +1,16 @@ +## Copyright (c) 2021 Intel Corporation +# +## SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: iometrics +spec: + runtimeClassName: kata + containers: + - name: iometrics + image: ubuntu:latest + # Just spin & wait forever + command: [ "/bin/bash", "-c", "--" ] + args: [ "sleep infinity" ] diff --git a/tests/metrics/storage/fio-k8s/configs/test-config/runc.yaml b/tests/metrics/storage/fio-k8s/configs/test-config/runc.yaml new file mode 100644 index 000000000..4fd96d3c1 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/configs/test-config/runc.yaml @@ -0,0 +1,15 @@ +## Copyright (c) 2021 Intel Corporation +# +## SPDX-License-Identifier: Apache-2.0 +# +apiVersion: v1 +kind: Pod +metadata: + name: iometrics +spec: + containers: + - name: iometrics + image: ubuntu:latest + # Just spin & wait forever + command: [ "/bin/bash", "-c", "--" ] + args: [ "sleep infinity" ] diff --git a/tests/metrics/storage/fio-k8s/pkg/env/Makefile b/tests/metrics/storage/fio-k8s/pkg/env/Makefile new file mode 100644 index 000000000..a1c0a7779 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/env/Makefile @@ -0,0 +1,9 @@ +# +# Copyright (c) 2021-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +gomod: + GO111MODULE=on go mod edit -replace=github.com/kata-containers/kata-containers/tests/metrics/exec=../exec + GO111MODULE=on go mod tidy diff --git a/tests/metrics/storage/fio-k8s/pkg/env/env.go b/tests/metrics/storage/fio-k8s/pkg/env/env.go new file mode 100644 index 000000000..034127f96 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/env/env.go @@ -0,0 +1,38 @@ +// Copyright (c) 2021-2023 Intel Corporation +// +// 
SPDX-License-Identifier: Apache-2.0 +package env + +import ( + exec "github.com/kata-containers/kata-containers/tests/metrics/exec" +) + +// logger interface for pkg +var log logger +var Debug bool = false + +type logger interface { + Infof(string, ...interface{}) + Debugf(string, ...interface{}) + Errorf(string, ...interface{}) +} + +func SetLogger(l logger) { + log = l +} + +var sysDropCachesPath = "/proc/sys/vm/drop_caches" + +func DropCaches() (err error) { + log.Infof("drop caches") + _, err = exec.ExecCmd("sync", Debug) + if err != nil { + return err + } + + _, err = exec.ExecCmd("echo 3 | sudo tee "+sysDropCachesPath, Debug) + if err != nil { + return err + } + return nil +} diff --git a/tests/metrics/storage/fio-k8s/pkg/env/go.mod b/tests/metrics/storage/fio-k8s/pkg/env/go.mod new file mode 100644 index 000000000..4e59e60d9 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/env/go.mod @@ -0,0 +1,10 @@ +module github.com/kata-containers/kata-containers/tests/metrics/storage/fio-k8s/exec + +go 1.19 + +require ( + github.com/kata-containers/kata-containers/tests/metrics/exec v0.0.0-00010101000000-000000000000 // indirect + github.com/pkg/errors v0.9.1 // indirect +) + +replace github.com/kata-containers/kata-containers/tests/metrics/exec => ../exec diff --git a/tests/metrics/storage/fio-k8s/pkg/env/go.sum b/tests/metrics/storage/fio-k8s/pkg/env/go.sum new file mode 100644 index 000000000..7c401c3f5 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/env/go.sum @@ -0,0 +1,2 @@ +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/tests/metrics/storage/fio-k8s/pkg/exec/Exec.go b/tests/metrics/storage/fio-k8s/pkg/exec/Exec.go new file mode 100644 index 000000000..6e409f427 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/exec/Exec.go @@ -0,0 +1,67 @@ +// Copyright (c) 2021-2023 Intel Corporation +// +// SPDX-License-Identifier: 
Apache-2.0 +package exec + +import ( + "bytes" + "io" + "os" + "os/exec" + + "github.com/pkg/errors" +) + +// logger interface for pkg +var log logger + +type logger interface { + Infof(string, ...interface{}) + Debugf(string, ...interface{}) + Errorf(string, ...interface{}) +} + +func SetLogger(l logger) { + log = l +} + +// Exec a command +// err != nil if command fails to execute +// output is a string with a combined stdout and stderr +func ExecCmd(c string, showInStdout bool) (stdout string, err error) { + if c == "" { + return "", errors.New("command is empty") + } + + log.Debugf("Exec: %s", c) + cmd := exec.Command("bash", "-o", "pipefail", "-c", c) + var stdBuffer bytes.Buffer + var writers []io.Writer + writers = append(writers, &stdBuffer) + if showInStdout { + writers = append(writers, os.Stdout) + } + mw := io.MultiWriter(writers...) + + cmd.Stdout = mw + cmd.Stderr = mw + + err = cmd.Run() + output := stdBuffer.String() + + return stdBuffer.String(), errors.Wrap(err, output) +} + +// Exec a command +// Send output to Stdout and Stderr +func ExecStdout(c string) error { + if c == "" { + return errors.New("command is empty") + } + + log.Debugf("Exec: %s", c) + cmd := exec.Command("bash", "-o", "pipefail", "-c", c) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} diff --git a/tests/metrics/storage/fio-k8s/pkg/exec/go.mod b/tests/metrics/storage/fio-k8s/pkg/exec/go.mod new file mode 100644 index 000000000..c74fcbeb8 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/exec/go.mod @@ -0,0 +1,5 @@ +module github.com/kata-containers/kata-containers/tests/metrics/storage/fio-k8s/exec + +go 1.19 + +require github.com/pkg/errors v0.9.1 diff --git a/tests/metrics/storage/fio-k8s/pkg/exec/go.sum b/tests/metrics/storage/fio-k8s/pkg/exec/go.sum new file mode 100644 index 000000000..7c401c3f5 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/exec/go.sum @@ -0,0 +1,2 @@ +github.com/pkg/errors v0.9.1 
h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/tests/metrics/storage/fio-k8s/pkg/k8s/Makefile b/tests/metrics/storage/fio-k8s/pkg/k8s/Makefile new file mode 100644 index 000000000..f5bee6d16 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/k8s/Makefile @@ -0,0 +1,8 @@ +# +# Copyright (c) 2021-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +gomod: + GO111MODULE=on go mod edit -replace=github.com/kata-containers/kata-containers/tests/metrics/exec=../exec + GO111MODULE=on go mod tidy diff --git a/tests/metrics/storage/fio-k8s/pkg/k8s/exec.go b/tests/metrics/storage/fio-k8s/pkg/k8s/exec.go new file mode 100644 index 000000000..9be25f618 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/k8s/exec.go @@ -0,0 +1,34 @@ +// Copyright (c) 2021-2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +package k8s + +import ( + "fmt" + + exec "github.com/kata-containers/kata-containers/tests/metrics/exec" +) + +type execOpt struct { + showInStdOut bool +} + +type ExecOption func(e *execOpt) + +func ExecOptShowStdOut() ExecOption { + return func(e *execOpt) { + e.showInStdOut = true + } + +} + +func (p *Pod) Exec(cmd string, opts ...ExecOption) (output string, err error) { + log.Debugf("Exec %q in %s", cmd, p.YamlPath) + o := &execOpt{showInStdOut: false} + for _, opt := range opts { + opt(o) + + } + execCmd := fmt.Sprintf("kubectl exec -f %s -- /bin/bash -c %q", p.YamlPath, cmd) + return exec.ExecCmd(execCmd, Debug || o.showInStdOut) +} diff --git a/tests/metrics/storage/fio-k8s/pkg/k8s/go.mod b/tests/metrics/storage/fio-k8s/pkg/k8s/go.mod new file mode 100644 index 000000000..6a349002a --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/k8s/go.mod @@ -0,0 +1,10 @@ +module github.com/kata-containers/kata-containers/tests/metrics/k8s + +go 1.19 + +replace github.com/kata-containers/kata-containers/tests/metrics/exec => ../exec + +require ( 
+ github.com/kata-containers/kata-containers/tests/metrics/exec v0.0.0-00010101000000-000000000000 // indirect + github.com/pkg/errors v0.9.1 // indirect +) diff --git a/tests/metrics/storage/fio-k8s/pkg/k8s/go.sum b/tests/metrics/storage/fio-k8s/pkg/k8s/go.sum new file mode 100644 index 000000000..7c401c3f5 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/k8s/go.sum @@ -0,0 +1,2 @@ +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/tests/metrics/storage/fio-k8s/pkg/k8s/k8s.go b/tests/metrics/storage/fio-k8s/pkg/k8s/k8s.go new file mode 100644 index 000000000..2fef788cc --- /dev/null +++ b/tests/metrics/storage/fio-k8s/pkg/k8s/k8s.go @@ -0,0 +1,68 @@ +// Copyright (c) 2021-2023 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +package k8s + +import ( + "fmt" + + exec "github.com/kata-containers/kata-containers/tests/metrics/exec" + "github.com/pkg/errors" +) + +// logger interface for pkg +var log logger +var Debug bool = false + +type logger interface { + Infof(string, ...interface{}) + Debugf(string, ...interface{}) + Errorf(string, ...interface{}) +} + +func SetLogger(l logger) { + log = l +} + +type Pod struct { + YamlPath string +} + +func (p *Pod) waitForReady() (err error) { + log.Debugf("Wait for pod %s", p.YamlPath) + _, err = exec.ExecCmd("kubectl wait --for=condition=ready -f "+p.YamlPath, Debug) + return err +} + +func (p *Pod) Run() (err error) { + + log.Debugf("Creating K8s Pod %s", p.YamlPath) + _, err = exec.ExecCmd("kubectl apply -f "+p.YamlPath, Debug) + if err != nil { + return errors.Wrapf(err, "Failed to run pod %s", p.YamlPath) + } + + err = p.waitForReady() + if err != nil { + return errors.Wrapf(err, "Failed to wait for pod %s", p.YamlPath) + } + return err +} + +func (p *Pod) Delete() (err error) { + log.Debugf("Delete pod %s", p.YamlPath) + _, err = exec.ExecCmd("kubectl delete --ignore-not-found -f 
"+p.YamlPath, Debug) + return errors.Wrapf(err, "Failed to delete pod %s", p.YamlPath) +} + +func (p *Pod) CopyFromHost(src, dst string) (err error) { + podName, err := exec.ExecCmd("kubectl get -f "+p.YamlPath+" -o jsonpath={.metadata.name}", Debug) + if err != nil { + return nil + } + + log.Debugf("Copy from host %q->%q in pod %s", src, dst, p.YamlPath) + execCmd := fmt.Sprintf("kubectl cp %s %s:%s", src, podName, dst) + _, err = exec.ExecCmd(execCmd, Debug) + return err +} diff --git a/tests/metrics/storage/fio-k8s/scripts/Makefile b/tests/metrics/storage/fio-k8s/scripts/Makefile new file mode 100644 index 000000000..a33a5015c --- /dev/null +++ b/tests/metrics/storage/fio-k8s/scripts/Makefile @@ -0,0 +1,10 @@ +# +# Copyright (c) 2021-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +MKFILE_DIR := $(dir $(MKFILE_PATH)) +run: + $(MKFILE_DIR)/compare-virtiofsd-dax.sh + "$(MKFILE_DIR)/report/gen-html-fio-report.sh" "./results" diff --git a/tests/metrics/storage/fio-k8s/scripts/compare-virtiofsd-dax.sh b/tests/metrics/storage/fio-k8s/scripts/compare-virtiofsd-dax.sh new file mode 100755 index 000000000..248e37776 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/scripts/compare-virtiofsd-dax.sh @@ -0,0 +1,151 @@ +#!/bin/bash +#Copyright (c) 2021-2023 Intel Corporation +# +#SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +script_dir=$(dirname "$(readlink -f "$0")") + +runtime_path="/opt/kata/bin/kata-runtime" +kata_config_path="/opt/kata/share/defaults/kata-containers/configuration.toml" + +results_dir="$(realpath ./)/results" + +KATA_RUNTIME="${KATA_RUNTIME_CLASS:-kata}" +BAREMETAL_RUNTIME="runc" +RUNTIME_CLASS="" + +FIO_SIZE="${FIO_SIZE:-500M}" +FIO_BLOCKSIZE="${FIO_BLOCKSIZE:-4K}" +VIRTIOFS_DAX_SIZE=${VIRTIOFS_DAX_SIZE:-600M} + +# set the base case for virtiofsd +set_base_virtiofs_config() { + # Running kata-qemu-virtiofs + # 
Defaults for virtiofs + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_cache '"auto"' + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_cache_size ${VIRTIOFS_DAX_SIZE} +} + +## helper function: get name of current bash function +fn_name() { + echo "${FUNCNAME[1]}" +} + +# directory where results are stored +get_results_dir() { + local test_name + local test_result_dir + test_name="${1}" + test_result_dir="${results_dir}/${test_name}" + mkdir -p "${test_result_dir}" + echo "${test_result_dir}" +} + +# Collect kata env +# save kata config toml +# save output from kata-env +kata_env() { + local suffix=${1} + local config_path + local kata_env_bk + local kata_config_bk + kata_env_bk="$(get_results_dir "${suffix}")/kata-env.toml" + kata_config_bk="$(get_results_dir "${suffix}")/kata-config.toml" + + ${runtime_path} kata-env >"${kata_env_bk}" + config_path="$(${runtime_path} kata-env --json | jq .Runtime.Config.Path -r)" + cp "${config_path}" "${kata_config_bk}" +} + +# Collect the command used by virtiofsd +collect_qemu_virtiofs_cmd() { + local rdir + local test_name + test_name="${1}" + + rdir=$(get_results_dir "${test_name}") + # TODO +} + +# Run metrics runner +run_workload() { + local test_name + local test_result_file + local test_result_dir + + test_name="${1}" + + test_result_dir="$(get_results_dir "${test_name}")" + test_result_file="${test_result_dir}/test-out.txt" + + echo "Running for kata config: ${test_name}" + collect_qemu_virtiofs_cmd "$test_name" + + fio_runner_dir="${script_dir}/../../cmd/fiotest/" + fio_jobs="${script_dir}/../../configs/test-config/" + make -C "${fio_runner_dir}" build + pwd + set -x + "${fio_runner_dir}fio-k8s" \ + --debug \ + --fio.size "${FIO_SIZE}" \ + --fio.block-size "${FIO_BLOCKSIZE}" \ + --container-runtime "${RUNTIME_CLASS}" \ + --test-name "${test_name}" \ + --output-dir "$(dirname ${test_result_dir})" \ + "${fio_jobs}" | + tee \ + "${test_result_file}" + set +x +} + 
+pool_0_cache_auto_dax() { + local suffix="$(fn_name)" + + set_base_virtiofs_config + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_extra_args '["--thread-pool-size=0","-o","no_posix_lock","-o","xattr"]' + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_cache '"auto"' + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_cache_size 1024 + kata_env "${suffix}" + RUNTIME_CLASS="${KATA_RUNTIME}" + run_workload "${suffix}" +} + +pool_0_cache_auto_no_dax() { + local suffix="$(fn_name)" + + set_base_virtiofs_config + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_extra_args '["--thread-pool-size=0","-o","no_posix_lock","-o","xattr"]' + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_cache '"auto"' + sudo crudini --set --existing "$kata_config_path" hypervisor.qemu virtio_fs_cache_size 0 + + kata_env "${suffix}" + + RUNTIME_CLASS="${KATA_RUNTIME}" + run_workload "${suffix}" + echo "done" +} + +k8s_baremetal() { + local suffix="$(fn_name)" + + RUNTIME_CLASS="${BAREMETAL_RUNTIME}" + run_workload "${suffix}" +} + +main() { + + mkdir -p "${results_dir}" + + k8s_baremetal + pool_0_cache_auto_dax + pool_0_cache_auto_no_dax +} + +main $* diff --git a/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/fio.ipynb b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/fio.ipynb new file mode 100644 index 000000000..d2e8da202 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/fio.ipynb @@ -0,0 +1,51 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "tWacOPbMYPtc" + }, + "source": [ + "# FIO comparision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jXtTs6yldl_y" + }, + "outputs": [], + "source": [ + "import fio\n", + "fio.generate_report()" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "fio.ipynb", 
+ "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/fio.py b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/fio.py new file mode 100644 index 000000000..313f63313 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/fio.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021-2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +import pandas as pd +import os +import re +import io +import glob +from IPython.display import display, Markdown +import matplotlib.pyplot as plt + +#Compare the tests results group by fio job. +#Input: +# df: dataset from `import_data()` +# metric: string of metrics provided in `df` +def compare_tests_group_by_fio_job(df, metric): + test_names, metric_df = group_metrics_group_by_testname(df, metric) + show_df(metric_df) + plot_df(metric_df,test_names) + +# Given a metric return results per test group by fio job. +# input: +# df: dataset from `import_data()` +# metric: string with the name of the metric to filter. +# output: +# dataset with fomat: +# 'workload' , 'name[0]' , ... 
, 'name[n]' +# +def group_metrics_group_by_testname(df, metric): + #name of each tests from results + names = set() + # Rows of new data set + rows = [] + # map: + # keys: name of fio job + # value: dict[k]:v where k: name of a test, v: value of test for metric` + workload = {} + + for k, row in df.iterrows(): + # name of a fio job + w = row['WORKLOAD'] + # name of tests + tname = row['NAME'] + names.add(tname) + # given a fio job name get dict of values + # if not previous values init empty dict + dict_values = workload.get(w, {}) + # For a given metric, add it into as value of dict_values[testname]=val + #e.g + # dict_values["test-name"] = row["IOPS"] + dict_values[tname] = row[metric] + workload[w] = dict_values + + names = list(names) + cols = ['WORKLOAD'] + list(names) + rdf = pd.DataFrame(workload,columns = cols) + + for k in workload: + d = workload[k] + + if not d[names[0]] == 0: + d["WORKLOAD"] = k; + rdf = rdf.append(d,ignore_index=True) + rdf = rdf.dropna() + return names, rdf + +def plot_df(df, names,sort_key=""): + if sort_key != "": + df.sort_values(sort_key, ascending=False) + df.plot(kind='bar',x="WORKLOAD",y=names, figsize=(30, 10)) + plt.show() + + +def import_data(): + frames = [] + for f in glob.glob('./results/*/results.csv'): + print("reading:" + f) + df = pd.read_csv(f) + frames.append(df) + return pd.concat(frames) + +def show_df(df): + pd.set_option('display.max_rows', df.shape[0]+1) + print(df) + +def print_md(s): + display(Markdown(s)) + +#notebook entrypoint +def generate_report(): + #Load the all test results in a single dataset + df_results = import_data() + print_md("Show all data from results") + show_df(df_results) + print_md("### Compare the tests results group by fio job. The metric used to compare is write bandwidth") + compare_tests_group_by_fio_job(df_results, 'bw_w') + print_md("### Compare the tests results group by fio job. 
The metric used to compare is read bandwidth") + compare_tests_group_by_fio_job(df_results, 'bw_r') + print_md("### Compare the tests results group by fio job. The metric used to compare is write IOPS(Input/Output Operations Per Second)") + compare_tests_group_by_fio_job(df_results, 'IOPS_w') + print_md("### Compare the tests results group by fio job. The metric used to compare is read IOPS(Input/Output Operations Per Second)") + compare_tests_group_by_fio_job(df_results, 'IOPS_r') diff --git a/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/gen-html-fio-report.sh b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/gen-html-fio-report.sh new file mode 100644 index 000000000..8061c059d --- /dev/null +++ b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/gen-html-fio-report.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#Copyright (c) 2021-2023 Intel Corporation +# +#SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +script_dir=$(dirname "$(readlink -f "$0")") + +results_dir=${1:-} + +usage(){ + echo "$0 " +} + +if [ "${results_dir}" == "" ];then + echo "missing results directory" + usage + exit 1 +fi + +if [ ! 
-d "${results_dir}" ];then + echo "${results_dir} is not a directory" + usage + exit 1 +fi + +results_dir=$(realpath "${results_dir}") + +generate_report(){ + sudo chown "${USER}:${USER}" -R ${results_dir} + sudo docker run --rm -e JUPYTER_ENABLE_LAB=yes \ + -v "${script_dir}:/home/jovyan" \ + -v "${results_dir}:/home/jovyan/results" \ + --user $(id -u):$(id -g) \ + jupyter/scipy-notebook:399cbb986c6b \ + bash -e -c ' + cd results; + jupyter nbconvert --execute /home/jovyan/fio.ipynb --to html; + cp /home/jovyan/fio.html /home/jovyan/results; + ' +} + +generate_report diff --git a/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/run-docker-jupyter-server.sh b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/run-docker-jupyter-server.sh new file mode 100644 index 000000000..f386da4e2 --- /dev/null +++ b/tests/metrics/storage/fio-k8s/scripts/dax-compare-test/report/run-docker-jupyter-server.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#Copyright (c) 2021-2023 Intel Corporation +# +#SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +script_dir=$(dirname "$(readlink -f "$0")") +NOTEBOOK_PORT="8888" + +results_dir=${1:-} + +usage(){ + echo "$0 " +} + +if [ "${results_dir}" == "" ];then + echo "missing results directory" + usage + exit 1 +fi + +if [ ! 
-d "${results_dir}" ];then + echo "${results_dir} is not a directory" + usage + exit 1 +fi + +results_dir=$(realpath "${results_dir}") + +sudo -E docker run --rm -p "${NOTEBOOK_PORT}:${NOTEBOOK_PORT}" -e JUPYTER_ENABLE_LAB=yes \ + -v "${script_dir}:/home/jovyan" \ + -v "${results_dir}:/home/jovyan/results" \ + jupyter/scipy-notebook:399cbb986c6b \ + start.sh jupyter lab --LabApp.token='' diff --git a/tools/packaging/guest-image/build_image.sh b/tools/packaging/guest-image/build_image.sh index c98ec51c3..0b1312ff3 100755 --- a/tools/packaging/guest-image/build_image.sh +++ b/tools/packaging/guest-image/build_image.sh @@ -21,24 +21,15 @@ readonly osbuilder_dir="$(cd "${repo_root_dir}/tools/osbuilder" && pwd)" export GOPATH=${GOPATH:-${HOME}/go} -final_image_name="kata-containers" -final_initrd_name="kata-containers-initrd" -image_initrd_extension=".img" - arch_target="$(uname -m)" -final_image_name="kata-containers" -final_initrd_name="kata-containers-initrd" +final_artifact_name="kata-containers" image_initrd_extension=".img" build_initrd() { info "Build initrd" - info "initrd os: $initrd_distro" - info "initrd os version: $initrd_os_version" - local rootfs_build_dest="${builddir}/initrd-image" - export DISTRO="$initrd_distro" - export OS_VERSION="${initrd_os_version}" - export USE_DOCKER=1 - export AGENT_INIT="yes" + info "initrd os: $os_name" + info "initrd os version: $os_version" + # ROOTFS_BUILD_DEST is a Make variable # SNP will also use the SEV guest module if [[ "${AA_KBC:-}" == "offline_sev_kbc" || "${AA_KBC:-}" == "online_sev_kbc" ]]; then @@ -46,40 +37,49 @@ build_initrd() { kernel_version="$(get_from_kata_deps "assets.kernel.sev.version")" kernel_version=${kernel_version#v} module_dir="${repo_root_dir}/tools/packaging/kata-deploy/local-build/build/kernel-sev/builddir/kata-linux-${kernel_version}-${config_version}/lib/modules/${kernel_version}" - sudo -E PATH="$PATH" make rootfs ROOTFS_BUILD_DEST="${rootfs_build_dest}" KERNEL_MODULES_DIR="${module_dir}" + 
sudo -E PATH="$PATH" make rootfs AGENT_INIT=yes USE_DOCKER=1 ROOTFS_BUILD_DEST="${builddir}/initrd-image" KERNEL_MODULES_DIR="${module_dir}" else - sudo -E PATH="$PATH" make rootfs ROOTFS_BUILD_DEST="${rootfs_build_dest}" + sudo -E PATH="$PATH" make rootfs AGENT_INIT=yes USE_DOCKER=1 ROOTFS_BUILD_DEST="${builddir}/intrd-image" fi if [ -n "${INCLUDE_ROOTFS:-}" ]; then - sudo cp -RL --preserve=mode "${INCLUDE_ROOTFS}/." "${rootfs_build_dest}/${initrd_distro}_rootfs/" + sudo cp -RL --preserve=mode "${INCLUDE_ROOTFS}/." "${builddir}/initrd-image/${initrd_distro}_rootfs/" fi - sudo -E PATH="$PATH" make initrd ROOTFS_BUILD_DEST="${rootfs_build_dest}" - mv "kata-containers-initrd.img" "${install_dir}/${initrd_name}" + + sudo -E PATH="$PATH" make initrd \ + DISTRO="$os_name" \ + DEBUG="${DEBUG:-}" \ + OS_VERSION="${os_version}" \ + ROOTFS_BUILD_DEST="${builddir}/initrd-image" \ + USE_DOCKER=1 \ + AGENT_INIT="yes" + mv "kata-containers-initrd.img" "${install_dir}/${artifact_name}" ( cd "${install_dir}" - ln -sf "${initrd_name}" "${final_initrd_name}${image_initrd_extension}" + ln -sf "${artifact_name}" "${final_artifact_name}${image_initrd_extension}" ) } build_image() { + set -x info "Build image" - info "image os: $img_distro" - info "image os version: $img_os_version" + info "image os: $os_name" + info "image os version: $os_version" sudo -E PATH="${PATH}" make image \ - DISTRO="${img_distro}" \ + DISTRO="${os_name}" \ DEBUG="${DEBUG:-}" \ USE_DOCKER="1" \ - IMG_OS_VERSION="${img_os_version}" \ + IMG_OS_VERSION="${os_version}" \ ROOTFS_BUILD_DEST="${builddir}/rootfs-image" - mv -f "kata-containers.img" "${install_dir}/${image_name}" + mv -f "kata-containers.img" "${install_dir}/${artifact_name}" if [ -e "root_hash.txt" ]; then - [ -z "${root_hash_suffix}" ] && root_hash_suffix=vanilla - mv "${repo_root_dir}/tools/osbuilder/root_hash.txt" "${repo_root_dir}/tools/osbuilder/root_hash_${root_hash_suffix}.txt" + root_hash_suffix=${image_initrd_suffix} + [ -z 
"${image_initrd_suffix}" ] && root_hash_suffix=vanilla + mv root_hash.txt root_hash_${root_hash_suffix}.txt fi ( cd "${install_dir}" - ln -sf "${image_name}" "${final_image_name}${image_initrd_extension}" + ln -sf "${artifact_name}" "${final_artifact_name}${image_initrd_extension}" ) } @@ -93,6 +93,8 @@ Usage: ${script_name} [options] Options: + --osname=${os_name} + --osversion=${os_version} --imagetype=${image_type} --prefix=${prefix} --destdir=${destdir} @@ -108,46 +110,25 @@ main() { prefix="/opt/kata" image_suffix="" image_initrd_suffix="" - root_hash_suffix="" builddir="${PWD}" while getopts "h-:" opt; do case "$opt" in -) case "${OPTARG}" in + osname=*) + os_name=${OPTARG#*=} + ;; + osversion=*) + os_version=${OPTARG#*=} + ;; imagetype=image) image_type=image - #image information - img_distro=$(get_from_kata_deps "assets.image.architecture.${arch_target}.name") - img_os_version=$(get_from_kata_deps "assets.image.architecture.${arch_target}.version") - image_name="kata-${img_distro}-${img_os_version}.${image_type}" ;; imagetype=initrd) image_type=initrd - #initrd information - initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.name") - initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.version") - initrd_name="kata-${initrd_distro}-${initrd_os_version}.${image_type}" ;; image_initrd_suffix=*) image_initrd_suffix=${OPTARG#*=} - if [ "${image_initrd_suffix}" == "sev" ]; then - initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.sev.name") - initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.sev.version") - initrd_name="kata-${initrd_distro}-${initrd_os_version}-${image_initrd_suffix}.${image_type}" - final_initrd_name="${final_initrd_name}-${image_initrd_suffix}" - elif [ "${image_initrd_suffix}" == "tdx" ]; then - img_distro=$(get_from_kata_deps "assets.image.architecture.${arch_target}.name") - img_os_version=$(get_from_kata_deps 
"assets.image.architecture.${arch_target}.version") - image_name="kata-${img_distro}-${img_os_version}-${image_initrd_suffix}.${image_type}" - final_image_name="${final_image_name}-${image_initrd_suffix}" - - initrd_distro=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.name") - initrd_os_version=$(get_from_kata_deps "assets.initrd.architecture.${arch_target}.version") - initrd_name="kata-${initrd_distro}-${initrd_os_version}-${image_initrd_suffix}.${image_type}" - final_initrd_name="${final_initrd_name}-${image_initrd_suffix}" - elif [ -n "${image_initrd_suffix}" ]; then - die "Invalid image_initrd_suffix ${image_initrd_suffix}" - fi ;; root_hash_suffix=*) root_hash_suffix=${OPTARG#*=} @@ -179,7 +160,16 @@ main() { echo "build ${image_type}" + if [ "${image_type}" = "initrd" ]; then + final_artifact_name+="-initrd" + fi + if [ -n "${image_initrd_suffix}" ]; then + artifact_name="kata-${os_name}-${os_version}-${image_initrd_suffix}.${image_type}" + final_artifact_name+="-${image_initrd_suffix}" + else + artifact_name="kata-${os_name}-${os_version}.${image_type}" + fi install_dir="${destdir}/${prefix}/share/kata-containers/" readonly install_dir diff --git a/tools/packaging/kata-debug/Dockerfile b/tools/packaging/kata-debug/Dockerfile new file mode 100644 index 000000000..202fd03d6 --- /dev/null +++ b/tools/packaging/kata-debug/Dockerfile @@ -0,0 +1,16 @@ +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +FROM ubuntu:22.04 + +COPY debug.sh /usr/bin/debug.sh + +RUN \ +apt-get update && \ +apt-get install -y --no-install-recommends tree && \ +apt-get clean && \ +rm -rf /var/lib/apt/lists/ + +CMD ["/usr/bin/debug.sh"] diff --git a/tools/packaging/kata-debug/README.md b/tools/packaging/kata-debug/README.md new file mode 100644 index 000000000..7bc625754 --- /dev/null +++ b/tools/packaging/kata-debug/README.md @@ -0,0 +1,28 @@ +# kata-debug + +`kata-debug` is a tool that is used as part of the Kata Containers CI to 
gather +information from the node, in order to help debugging issues with Kata +Containers. + +As one can imagine, this can be expanded and used outside of the CI context, +and any contribution back to the script is very much welcome. + +The resulting container is stored at the [Kata Containers `quay.io` +space](https://quay.io/repository/kata-containers/kata-debug) and can +be used as shown below: +```sh +kubectl debug $NODE_NAME -it --image=quay.io/kata-containers/kata-debug:latest +``` + +## Building and publishing +The project can be built and publish by calling the following command from the +Kata Containers top directory: +```sh +make build-and-publish-kata-debug +``` + +Users can specify the following environment variables to the build: +* `KATA_DEBUG_REGISTRY` - The container registry to be used + default: `quay.io/kata-containers/kata-debug` +- `KATA_DEBUG_TAG` - A tag to the be used for the image + default: `$(git rev-parse HEAD)-$(uname -a)` diff --git a/tools/packaging/kata-debug/debug.sh b/tools/packaging/kata-debug/debug.sh new file mode 100755 index 000000000..9cc766ec1 --- /dev/null +++ b/tools/packaging/kata-debug/debug.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright (c) 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +echo "Let's gather Kata Containers debug information" +echo "" +echo "::group::Check Kata Containers logs" +chroot /host /bin/bash -c "sudo journalctl -xe -t kata | tee" +echo "::endgroup::" +echo "" +echo "::group::Checking the loaded kernel modules" +chroot /host /bin/bash -c "sudo lsmod" +echo "::endgroup::" +echo "" +echo "::group::Check Kata Containers deployed binaries" +tree /host/opt/kata /host/usr/local/bin +echo "::endgroup::" +echo "" +echo "::group:: Check node's dmesg" +chroot /host /bin/bash -c "sudo dmesg" +echo "::endgroup::" diff --git a/tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh b/tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh new file mode 100755 
index 000000000..9438c9368 --- /dev/null +++ b/tools/packaging/kata-debug/kata-debug-build-and-upload-payload.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# +# Copyright 2023 Intel +# +# SPDX-License-Identifier: Apache-2.0 +# + +[ -z "${DEBUG}" ] || set -x +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +KATA_DEBUG_DIR="`dirname ${0}`" + +REGISTRY="${1:-"quay.io/kata-containers/kata-debug"}" +TAG="${2:-}" + +arch=$(uname -m) +[ "$arch" = "x86_64" ] && arch="amd64" +IMAGE_TAG="${REGISTRY}:$(git rev-parse HEAD)-${arch}" + +pushd ${KATA_DEBUG_DIR} + +echo "Building the image" +docker build --tag ${IMAGE_TAG} . + +echo "Pushing the image to the registry" +docker push ${IMAGE_TAG} + +if [ -n "${TAG}" ]; then + ADDITIONAL_TAG="${REGISTRY}:${TAG}" + + echo "Building the ${ADDITIONAL_TAG} image" + + docker build --tag ${ADDITIONAL_TAG} . + + echo "Pushing the image ${ADDITIONAL_TAG} to the registry" + docker push ${ADDITIONAL_TAG} +fi + +popd diff --git a/tools/packaging/kata-deploy/Dockerfile b/tools/packaging/kata-deploy/Dockerfile index f8b9edf4c..8e7f6e2ac 100644 --- a/tools/packaging/kata-deploy/Dockerfile +++ b/tools/packaging/kata-deploy/Dockerfile @@ -28,3 +28,4 @@ tar xvf ${WORKDIR}/${KATA_ARTIFACTS} -C ${DESTINATION} && \ rm -f ${WORKDIR}/${KATA_ARTIFACTS} COPY scripts ${DESTINATION}/scripts +COPY runtimeclasses ${DESTINATION}/runtimeclasses diff --git a/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml b/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml index bd177834f..9a9306099 100644 --- a/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml +++ b/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml @@ -13,7 +13,7 @@ spec: labels: name: kubelet-kata-cleanup spec: - serviceAccountName: kata-label-node + serviceAccountName: kata-deploy-sa nodeSelector: katacontainers.io/kata-runtime: cleanup containers: @@ -26,6 +26,16 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: DEBUG + 
value: "false" + - name: SHIMS + value: "clh dragonball fc qemu-nvidia-gpu qemu-sev qemu-snp qemu-tdx qemu" + - name: DEFAULT_SHIM + value: "qemu" + - name: CREATE_RUNTIMECLASSES + value: "false" + - name: CREATE_DEFAULT_RUNTIMECLASS + value: "false" securityContext: privileged: true volumeMounts: diff --git a/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml b/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml index b850d604f..300cd5f5d 100644 --- a/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml +++ b/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml @@ -13,7 +13,7 @@ spec: labels: name: kata-deploy spec: - serviceAccountName: kata-label-node + serviceAccountName: kata-deploy-sa containers: - name: kube-kata image: quay.io/kata-containers/kata-deploy-cc:v0 @@ -28,8 +28,16 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName - - name: CONFIGURE_CC - value: "yes" + - name: DEBUG + value: "false" + - name: SHIMS + value: "clh dragonball fc qemu qemu-nvidia-gpu qemu-sev qemu-snp qemu-tdx" + - name: DEFAULT_SHIM + value: "qemu" + - name: CREATE_RUNTIMECLASSES + value: "false" + - name: CREATE_DEFAULT_RUNTIMECLASS + value: "false" securityContext: privileged: true volumeMounts: diff --git a/tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml b/tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml index 408b5be90..3bde9f0a8 100644 --- a/tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml +++ b/tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml @@ -2,28 +2,30 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: kata-label-node + name: kata-deploy-sa namespace: kube-system --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: node-labeler + name: kata-deploy-role rules: - apiGroups: [""] resources: ["nodes"] verbs: ["get", "patch"] +- apiGroups: ["node.k8s.io"] + resources: ["runtimeclasses"] + verbs: ["create", "delete", "get", "list", "patch", "update", "watch"] --- 
kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: - name: kata-label-node-rb + name: kata-deploy-rb roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: node-labeler + name: kata-deploy-role subjects: - kind: ServiceAccount - name: kata-label-node + name: kata-deploy-sa namespace: kube-system - diff --git a/tools/packaging/kata-deploy/local-build/Makefile b/tools/packaging/kata-deploy/local-build/Makefile index 0f2c68fb8..91804d989 100644 --- a/tools/packaging/kata-deploy/local-build/Makefile +++ b/tools/packaging/kata-deploy/local-build/Makefile @@ -18,9 +18,9 @@ EXTRA_TARBALL=\ ovmf-tarball \ qemu-snp-experimental-tarball \ qemu-tdx-experimental-tarball \ - cc-tdx-td-shim-tarball \ - cc-sev-rootfs-initrd-tarball \ - cc-tdx-rootfs-image-tarball + rootfs-initrd-sev-tarball \ + rootfs-image-tdx-tarball \ + cc-tdx-td-shim-tarball endif define BUILD @@ -78,9 +78,6 @@ firecracker-tarball: kernel-dragonball-experimental-tarball: ${MAKE} $@-build -kernel-experimental-tarball: - ${MAKE} $@-build - kernel-nvidia-gpu-tarball: ${MAKE} $@-build @@ -142,7 +139,7 @@ virtiofsd-tarball: ${MAKE} $@-build merge-builds: - $(MK_DIR)/kata-deploy-merge-builds.sh build + $(MK_DIR)/kata-deploy-merge-builds.sh build "$(MK_DIR)/../../../../versions.yaml" install-tarball: tar -xf ./kata-static.tar.xz -C / @@ -168,15 +165,9 @@ cc-rootfs-image-tarball: cc-rootfs-initrd-tarball: ${MAKE} $@-build -cc-sev-rootfs-initrd-tarball: kernel-sev-tarball - ${MAKE} $@-build - cc-se-image-tarball: kernel-tarball cc-rootfs-initrd-tarball ${MAKE} $@-build -cc-tdx-rootfs-image-tarball: - ${MAKE} $@-build - cc-tdx-td-shim-tarball: ${MAKE} $@-build diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh index 549c33eea..db5ed3602 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-binaries.sh @@ 
-223,68 +223,12 @@ install_cached_cc_shim_v2() { #Install cc capable guest image install_cc_image() { - export AA_KBC="${1:-offline_fs_kbc}" - image_type="${2:-image}" - image_initrd_suffix="${3:-""}" - root_hash_suffix="${4:-""}" - tee="${5:-""}" + export AA_KBC="${AA_KBC:-offline_fs_kbc}" export KATA_BUILD_CC=yes - export MEASURED_ROOTFS=${MEASURED_ROOTFS} + export MEASURED_ROOTFS=yes + variant="${1:-}" - local jenkins="${jenkins_url}/job/kata-containers-2.0-rootfs-image-cc-$(uname -m)/${cached_artifacts_path}" - local component="rootfs-image" - local root_hash_vanilla="root_hash_vanilla.txt" - local root_hash_tdx="" - local initramfs_last_commit="" - if [ -n "${tee}" ]; then - if [ "${tee}" == "tdx" ]; then - jenkins="${jenkins_url}/job/kata-containers-2.0-rootfs-image-${tee}-cc-$(uname -m)/${cached_artifacts_path}" - component="${tee}-rootfs-image" - root_hash_vanilla="" - root_hash_tdx="root_hash_${tee}.txt" - fi - if [ "${tee}" == "sev" ]; then - jenkins="${jenkins_url}/job/kata-containers-2.0-rootfs-initrd-${tee}-cc-$(uname -m)/${cached_artifacts_path}" - component="${tee}-rootfs-initrd" - root_hash_vanilla="" - initramfs_last_commit="$(get_initramfs_image_name)" - fi - fi - - local osbuilder_last_commit="$(echo $(get_last_modification "${repo_root_dir}/tools/osbuilder") | sed s/-dirty//)" - local guest_image_last_commit="$(get_last_modification "${repo_root_dir}/tools/packaging/guest-image")" - local agent_last_commit="$(get_last_modification "${repo_root_dir}/src/agent")" - local libs_last_commit="$(get_last_modification "${repo_root_dir}/src/libs")" - local attestation_agent_version="$(get_from_kata_deps "externals.attestation-agent.version")" - local gperf_version="$(get_from_kata_deps "externals.gperf.version")" - local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")" - local pause_version="$(get_from_kata_deps "externals.pause.version")" - local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")" - - 
install_cached_tarball_component \ - "${component}" \ - "${jenkins}" \ - "${osbuilder_last_commit}-${guest_image_last_commit}-${initramfs_last_commit}-${agent_last_commit}-${libs_last_commit}-${attestation_agent_version}-${gperf_version}-${libseccomp_version}-${pause_version}-${rust_version}-${image_type}-${AA_KBC}" \ - "" \ - "${final_tarball_name}" \ - "${final_tarball_path}" \ - "${root_hash_vanilla}" \ - "${root_hash_tdx}" \ - && return 0 - - info "Create CC image configured with AA_KBC=${AA_KBC}" - "${rootfs_builder}" \ - --imagetype="${image_type}" \ - --prefix="${prefix}" \ - --destdir="${destdir}" \ - --image_initrd_suffix="${image_initrd_suffix}" \ - --root_hash_suffix="${root_hash_suffix}" -} - -install_cc_sev_image() { - AA_KBC="online_sev_kbc" - image_type="initrd" - install_cc_image "${AA_KBC}" "${image_type}" "sev" "" "sev" + install_image "${variant}" } install_cc_se_image() { @@ -292,12 +236,11 @@ install_cc_se_image() { "${se_image_builder}" --destdir="${destdir}" } -install_cc_tdx_image() { - AA_KBC="cc_kbc_tdx" - image_type="image" - image_suffix="tdx" - root_hash_suffix="tdx" - install_cc_image "${AA_KBC}" "${image_type}" "${image_suffix}" "${root_hash_suffix}" "tdx" +install_image_tdx() { + export AA_KBC="cc_kbc_tdx" + + info "Install CC image configured with AA_KBC=${AA_KBC}" + install_cc_image "tdx" } #Install all components that are not assets @@ -358,9 +301,14 @@ install_cc_tdx_td_shim() { #Install guest image install_image() { - local image_type="${1:-"image"}" - local initrd_suffix="${2:-""}" - local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${image_type}-$(uname -m)/${cached_artifacts_path}" + local variant="${1:-}" + + image_type="image" + if [ -n "${variant}" ]; then + image_type+="-${variant}" + fi + + local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${image_type}-${ARCH}/${cached_artifacts_path}" local component="rootfs-${image_type}" local osbuilder_last_commit="$(get_last_modification 
"${repo_root_dir}/tools/osbuilder")" @@ -370,30 +318,58 @@ install_image() { local gperf_version="$(get_from_kata_deps "externals.gperf.version")" local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")" local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")" + local attestation_agent_version="$(get_from_kata_deps "externals.attestation-agent.version")" + local pause_version="$(get_from_kata_deps "externals.pause.version")" + local root_hash_vanilla="" + local root_hash_tdx="" + + local version_checker="${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${image_type}" + if [ -n "${variant}" ]; then + jenkins="${jenkins_url}/job/kata-containers-2.0-rootfs-image-${variant}-cc-$(uname -m)/${cached_artifacts_path}" + component="${variant}-rootfs-image" + root_hash_tdx="root_hash_${variant}.txt" + initramfs_last_commit="" + version_checker="${osbuilder_last_commit}-${guest_image_last_commit}-${initramfs_last_commit}-${agent_last_commit}-${libs_last_commit}-${attestation_agent_version}-${gperf_version}-${libseccomp_version}-${pause_version}-${rust_version}-${image_type}-${AA_KBC}" + fi + install_cached_tarball_component \ "${component}" \ "${jenkins}" \ - "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-image" \ + "${version_checker}" \ "" \ "${final_tarball_name}" \ "${final_tarball_path}" \ + "${root_hash_vanilla}" \ + "${root_hash_tdx}" \ && return 0 info "Create image" - "${rootfs_builder}" --imagetype=image --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${initrd_suffix}" -} -#Install guest image for tdx -install_image_tdx() { - install_image "image-tdx" "tdx" + if [ -n "${variant}" ]; then + os_name="$(get_from_kata_deps "assets.image.architecture.${ARCH}.${variant}.name")" + 
os_version="$(get_from_kata_deps "assets.image.architecture.${ARCH}.${variant}.version")" + else + os_name="$(get_from_kata_deps "assets.image.architecture.${ARCH}.name")" + os_version="$(get_from_kata_deps "assets.image.architecture.${ARCH}.version")" + fi + + "${rootfs_builder}" --osname="${os_name}" --osversion="${os_version}" --imagetype=image --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${variant}" } #Install guest initrd install_initrd() { - local initrd_type="${1:-"initrd"}" - local initrd_suffix="${2:-""}" - local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${initrd_type}-$(uname -m)/${cached_artifacts_path}" + local variant="${1:-}" + + initrd_type="initrd" + if [ -n "${variant}" ]; then + initrd_type+="-${variant}" + fi + + local jenkins="${jenkins_url}/job/kata-containers-main-rootfs-${initrd_type}-${ARCH}/${cached_artifacts_path}" + if [ -n "${variant}" ]; then + jenkins="${jenkins_url}/job/kata-containers-2.0-rootfs-initrd-${variant}-cc-$(uname -m)/${cached_artifacts_path}" + fi local component="rootfs-${initrd_type}" local osbuilder_last_commit="$(get_last_modification "${repo_root_dir}/tools/osbuilder")" @@ -403,23 +379,56 @@ install_initrd() { local gperf_version="$(get_from_kata_deps "externals.gperf.version")" local libseccomp_version="$(get_from_kata_deps "externals.libseccomp.version")" local rust_version="$(get_from_kata_deps "languages.rust.meta.newest-version")" + local attestation_agent_version="$(get_from_kata_deps "externals.attestation-agent.version")" + local pause_version="$(get_from_kata_deps "externals.pause.version")" + local root_hash_vanilla="" + local root_hash_tdx="" + + [[ "${ARCH}" == "aarch64" && "${CROSS_BUILD}" == "true" ]] && echo "warning: Don't cross build initrd for aarch64 as it's too slow" && exit 0 + + local 
version_checker="${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${initrd_type}" + if [ -n "${variant}" ]; then + initramfs_last_commit="$(get_initramfs_image_name)" + version_checker="${osbuilder_last_commit}-${guest_image_last_commit}-${initramfs_last_commit}-${agent_last_commit}-${libs_last_commit}-${attestation_agent_version}-${gperf_version}-${libseccomp_version}-${pause_version}-${rust_version}-${initrd_type}-${AA_KBC}" + fi install_cached_tarball_component \ "${component}" \ "${jenkins}" \ - "${osbuilder_last_commit}-${guest_image_last_commit}-${agent_last_commit}-${libs_last_commit}-${gperf_version}-${libseccomp_version}-${rust_version}-${initrd_type}" \ + "${version_checker}" \ "" \ "${final_tarball_name}" \ "${final_tarball_path}" \ + "${root_hash_vanilla}" \ + "${root_hash_tdx}" \ && return 0 info "Create initrd" - "${rootfs_builder}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${initrd_suffix}" + + if [ -n "${variant}" ]; then + os_name="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.${variant}.name")" + os_version="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.${variant}.version")" + else + os_name="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.name")" + os_version="$(get_from_kata_deps "assets.initrd.architecture.${ARCH}.version")" + fi + + "${rootfs_builder}" --osname="${os_name}" --osversion="${os_version}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" --image_initrd_suffix="${variant}" +} + +#Install Mariner guest initrd +install_initrd_mariner() { + install_initrd "mariner" } #Install guest initrd for sev install_initrd_sev() { - install_initrd "initrd-sev" "sev" + export AA_KBC="online_sev_kbc" + export KATA_BUILD_CC="yes" + export MEASURED_ROOTFS="no" + + info "Install CC initrd configured with AA_KBC=${AA_KBC}" + install_initrd "sev" } #Install kernel 
component helper @@ -437,7 +446,7 @@ install_cached_kernel_tarball_component() { install_cached_tarball_component \ "${kernel_name}" \ "${url}" \ - "${kernel_version}-${kernel_kata_config_version}" \ + "${kernel_version}-${kernel_kata_config_version}-$(get_last_modification $(dirname $kernel_builder))" \ "$(get_kernel_image_name)" \ "${final_tarball_name}" \ "${final_tarball_path}" \ @@ -451,7 +460,7 @@ install_cached_kernel_tarball_component() { install_cached_tarball_component \ "${kernel_name}" \ "${jenkins_url}/job/kata-containers-main-${kernel_name}-$(uname -m)/${cached_artifacts_path}" \ - "${kernel_version}-${kernel_kata_config_version}" \ + "${kernel_version}-${kernel_kata_config_version}-$(get_last_modification $(dirname $kernel_builder))" \ "$(get_kernel_image_name)" \ "kata-static-kernel-sev-modules.tar.xz" \ "${workdir}/kata-static-kernel-sev-modules.tar.xz" \ @@ -468,7 +477,7 @@ install_cached_kernel_tarball_component() { install_cc_initrd() { export AA_KBC="${AA_KBC:-offline_fs_kbc}" info "Create CC initrd configured with AA_KBC=${AA_KBC}" - "${rootfs_builder}" --imagetype=initrd --prefix="${prefix}" --destdir="${destdir}" + install_initrd } #Install kernel asset @@ -544,14 +553,6 @@ install_kernel_nvidia_gpu_tdx_experimental() { "-x tdx -g nvidia -u ${kernel_url} -H deb" } -#Install experimental kernel asset -install_kernel_experimental() { - install_kernel_helper \ - "assets.kernel-experimental.version" \ - "kernel-experimental" \ - "-f -b experimental" -} - #Install experimental TDX kernel asset install_kernel_tdx_experimental() { local kernel_url="$(get_from_kata_deps assets.kernel-tdx-experimental.url)" @@ -862,19 +863,14 @@ handle_build() { cc) install_cc_image install_cc_shimv2 - install_cc_sev_image ;; cc-rootfs-image) install_cc_image ;; cc-rootfs-initrd) install_cc_initrd ;; - cc-sev-rootfs-initrd) install_cc_sev_image ;; - cc-se-image) install_cc_se_image ;; - cc-tdx-rootfs-image) install_cc_tdx_image ;; - cc-shim-v2) install_cc_shimv2 ;; 
cc-tdx-td-shim) install_cc_tdx_td_shim ;; @@ -889,8 +885,6 @@ handle_build() { kernel-dragonball-experimental) install_kernel_dragonball_experimental ;; - kernel-experimental) install_kernel_experimental ;; - kernel-nvidia-gpu) install_kernel_nvidia_gpu ;; kernel-nvidia-gpu-snp) install_kernel_nvidia_gpu_snp;; diff --git a/tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh b/tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh index 729e9a241..00f0dab18 100755 --- a/tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh +++ b/tools/packaging/kata-deploy/local-build/kata-deploy-merge-builds.sh @@ -11,7 +11,10 @@ set -o pipefail set -o errtrace kata_build_dir=${1:-build} +kata_versions_yaml_file=${2:-""} + tar_path="${PWD}/kata-static.tar.xz" +kata_versions_yaml_file_path="${PWD}/${kata_versions_yaml_file}" pushd "${kata_build_dir}" tarball_content_dir="${PWD}/kata-tarball-content" @@ -24,6 +27,15 @@ do tar -xvf "${c}" -C "${tarball_content_dir}" done +pushd ${tarball_content_dir} + shim="containerd-shim-kata-v2" + shim_path=$(find . -name ${shim} | sort | head -1) + prefix=${shim_path%"bin/${shim}"} + + echo "$(git describe)" > ${prefix}/VERSION + [[ -n "${kata_versions_yaml_file}" ]] && cp ${kata_versions_yaml_file_path} ${prefix}/ +popd + echo "create ${tar_path}" (cd "${tarball_content_dir}"; tar cvfJ "${tar_path}" .) 
rm -rf "${tarball_content_dir}" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-clh.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-clh.yaml new file mode 100644 index 000000000..3c2260252 --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-clh.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-clh +handler: kata-clh +overhead: + podFixed: + memory: "130Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-dragonball.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-dragonball.yaml new file mode 100644 index 000000000..664822c94 --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-dragonball.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-dragonball +handler: kata-dragonball +overhead: + podFixed: + memory: "130Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-fc.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-fc.yaml new file mode 100644 index 000000000..2a087cc8d --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-fc.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-fc +handler: kata-fc +overhead: + podFixed: + memory: "130Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-nvidia-gpu.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-nvidia-gpu.yaml new file mode 100644 index 000000000..f99d3a280 --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-nvidia-gpu.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-nvidia-gpu +handler: kata-qemu-nvidia-gpu +overhead: + 
podFixed: + memory: "160Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-sev.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-sev.yaml new file mode 100644 index 000000000..a9eb75a8e --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-sev.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-sev +handler: kata-qemu-sev +overhead: + podFixed: + memory: "2048Mi" + cpu: "1.0" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-snp.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-snp.yaml new file mode 100644 index 000000000..a285d616d --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-snp.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-snp +handler: kata-qemu-snp +overhead: + podFixed: + memory: "2048Mi" + cpu: "1.0" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-tdx.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-tdx.yaml new file mode 100644 index 000000000..e684291d7 --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu-tdx.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-tdx +handler: kata-qemu-tdx +overhead: + podFixed: + memory: "2048Mi" + cpu: "1.0" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-qemu.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu.yaml new file mode 100644 index 000000000..c362e3ff6 --- /dev/null +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-qemu.yaml @@ -0,0 +1,13 @@ +--- +kind: RuntimeClass +apiVersion: 
node.k8s.io/v1 +metadata: + name: kata-qemu +handler: kata-qemu +overhead: + podFixed: + memory: "160Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml index b55096226..e67cafe10 100644 --- a/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml @@ -1,58 +1,6 @@ --- kind: RuntimeClass apiVersion: node.k8s.io/v1 -metadata: - name: kata-qemu -handler: kata-qemu -overhead: - podFixed: - memory: "160Mi" - cpu: "250m" -scheduling: - nodeSelector: - katacontainers.io/kata-runtime: "true" ---- -kind: RuntimeClass -apiVersion: node.k8s.io/v1 -metadata: - name: kata-qemu-sev -handler: kata-qemu-sev -overhead: - podFixed: - memory: "2048Mi" - cpu: "1.0" -scheduling: - nodeSelector: - katacontainers.io/kata-runtime: "true" ---- -kind: RuntimeClass -apiVersion: node.k8s.io/v1 -metadata: - name: kata-qemu-tdx -handler: kata-qemu-tdx -overhead: - podFixed: - memory: "2048Mi" - cpu: "1.0" -scheduling: - nodeSelector: - katacontainers.io/kata-runtime: "true" ---- -kind: RuntimeClass -apiVersion: node.k8s.io/v1 -metadata: - name: kata-qemu-snp -handler: kata-qemu-snp -overhead: - podFixed: - memory: "2048Mi" - cpu: "1.0" -scheduling: - nodeSelector: - katacontainers.io/kata-runtime: "true" ---- -kind: RuntimeClass -apiVersion: node.k8s.io/v1 metadata: name: kata-clh handler: kata-clh @@ -66,19 +14,6 @@ scheduling: --- kind: RuntimeClass apiVersion: node.k8s.io/v1 -metadata: - name: kata-fc -handler: kata-fc -overhead: - podFixed: - memory: "130Mi" - cpu: "250m" -scheduling: - nodeSelector: - katacontainers.io/kata-runtime: "true" ---- -kind: RuntimeClass -apiVersion: node.k8s.io/v1 metadata: name: kata-dragonball handler: kata-dragonball @@ -92,6 +27,19 @@ scheduling: --- kind: RuntimeClass apiVersion: 
node.k8s.io/v1 +metadata: + name: kata-fc +handler: kata-fc +overhead: + podFixed: + memory: "130Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 metadata: name: kata-qemu-nvidia-gpu handler: kata-qemu-nvidia-gpu @@ -102,3 +50,55 @@ overhead: scheduling: nodeSelector: katacontainers.io/kata-runtime: "true" +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-sev +handler: kata-qemu-sev +overhead: + podFixed: + memory: "2048Mi" + cpu: "1.0" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-snp +handler: kata-qemu-snp +overhead: + podFixed: + memory: "2048Mi" + cpu: "1.0" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu-tdx +handler: kata-qemu-tdx +overhead: + podFixed: + memory: "2048Mi" + cpu: "1.0" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" +--- +kind: RuntimeClass +apiVersion: node.k8s.io/v1 +metadata: + name: kata-qemu +handler: kata-qemu +overhead: + podFixed: + memory: "160Mi" + cpu: "250m" +scheduling: + nodeSelector: + katacontainers.io/kata-runtime: "true" diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index 020e0dcf6..dd364581a 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -10,47 +10,13 @@ set -o nounset crio_drop_in_conf_dir="/etc/crio/crio.conf.d/" crio_drop_in_conf_file="${crio_drop_in_conf_dir}/99-kata-deploy" +crio_drop_in_conf_file_debug="${crio_drop_in_conf_dir}/100-debug" containerd_conf_file="/etc/containerd/config.toml" containerd_conf_file_backup="${containerd_conf_file}.bak" -shims_x86_64=( - "fc" - "qemu" - "qemu-nvidia-gpu" - "qemu-tdx" - "qemu-sev" - 
"qemu-snp" - "clh" - "dragonball" -) +IFS=' ' read -a shims <<< "$SHIMS" -# THOSE ARE NOT YET ON MAIN, PLEASE, MOVE THEM TO THE UPPDER LIST WHENEVER THEY MAKE THEIR WAY IN. -shims_x86_64+=( - "remote" - "clh-tdx" -) - -shims_s390x=( - "qemu" -) - - -# THOSE ARE NOT YET ON MAIN, PLEASE, MOVE THEM TO THE UPPDER LIST WHENEVER THEY MAKE THEIR WAY IN. -shims_s390x+=( - "remote" - "qemu-se" -) - -arch=$(uname -m) -if [[ "${arch}" == "x86_64" ]]; then - shims=(${shims_x86_64[@]}) -elif [[ "${arch}" == "s390x" ]]; then - shims=(${shims_s390x[@]}) -else - die "${arch} is a not supported architecture" -fi - -default_shim="qemu" +default_shim="$DEFAULT_SHIM" # If we fail for any reason a message will be displayed die() { @@ -63,6 +29,41 @@ function print_usage() { echo "Usage: $0 [install/cleanup/reset]" } +function create_runtimeclasses() { + echo "Creating the runtime classes" + + for shim in "${shims[@]}"; do + echo "Creating the kata-${shim} runtime class" + kubectl apply -f /opt/kata-artifacts/runtimeclasses/kata-${shim}.yaml + done + + if [[ "${CREATE_DEFAULT_RUNTIMECLASS}" == "true" ]]; then + echo "Creating the kata runtime class for the default shim (an alias for kata-${default_shim})" + cp /opt/kata-artifacts/runtimeclasses/kata-${default_shim}.yaml /tmp/kata.yaml + sed -i -e 's/kata-'${default_shim}'/kata/g' /tmp/kata.yaml + kubectl apply -f /tmp/kata.yaml + rm -f /tmp/kata.yaml + fi +} + +function delete_runtimeclasses() { + echo "Deleting the runtime classes" + + for shim in "${shims[@]}"; do + echo "Deleting the kata-${shim} runtime class" + kubectl delete -f /opt/kata-artifacts/runtimeclasses/kata-${shim}.yaml + done + + + if [[ "${CREATE_DEFAULT_RUNTIMECLASS}" == "true" ]]; then + echo "Deleting the kata runtime class for the default shim (an alias for kata-${default_shim})" + cp /opt/kata-artifacts/runtimeclasses/kata-${default_shim}.yaml /tmp/kata.yaml + sed -i -e 's/kata-'${default_shim}'/kata/g' /tmp/kata.yaml + kubectl delete -f /tmp/kata.yaml + rm -f 
/tmp/kata.yaml + fi +} + function get_container_runtime() { local runtime=$(kubectl get node $NODE_NAME -o jsonpath='{.status.nodeInfo.containerRuntimeVersion}') @@ -91,6 +92,16 @@ function install_artifacts() { [ -d /opt/kata/runtime-rs/bin ] && \ chmod +x /opt/kata/runtime-rs/bin/* + # Allow enabling debug for Kata Containers + if [[ "${DEBUG}" == "true" ]]; then + config_path="/opt/kata/share/defaults/kata-containers/" + for shim in "${shims[@]}"; do + sed -i -e 's/^#\(enable_debug\).*=.*$/\1 = true/g' "${config_path}/configuration-${shim}.toml" + sed -i -e 's/^#\(debug_console_enabled\).*=.*$/\1 = true/g' "${config_path}/configuration-${shim}.toml" + sed -i -e 's/^kernel_params = "\(.*\)"/kernel_params = "\1 agent.log=debug initcall_debug"/g' "${config_path}/configuration-${shim}.toml" + done + fi + # Allow Mariner to use custom configuration. if [ "${HOST_OS:-}" == "cbl-mariner" ]; then config_path="/opt/kata/share/defaults/kata-containers/configuration-clh.toml" @@ -99,6 +110,10 @@ function install_artifacts() { sed -i -E "s|(valid_hypervisor_paths) = .+|\1 = [\"${clh_path}\"]|" "${config_path}" sed -i -E "s|(path) = \".+/cloud-hypervisor\"|\1 = \"${clh_path}\"|" "${config_path}" fi + + if [[ "${CREATE_RUNTIMECLASSES}" == "true" ]]; then + create_runtimeclasses + fi } function wait_till_node_is_ready() { @@ -198,6 +213,10 @@ function cleanup_different_shims_base() { rm "${default_shim_file}" || true restore_shim "${default_shim_file}" + + if [[ "${CREATE_RUNTIMECLASSES}" == "true" ]]; then + delete_runtimeclasses + fi } function configure_crio_runtime() { @@ -238,6 +257,14 @@ function configure_crio() { for shim in "${shims[@]}"; do configure_crio_runtime $shim done + + + if [ "${DEBUG}" == "true" ]; then + cat <