Merge pull request #7134 from stevenhorsman/CCv0-merge-19th-june

CCv0: Merge main into CCv0 branch
This commit is contained in:
Steve Horsman
2023-06-27 09:16:49 +01:00
committed by GitHub
162 changed files with 5096 additions and 1717 deletions

View File

@@ -2,6 +2,10 @@ name: CI | Build kata-static tarball for amd64
on:
workflow_call:
inputs:
stage:
required: false
type: string
default: test
tarball-suffix:
required: false
type: string
@@ -15,8 +19,11 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
stage:
- ${{ inputs.stage }}
asset:
- cloud-hypervisor
- cloud-hypervisor-glibc
- firecracker
- kernel
- kernel-sev
@@ -32,11 +39,16 @@ jobs:
- qemu-snp-experimental
- qemu-tdx-experimental
- rootfs-image
- rootfs-image-tdx
- rootfs-initrd
- rootfs-initrd-mariner
- rootfs-initrd-sev
- shim-v2
- tdvf
- virtiofsd
exclude:
- stage: release
asset: cloud-hypervisor-glibc
steps:
- name: Login to Kata Containers quay.io
if: ${{ inputs.push-to-registry == 'yes' }}

View File

@@ -52,3 +52,7 @@ jobs:
registry: ghcr.io
repo: ${{ github.repository_owner }}/kata-deploy-ci
tag: ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-amd64
run-metrics-tests:
needs: build-kata-static-tarball-amd64
uses: ./.github/workflows/run-launchtimes-metrics.yaml

View File

@@ -9,6 +9,8 @@ on:
jobs:
build-kata-static-tarball-amd64:
uses: ./.github/workflows/build-kata-static-tarball-amd64.yaml
with:
stage: release
kata-deploy:
needs: build-kata-static-tarball-amd64

View File

@@ -9,6 +9,8 @@ on:
jobs:
build-kata-static-tarball-arm64:
uses: ./.github/workflows/build-kata-static-tarball-arm64.yaml
with:
stage: release
kata-deploy:
needs: build-kata-static-tarball-arm64

View File

@@ -9,6 +9,8 @@ on:
jobs:
build-kata-static-tarball-s390x:
uses: ./.github/workflows/build-kata-static-tarball-s390x.yaml
with:
stage: release
kata-deploy:
needs: build-kata-static-tarball-s390x

View File

@@ -83,7 +83,7 @@ jobs:
- name: push amd64 static tarball to github
run: |
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
tarball="kata-static-$tag-x86_64.tar.xz"
tarball="kata-static-$tag-amd64.tar.xz"
mv kata-static.tar.xz "$GITHUB_WORKSPACE/${tarball}"
pushd $GITHUB_WORKSPACE
echo "uploading asset '${tarball}' for tag: ${tag}"
@@ -97,7 +97,7 @@ jobs:
- name: push arm64 static tarball to github
run: |
tag=$(echo $GITHUB_REF | cut -d/ -f3-)
tarball="kata-static-$tag-aarch64.tar.xz"
tarball="kata-static-$tag-arm64.tar.xz"
mv kata-static.tar.xz "$GITHUB_WORKSPACE/${tarball}"
pushd $GITHUB_WORKSPACE
echo "uploading asset '${tarball}' for tag: ${tag}"

View File

@@ -17,79 +17,54 @@ jobs:
strategy:
fail-fast: false
matrix:
host_os:
- ubuntu
vmm:
- clh
- dragonball
- qemu
include:
- host_os: cbl-mariner
vmm: clh
runs-on: ubuntu-latest
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
GH_PR_NUMBER: ${{ github.event.pull_request.number }}
KATA_HOST_OS: ${{ matrix.host_os }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Download Azure CLI
run: |
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
run: bash tests/integration/gha-run.sh install-azure-cli
- name: Log into the Azure account
run: |
az login \
--service-principal \
-u "${{ secrets.AZ_APPID }}" \
-p "${{ secrets.AZ_PASSWORD }}" \
--tenant "${{ secrets.AZ_TENANT_ID }}"
run: bash tests/integration/gha-run.sh login-azure
env:
AZ_APPID: ${{ secrets.AZ_APPID }}
AZ_PASSWORD: ${{ secrets.AZ_PASSWORD }}
AZ_TENANT_ID: ${{ secrets.AZ_TENANT_ID }}
- name: Create AKS cluster
run: |
az aks create \
-g "kataCI" \
-n "${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-${{ matrix.vmm }}-amd64" \
-s "Standard_D4s_v5" \
--node-count 1 \
--generate-ssh-keys
run: bash tests/integration/gha-run.sh create-cluster
- name: Install `bats`
run: |
sudo apt-get update
sudo apt-get -y install bats
run: bash tests/integration/gha-run.sh install-bats
- name: Install `kubectl`
run: |
sudo az aks install-cli
run: bash tests/integration/gha-run.sh install-kubectl
- name: Download credentials for the Kubernetes CLI to use them
run: |
az aks get-credentials -g "kataCI" -n ${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-${{ matrix.vmm }}-amd64
run: bash tests/integration/gha-run.sh get-cluster-credentials
- name: Run tests
timeout-minutes: 35
run: |
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl apply -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod
kubectl apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
# This is needed as the kata-deploy pod will be set to "Ready" when it starts running,
# which may cause issues like not having the node properly labeled or the artefacts
# properly deployed when the tests actually start running.
sleep 150s
pushd tests/integration/kubernetes
sed -i -e 's|runtimeClassName: kata|runtimeClassName: kata-${{ matrix.vmm }}|' runtimeclass_workloads/*.yaml
bash run_kubernetes_tests.sh
popd
env:
KATA_HYPERVISOR: ${{ matrix.vmm }}
timeout-minutes: 60
run: bash tests/integration/gha-run.sh run-tests-aks
- name: Delete AKS cluster
if: always()
run: |
az aks delete \
-g "kataCI" \
-n "${{ github.event.pull_request.number }}-${{ github.event.pull_request.head.sha }}-${{ matrix.vmm }}-amd64" \
--yes \
--no-wait
run: bash tests/integration/gha-run.sh delete-cluster

View File

@@ -21,6 +21,10 @@ jobs:
- qemu-sev
runs-on: sev
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
KUBECONFIG: /home/kata/.kube/config
steps:
- uses: actions/checkout@v3
@@ -29,40 +33,8 @@ jobs:
- name: Run tests
timeout-minutes: 30
run: |
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl apply -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod
kubectl apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
# This is needed as the kata-deploy pod will be set to "Ready" when it starts running,
# which may cause issues like not having the node properly labeled or the artefacts
# properly deployed when the tests actually start running.
sleep 60s
pushd tests/integration/kubernetes
sed -i -e 's|runtimeClassName: kata|runtimeClassName: kata-${{ matrix.vmm }}|' runtimeclass_workloads/*.yaml
bash run_kubernetes_tests.sh
popd
env:
KATA_HYPERVISOR: ${{ matrix.vmm }}
run: bash tests/integration/gha-run.sh run-tests-sev
- name: Delete kata-deploy
if: always()
run: |
kubectl delete -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
sleep 180s
kubectl delete -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
kubectl delete -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl delete -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
run: bash tests/integration/gha-run.sh cleanup-sev

View File

@@ -21,6 +21,10 @@ jobs:
- qemu-snp
runs-on: sev-snp
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
KUBECONFIG: /home/kata/.kube/config
steps:
- uses: actions/checkout@v3
@@ -29,40 +33,8 @@ jobs:
- name: Run tests
timeout-minutes: 30
run: |
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl apply -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod
kubectl apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
# This is needed as the kata-deploy pod will be set to "Ready" when it starts running,
# which may cause issues like not having the node properly labeled or the artefacts
# properly deployed when the tests actually start running.
sleep 60s
pushd tests/integration/kubernetes
sed -i -e 's|runtimeClassName: kata|runtimeClassName: kata-${{ matrix.vmm }}|' runtimeclass_workloads/*.yaml
bash run_kubernetes_tests.sh
popd
env:
KATA_HYPERVISOR: ${{ matrix.vmm }}
run: bash tests/integration/gha-run.sh run-tests-snp
- name: Delete kata-deploy
if: always()
run: |
kubectl delete -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
sleep 180s
kubectl delete -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
kubectl delete -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl delete -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
run: bash tests/integration/gha-run.sh cleanup-snp

View File

@@ -21,6 +21,10 @@ jobs:
- qemu-tdx
runs-on: tdx
env:
DOCKER_REGISTRY: ${{ inputs.registry }}
DOCKER_REPO: ${{ inputs.repo }}
DOCKER_TAG: ${{ inputs.tag }}
KATA_HYPERVISOR: ${{ matrix.vmm }}
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
steps:
- uses: actions/checkout@v3
@@ -29,40 +33,8 @@ jobs:
- name: Run tests
timeout-minutes: 30
run: |
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml
cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl apply -k tools/packaging/kata-deploy/kata-deploy/overlays/k3s
kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod
kubectl apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
# This is needed as the kata-deploy pod will be set to "Ready" when it starts running,
# which may cause issues like not having the node properly labeled or the artefacts
# properly deployed when the tests actually start running.
sleep 60s
pushd tests/integration/kubernetes
sed -i -e 's|runtimeClassName: kata|runtimeClassName: kata-${{ matrix.vmm }}|' runtimeclass_workloads/*.yaml
bash run_kubernetes_tests.sh
popd
env:
KATA_HYPERVISOR: ${{ matrix.vmm }}
run: bash tests/integration/gha-run.sh run-tests-tdx
- name: Delete kata-deploy
if: always()
run: |
kubectl delete -k tools/packaging/kata-deploy/kata-deploy/overlays/k3s
kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod
sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml
cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image"
kubectl apply -k tools/packaging/kata-deploy/kata-cleanup/overlays/k3s
sleep 180s
kubectl delete -k tools/packaging/kata-deploy/kata-cleanup/overlays/k3s
kubectl delete -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml
kubectl delete -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml
run: bash tests/integration/gha-run.sh cleanup-tdx

View File

@@ -0,0 +1,19 @@
name: CI | Run launch-times metrics
on:
workflow_call:
jobs:
launch-times-tests:
runs-on: metrics
env:
GOPATH: ${{ github.workspace }}
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: run launch times on qemu
run: bash tests/metrics/gha-run.sh run-test-launchtimes-qemu
- name: run launch times on clh
run: bash tests/metrics/gha-run.sh run-test-launchtimes-clh

View File

@@ -1,53 +0,0 @@
name: Release Kata in snapcraft store
on:
push:
tags:
- '[0-9]+.[0-9]+.[0-9]+*'
env:
SNAPCRAFT_STORE_CREDENTIALS: ${{ secrets.snapcraft_token }}
jobs:
release-snap:
runs-on: ubuntu-20.04
steps:
- name: Check out Git repository
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Install Snapcraft
run: |
# Required to avoid snapcraft install failure
sudo chown root:root /
# "--classic" is needed for the GitHub action runner
# environment.
sudo snap install snapcraft --classic
# Allow other parts to access snap binaries
echo /snap/bin >> "$GITHUB_PATH"
- name: Build snap
run: |
# Removing man-db, workflow kept failing, fixes: #4480
sudo apt -y remove --purge man-db
sudo apt-get update
sudo apt-get install -y git git-extras
kata_url="https://github.com/kata-containers/kata-containers"
latest_version=$(git ls-remote --tags ${kata_url} | egrep -o "refs.*" | egrep -v "\-alpha|\-rc|{}" | egrep -o "[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+" | sort -V -r | head -1)
current_version="$(echo ${GITHUB_REF} | cut -d/ -f3)"
# Check semantic versioning format (x.y.z) and if the current tag is the latest tag
if echo "${current_version}" | grep -q "^[[:digit:]]\+\.[[:digit:]]\+\.[[:digit:]]\+$" && echo -e "$latest_version\n$current_version" | sort -C -V; then
# Current version is the latest version, build it
snapcraft snap --debug --destructive-mode
fi
- name: Upload snap
run: |
snap_version="$(echo ${GITHUB_REF} | cut -d/ -f3)"
snap_file="kata-containers_${snap_version}_amd64.snap"
# Upload the snap if it exists
if [ -f ${snap_file} ]; then
snapcraft upload --release=stable ${snap_file}
fi

View File

@@ -1,37 +0,0 @@
name: snap CI
on:
pull_request:
types:
- opened
- synchronize
- reopened
- edited
paths-ignore: [ '**.md', '**.png', '**.jpg', '**.jpeg', '**.svg', '/docs/**' ]
jobs:
test:
runs-on: ubuntu-20.04
steps:
- name: Check out
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Install Snapcraft
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |
# Required to avoid snapcraft install failure
sudo chown root:root /
# "--classic" is needed for the GitHub action runner
# environment.
sudo snap install snapcraft --classic
# Allow other parts to access snap binaries
echo /snap/bin >> "$GITHUB_PATH"
- name: Build snap
if: ${{ !contains(github.event.pull_request.labels.*.name, 'force-skip-ci') }}
run: |
snapcraft snap --debug --destructive-mode

View File

@@ -1,4 +1,6 @@
<img src="https://www.openstack.org/assets/kata/kata-vertical-on-white.png" width="150">
<img src="https://object-storage-ca-ymq-1.vexxhost.net/swift/v1/6e4619c416ff4bd19e1c087f27a43eea/www-images-prod/openstack-logo/kata/SVG/kata-1.svg" width="900">
[![CI | Publish Kata Containers payload](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml/badge.svg)](https://github.com/kata-containers/kata-containers/actions/workflows/payload-after-push.yaml)
# Kata Containers
@@ -144,8 +146,6 @@ The table below lists the remaining parts of the project:
Kata Containers is now
[available natively for most distributions](docs/install/README.md#packaged-installation-methods).
However, packaging scripts and metadata are still used to generate [snap](snap/local) and GitHub releases. See
the [components](#components) section for further details.
## Glossary of Terms

View File

@@ -587,10 +587,15 @@ $ sudo kata-monitor
#### Connect to debug console
Command `kata-runtime exec` is used to connect to the debug console.
You need to start a container first, for example:
```bash
$ sudo ctr run --runtime io.containerd.kata.v2 -d docker.io/library/ubuntu:latest testdebug
```
Then, you can use the command `kata-runtime exec <sandbox id>` to connect to the debug console.
```
$ kata-runtime exec 1a9ab65be63b8b03dfd0c75036d27f0ed09eab38abb45337fea83acd3cd7bacd
$ kata-runtime exec testdebug
bash-4.2# id
uid=0(root) gid=0(root) groups=0(root)
bash-4.2# pwd

View File

@@ -147,7 +147,8 @@ these commands is potentially challenging.
See issue https://github.com/clearcontainers/runtime/issues/341 and [the constraints challenge](#the-constraints-challenge) for more information.
For CPUs resource management see
[CPU constraints](design/vcpu-handling.md).
[CPU constraints(in runtime-go)](design/vcpu-handling-runtime-go.md).
[CPU constraints(in runtime-rs)](design/vcpu-handling-runtime-rs.md).
# Architectural limitations

View File

@@ -6,7 +6,8 @@ Kata Containers design documents:
- [API Design of Kata Containers](kata-api-design.md)
- [Design requirements for Kata Containers](kata-design-requirements.md)
- [VSocks](VSocks.md)
- [VCPU handling](vcpu-handling.md)
- [VCPU handling(in runtime-go)](vcpu-handling-runtime-go.md)
- [VCPU handling(in runtime-rs)](vcpu-handling-runtime-rs.md)
- [VCPU threads pinning](vcpu-threads-pinning.md)
- [Host cgroups](host-cgroups.md)
- [Agent systemd cgroup](agent-systemd-cgroup.md)

View File

@@ -78,4 +78,4 @@ with the containers is if the VM itself or the `containerd-shim-kata-v2` dies, i
the containers are removed automatically.
[1]: https://wiki.qemu.org/Features/VirtioVsock
[2]: ./vcpu-handling.md#virtual-cpus-and-kubernetes-pods
[2]: ./vcpu-handling-runtime-go.md#virtual-cpus-and-kubernetes-pods

View File

@@ -0,0 +1,51 @@
# Virtual machine vCPU sizing in Kata Containers 3.0
> Preview:
> [Kubernetes (since 1.23)][1] and [Containerd (since 1.6.0-beta4)][2] help calculate the `Sandbox Size` info and pass it to Kata Containers through annotations.
> To adapt to this beneficial change while remaining compatible with the past, we have implemented a new way of handling vCPUs in `runtime-rs`, which differs slightly from the original `runtime-go` design.
## When do we need to handle vCPUs size?
vCPU sizing should be determined by the container workloads, so throughout the life cycle of Kata Containers there are several points in time when we need to decide how many vCPUs are needed, mainly at `CreateVM`, `CreateContainer`, `UpdateContainer`, and `DeleteContainer`.
* `CreateVM`: When creating a sandbox, we need to know how many vCPUs to start the VM with.
* `CreateContainer`: When creating a new container in the VM, we may need to hot-plug vCPUs according to the requirements in the container's spec.
* `UpdateContainer`: When receiving the `UpdateContainer` request, we may need to update the vCPU resources according to the new requirements of the container.
* `DeleteContainer`: When a container is removed from the VM, we may need to hot-unplug the vCPUs to reclaim the vCPU resources introduced by the container.
## On what basis do we calculate the number of vCPUs?
When Kata calculates the number of vCPUs, there are three data sources: the `default_vcpus` and `default_maxvcpus` specified in the configuration file (named `TomlConfig` later in the doc), the `io.kubernetes.cri.sandbox-cpu-quota` and `io.kubernetes.cri.sandbox-cpu-period` annotations passed by the upper-layer runtime, and the corresponding CPU resource part of the container's spec when `CreateContainer`/`UpdateContainer`/`DeleteContainer` is requested.
Our understanding of these sources and their priorities is as follows; it affects how we calculate the number of vCPUs later.
* From `TomlConfig`:
* `default_vcpus`: default number of vCPUs when starting a VM.
* `default_maxvcpus`: maximum number of vCPUs.
* From `Annotation`:
* `InitialSize`: we call the resource size passed through the annotations the `InitialSize`. Kubernetes calculates the sandbox size from the Pod's declaration, and that is the `InitialSize` here. This is the size we want to prioritize.
* From `Container Spec`:
* The amount of CPU resources that the container wants to use is declared through the spec. Besides the annotations mentioned above, we mainly consider `cpu quota` and `cpuset` when calculating the number of vCPUs.
* `cpu quota`: `cpu quota` is the most common way to declare the amount of CPU resources. The number of vCPUs introduced by the `cpu quota` declared in a container's spec is `vCPUs = ceiling( quota / period )`, as shown in the sketch after this list.
* `cpuset`: `cpuset` is often used to bind the CPUs that tasks can run on. The number of vCPUs that may be introduced by the `cpuset` declared in a container's spec is the number of CPUs in the set that do not overlap with other containers.
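As a minimal, illustrative sketch of the ceiling rule above (the helper name and signature are hypothetical, not the actual `runtime-rs` API):
```rust
// Hypothetical helper: the number of vCPUs implied by a container's CPU quota/period,
// following the ceiling( quota / period ) rule described above.
fn vcpus_from_quota(quota: i64, period: u64) -> u32 {
    if quota <= 0 || period == 0 {
        // No quota (or an unlimited quota): this spec does not constrain the vCPU count.
        return 0;
    }
    // Ceiling division, e.g. quota = 150_000 and period = 100_000 give 2 vCPUs.
    ((quota as u64 + period - 1) / period) as u32
}

fn main() {
    assert_eq!(vcpus_from_quota(150_000, 100_000), 2);
    assert_eq!(vcpus_from_quota(100_000, 100_000), 1);
    assert_eq!(vcpus_from_quota(-1, 100_000), 0); // -1 means "unlimited" in the OCI spec
}
```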
## How to calculate and adjust the vCPUs size:
There are two vCPU numbers we need to consider: one is the number of vCPUs when starting the VM (named `Boot Size` in this doc); the other is the number of vCPUs when a `CreateContainer`/`UpdateContainer`/`DeleteContainer` request is received (`Real-time Size` in this doc).
### `Boot Size`
The main considerations are `InitialSize` and `default_vcpus`. There are the following principles:
`InitialSize` has priority over `default_vcpus` declared in `TomlConfig`.
1. When the annotations are present, the original `default_vcpus` is replaced by the number of vCPUs in the `InitialSize`, which becomes the `Boot Size`. (Because not all runtimes support these annotations yet, we still keep `default_vcpus` in `TomlConfig`.)
2. When the specs of all containers are aggregated to calculate the sandbox size, the method is consistent with the `InitialSize` calculation here, as sketched below.
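A minimal sketch of the `Boot Size` selection, assuming a hypothetical helper rather than the actual `runtime-rs` code:
```rust
// Illustrative only: pick the Boot Size, giving the InitialSize from the
// sandbox-size annotations priority over default_vcpus from TomlConfig.
fn boot_size(default_vcpus: u32, initial_size: Option<u32>) -> u32 {
    match initial_size {
        // Annotations present: InitialSize overrides default_vcpus.
        Some(n) if n > 0 => n,
        // No (or empty) annotations: fall back to default_vcpus.
        _ => default_vcpus,
    }
}

fn main() {
    assert_eq!(boot_size(1, Some(4)), 4); // the annotation wins
    assert_eq!(boot_size(1, None), 1);    // fall back to TomlConfig
}
```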
### `Real-time Size`
When we receive an OCI request, it may target a single container, but what we have to consider is the number of vCPUs for the entire VM. So we maintain a list, and every time an adjustment is needed the entire list is traversed to calculate the number of vCPUs (see the sketch after this list). In addition, the following principles apply:
1. Do not cut computing power; try to keep the number of vCPUs specified by `InitialSize`.
* So the number of vCPUs after an adjustment will not be less than the `Boot Size`.
2. `cpu quota` takes precedence over `cpuset`, and the setting history is taken into account.
* Quota describes the CPU time slice a cgroup may consume, while `cpuset` only describes which CPUs the cgroup may run on; a cgroup can be bound to the specified CPUs yet consume only a small time slice. Quota therefore better reflects how much CPU a cgroup actually wants to use, so it takes precedence over `cpuset`.
* On the one hand, when both `cpu quota` and `cpuset` are specified, we calculate the number of vCPUs based on `cpu quota` and ignore `cpuset`. On the other hand, if `cpu quota` was used to control the number of vCPUs in the past and only `cpuset` was updated during `UpdateContainer`, we do not adjust the number of vCPUs at that time.
3. `StaticSandboxResourceMgmt` controls hotplug.
* Some VMMs and kernels of some architectures do not support hotplugging. We can accommodate this situation through `StaticSandboxResourceMgmt`. When `StaticSandboxResourceMgmt = true` is set, we don't make any further attempts to update the number of vCPUs after booting.
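The list traversal and the principles above can be summarised with a small sketch; the types and field names are hypothetical and do not mirror the actual `runtime-rs` implementation:
```rust
// Per-container CPU request, already reduced to vCPU counts.
#[derive(Clone, Copy)]
struct CpuRequest {
    quota_vcpus: u32,  // ceiling(quota / period), 0 when no quota is set
    cpuset_vcpus: u32, // non-overlapping CPUs named in cpuset, 0 when unset
}

// Traverse the whole container list and compute the Real-time Size.
fn realtime_size(boot_size: u32, default_maxvcpus: u32, containers: &[CpuRequest]) -> u32 {
    let wanted: u32 = containers
        .iter()
        // cpu quota takes precedence over cpuset when both are declared.
        .map(|c| if c.quota_vcpus > 0 { c.quota_vcpus } else { c.cpuset_vcpus })
        .sum();
    // Never cut below the Boot Size; never exceed default_maxvcpus.
    wanted.max(boot_size).min(default_maxvcpus)
}

fn main() {
    let containers = [
        CpuRequest { quota_vcpus: 2, cpuset_vcpus: 0 },
        CpuRequest { quota_vcpus: 0, cpuset_vcpus: 1 },
    ];
    assert_eq!(realtime_size(2, 8, &containers), 3);
}
```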
[1]: https://github.com/kubernetes/kubernetes/pull/104886
[2]: https://github.com/containerd/containerd/pull/6155

View File

@@ -45,6 +45,8 @@
- [How to run Kata Containers with `nydus`](how-to-use-virtio-fs-nydus-with-kata.md)
- [How to run Kata Containers with AMD SEV-SNP](how-to-run-kata-containers-with-SNP-VMs.md)
- [How to use EROFS to build rootfs in Kata Containers](how-to-use-erofs-build-rootfs.md)
- [How to run Kata Containers with kinds of Block Volumes](how-to-run-kata-containers-with-kinds-of-Block-Volumes.md)
## Confidential Containers
- [How to use build and test the Confidential Containers `CCv0` proof of concept](how-to-build-and-test-ccv0.md)
- [How to generate a Kata Containers payload for the Confidential Containers Operator](how-to-generate-a-kata-containers-payload-for-the-confidential-containers-operator.md)

View File

@@ -0,0 +1,78 @@
# A new way for Kata Containers to use Kinds of Block Volumes
> **Note:** This guide only applies to runtime-rs with the default hypervisor, Dragonball.
> Support for other hypervisors is still in progress, and this guide will be updated when they are ready.
## Background
Currently, there is no widely applicable and convenient way for users to consume some kinds of backend storage with Kata Containers, such as file-on-host based block volumes, SPDK based volumes, or VFIO device based volumes, so we adopt [Proposal: Direct Block Device Assignment](https://github.com/kata-containers/kata-containers/blob/main/docs/design/direct-blk-device-assignment.md) to address this.
## Solution
According to the proposal, the `kata-ctl direct-volume` command is used to add a directly assigned block volume device to the Kata Containers runtime.
Then, with the help of the [get_volume_mount_info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L95) method, the information is read from the JSON file (`mountinfo.json`) and parsed into the [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70) structure, which is used to save the device-related information.
To describe a device, we only fill in `mountinfo.json` fields such as `device`, `volume_type`, `fs_type`, `metadata` and `options`, which correspond to the fields in [Direct Volume Info](https://github.com/kata-containers/kata-containers/blob/099b4b0d0e3db31b9054e7240715f0d7f51f9a1c/src/libs/kata-types/src/mount.rs#L70).
The JSON file `mountinfo.json` is placed in a sub-path, `/kubelet/kata-test-vol-001/volume001`, under the fixed path `/run/kata-containers/shared/direct-volumes/`.
The full path would look like `/run/kata-containers/shared/direct-volumes/kubelet/kata-test-vol-001/volume001`, but for security reasons the sub-path is
encoded, giving `/run/kata-containers/shared/direct-volumes/L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx`.
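The encoded directory name is the base64 encoding of the volume sub-path, as the example above shows. A minimal sketch, assuming the `base64` crate's pre-0.21 `encode` helper (an illustration only, not the Kata implementation):
```rust
// Illustrative only: map a volume sub-path to its encoded directory under the
// fixed direct-volumes path, using standard base64 of the sub-path.
fn encoded_volume_dir(volume_path: &str) -> String {
    format!(
        "/run/kata-containers/shared/direct-volumes/{}",
        base64::encode(volume_path)
    )
}

fn main() {
    // "/kubelet/kata-test-vol-001/volume001" encodes to
    // "L2t1YmVsZXQva2F0YS10ZXN0LXZvbC0wMDEvdm9sdW1lMDAx", matching the path above.
    println!("{}", encoded_volume_dir("/kubelet/kata-test-vol-001/volume001"));
}
```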
Finally, when running Kata Containers with `ctr run --mount type=X,src=Y,dst=Z,options=rbind:rw`, `type=X` should be set to the dedicated type designed for the kind of volume being used.
Currently supported types:
- `directvol` for direct volume
- `spdkvol` for SPDK volume (TBD)
- `vfiovol` for VFIO device based volume (TBD)
## Setup Device and Run a Kata-Containers
### Direct Block Device Based Volume
#### Create a raw block based backend storage
> **Tip:** the raw block based backend storage MUST be formatted with `mkfs`.
```bash
$ sudo dd if=/dev/zero of=/tmp/stor/rawdisk01.20g bs=1M count=20480
$ sudo mkfs.ext4 /tmp/stor/rawdisk01.20g
```
#### Set up the direct block device for Kata Containers
```json
{
"device": "/tmp/stor/rawdisk01.20g",
"volume_type": "directvol",
"fs_type": "ext4",
"metadata":"{}",
"options": []
}
```
```bash
$ sudo ./kata-ctl direct-volume add /kubelet/kata-direct-vol-002/directvol002 "{\"device\": \"/tmp/stor/rawdisk01.20g\", \"volume_type\": \"directvol\", \"fs_type\": \"ext4\", \"metadata\":"{}", \"options\": []}"
$# /kubelet/kata-direct-vol-002/directvol002 <==> /run/kata-containers/shared/direct-volumes/W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx
$ cat W1lMa2F0ZXQva2F0YS10a2F0DAxvbC0wMDEvdm9sdW1lMDAx/mountInfo.json
{"volume_type":"directvol","device":"/tmp/stor/rawdisk01.20g","fs_type":"ext4","metadata":{},"options":[]}
```
#### Run a Kata container with direct block device volume
```bash
$ # type=directvol,src=/kubelet/kata-direct-vol-002/directvol002,dst=/disk002,options=rbind:rw
$ sudo ctr run -t --rm --runtime io.containerd.kata.v2 --mount type=directvol,src=/kubelet/kata-direct-vol-002/directvol002,dst=/disk002,options=rbind:rw "$image" kata-direct-vol-xx05302045 /bin/bash
```
### SPDK Device Based Volume
TBD
### VFIO Device Based Volume
TBD

View File

@@ -19,7 +19,6 @@ Packaged installation methods uses your distribution's native package format (su
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. |
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. |
| ~~[Using snap](#snap-installation)~~ | ~~Easy to install~~ | ~~yes~~ | **Snap is unmaintained!** ~~Good alternative to official distro packages.~~ |
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. |
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. |
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. |
@@ -42,27 +41,6 @@ Kata packages are provided by official distribution repositories for:
| [CentOS](centos-installation-guide.md) | 8 |
| [Fedora](fedora-installation-guide.md) | 34 |
### Snap Installation
> **WARNING:**
>
> The Snap package method is **unmaintained** and only provides an old
> version of Kata Containers:
> The [latest Kata Containers snap](https://snapcraft.io/kata-containers)
> provides Kata Containers
> [version 2.4.2](https://github.com/kata-containers/kata-containers/releases/tag/2.4.2)
> but the latest stable Kata Containers release at the time of writing is
> [version 3.1.0](https://github.com/kata-containers/kata-containers/releases/tag/3.1.0).
>
> We recommend strongly that you switch to an alternative Kata Containers installation method.
>
> See: https://github.com/kata-containers/kata-containers/issues/6769
> for further details.
~~The snap installation is available for all distributions which support `snapd`.~~
~~[Use snap](snap-installation-guide.md) to install Kata Containers from https://snapcraft.io. ~~
### Automatic Installation
[Use `kata-manager`](/utils/README.md) to automatically install a working Kata Containers system.

View File

@@ -26,7 +26,6 @@ architectures:
|------------------------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|----------- |
| [Using kata-deploy](#kata-deploy-installation) | The preferred way to deploy the Kata Containers distributed binaries on a Kubernetes cluster | **No!** | Best way to give it a try on kata-containers on an already up and running Kubernetes cluster. | Yes |
| [Using official distro packages](#official-packages) | Kata packages provided by Linux distributions official repositories | yes | Recommended for most users. | No |
| [Using snap](#snap-installation) | Easy to install | yes | Good alternative to official distro packages. | No |
| [Automatic](#automatic-installation) | Run a single command to install a full system | **No!** | For those wanting the latest release quickly. | No |
| [Manual](#manual-installation) | Follow a guide step-by-step to install a working system | **No!** | For those who want the latest release with more control. | No |
| [Build from source](#build-from-source-installation) | Build the software components manually | **No!** | Power users and developers only. | Yes |
@@ -36,8 +35,6 @@ architectures:
Follow the [`kata-deploy`](../../tools/packaging/kata-deploy/README.md).
### Official packages
`ToDo`
### Snap Installation
`ToDo`
### Automatic Installation
`ToDo`
### Manual Installation

View File

@@ -1,82 +0,0 @@
# Kata Containers snap package
> **WARNING:**
>
> The Snap package method is **unmaintained** and only provides an old
> version of Kata Containers:
> The [latest Kata Containers snap](https://snapcraft.io/kata-containers)
> provides Kata Containers
> [version 2.4.2](https://github.com/kata-containers/kata-containers/releases/tag/2.4.2)
> but the latest stable Kata Containers release at the time of writing is
> [version 3.1.0](https://github.com/kata-containers/kata-containers/releases/tag/3.1.0).
>
> We recommend strongly that you switch to an alternative Kata Containers installation method.
>
> See: https://github.com/kata-containers/kata-containers/issues/6769
> for further details.
## Install Kata Containers
Kata Containers can be installed in any Linux distribution that supports
[snapd](https://docs.snapcraft.io/installing-snapd).
Run the following command to install **Kata Containers**:
> **WARNING:**
>
> The Snap package method is **unmaintained** and only provides an old
> version of Kata Containers:
> The [latest Kata Containers snap](https://snapcraft.io/kata-containers)
> provides Kata Containers
> [version 2.4.2](https://github.com/kata-containers/kata-containers/releases/tag/2.4.2)
> but the latest stable Kata Containers release at the time of writing is
> [version 3.1.0](https://github.com/kata-containers/kata-containers/releases/tag/3.1.0).
>
> We recommend strongly that you switch to an alternative Kata Containers installation method.
>
> See: https://github.com/kata-containers/kata-containers/issues/6769
> for further details.
```sh
$ sudo snap install kata-containers --stable --classic
```
## Configure Kata Containers
By default Kata Containers snap image is mounted at `/snap/kata-containers` as a
read-only file system, therefore default configuration file can not be edited.
Fortunately Kata Containers supports loading a configuration file from another
path than the default.
```sh
$ sudo mkdir -p /etc/kata-containers
$ sudo cp /snap/kata-containers/current/usr/share/defaults/kata-containers/configuration.toml /etc/kata-containers/
$ $EDITOR /etc/kata-containers/configuration.toml
```
## Integration with shim v2 Container Engines
The Container engine daemon (`cri-o`, `containerd`, etc) needs to be able to find the
`containerd-shim-kata-v2` binary to allow Kata Containers to be created.
Run the following command to create a symbolic link to the shim v2 binary.
```sh
$ sudo ln -sf /snap/kata-containers/current/usr/bin/containerd-shim-kata-v2 /usr/local/bin/containerd-shim-kata-v2
```
Once the symbolic link has been created and the engine daemon configured, `io.containerd.kata.v2`
can be used as runtime.
Read the following documents to know how to run Kata Containers 2.x with `containerd`.
* [How to use Kata Containers and Containerd](../how-to/containerd-kata.md)
* [Install Kata Containers with containerd](./container-manager/containerd/containerd-install.md)
## Remove Kata Containers snap package
Run the following command to remove the Kata Containers snap:
```sh
$ sudo snap remove kata-containers
```

View File

@@ -1,101 +0,0 @@
# Kata Containers snap image
This directory contains the resources needed to build the Kata Containers
[snap][1] image.
## Initial setup
Kata Containers can be installed in any Linux distribution that supports
[snapd](https://docs.snapcraft.io/installing-snapd). For this example, we
assume Ubuntu as your base distro.
```sh
$ sudo apt-get --no-install-recommends install -y apt-utils ca-certificates snapd snapcraft
```
## Install snap
You can install the Kata Containers snap from the [snapcraft store][8] or by running the following command:
```sh
$ sudo snap install kata-containers --classic
```
## Build and install snap image
Run the command below which will use the packaging Makefile to build the snap image:
```sh
$ make -C tools/packaging snap
```
> **Warning:**
>
> By default, `snapcraft` will create a clean virtual machine
> environment to build the snap in using the `multipass` tool.
>
> However, `multipass` is silently disabled when `--destructive-mode` is
> used.
>
> Since building the Kata Containers package currently requires
> `--destructive-mode`, the snap will be built using the host
> environment. To avoid parts of the build auto-detecting additional
> features to enable (for example for QEMU), we recommend that you
> only run the snap build in a minimal host environment.
To install the resulting snap image, snap must be put in [classic mode][3] and the
security confinement must be disabled (`--classic`). Also since the resulting snap
has not been signed the verification of signature must be omitted (`--dangerous`).
```sh
$ sudo snap install --classic --dangerous "kata-containers_${version}_${arch}.snap"
```
Replace `${version}` with the current version of Kata Containers and `${arch}` with
the system architecture.
## Configure Kata Containers
By default Kata Containers snap image is mounted at `/snap/kata-containers` as a
read-only file system, therefore default configuration file can not be edited.
Fortunately [`kata-runtime`][4] supports loading a configuration file from another
path than the default.
```sh
$ sudo mkdir -p /etc/kata-containers
$ sudo cp /snap/kata-containers/current/usr/share/defaults/kata-containers/configuration.toml /etc/kata-containers/
$ $EDITOR /etc/kata-containers/configuration.toml
```
## Integration with docker and Kubernetes
The path to the runtime provided by the Kata Containers snap image is
`/snap/kata-containers/current/usr/bin/kata-runtime`. You should use it to
run Kata Containers with [docker][9] and [Kubernetes][10].
## Remove snap
You can remove the Kata Containers snap by running the following command:
```sh
$ sudo snap remove kata-containers
```
## Limitations
The [miniOS image][2] is not included in the snap image as it is not possible for
QEMU to open a guest RAM backing store on a read-only filesystem. Fortunately,
you can start Kata Containers with a Linux initial RAM disk (initrd) that is
included in the snap image. If you want to use the miniOS image instead of initrd,
then a new configuration file can be [created](#configure-kata-containers)
and [configured][7].
[1]: https://docs.snapcraft.io/snaps/intro
[2]: ../../docs/design/architecture/README.md#root-filesystem-image
[3]: https://docs.snapcraft.io/reference/confinement#classic
[4]: https://github.com/kata-containers/kata-containers/tree/main/src/runtime#configuration
[5]: https://docs.docker.com/engine/reference/commandline/dockerd
[6]: ../../docs/install/docker/ubuntu-docker-install.md
[7]: ../../docs/Developer-Guide.md#configure-to-use-initrd-or-rootfs-image
[8]: https://snapcraft.io/kata-containers
[9]: ../../docs/Developer-Guide.md#run-kata-containers-with-docker
[10]: ../../docs/Developer-Guide.md#run-kata-containers-with-kubernetes

View File

@@ -1,114 +0,0 @@
#!/usr/bin/env bash
#
# Copyright (c) 2022 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
# Description: Idempotent script to be sourced by all parts in a
# snapcraft config file.
set -o errexit
set -o nounset
set -o pipefail
# XXX: Bash-specific code. zsh doesn't support this option and that *does*
# matter if this script is run sourced... since it'll be using zsh! ;)
[ -n "$BASH_VERSION" ] && set -o errtrace
[ -n "${DEBUG:-}" ] && set -o xtrace
die()
{
echo >&2 "ERROR: $0: $*"
}
[ -n "${SNAPCRAFT_STAGE:-}" ] ||\
die "must be sourced from a snapcraft config file"
snap_yq_version=3.4.1
snap_common_install_yq()
{
export yq="${SNAPCRAFT_STAGE}/bin/yq"
local yq_pkg
yq_pkg="github.com/mikefarah/yq"
local yq_url
yq_url="https://${yq_pkg}/releases/download/${snap_yq_version}/yq_${goos}_${goarch}"
curl -o "${yq}" -L "${yq_url}"
chmod +x "${yq}"
}
# Function that should be called for each snap "part" in
# snapcraft.yaml.
snap_common_main()
{
# Architecture
arch="$(uname -m)"
case "${arch}" in
aarch64)
goarch="arm64"
qemu_arch="${arch}"
;;
ppc64le)
goarch="ppc64le"
qemu_arch="ppc64"
;;
s390x)
goarch="${arch}"
qemu_arch="${arch}"
;;
x86_64)
goarch="amd64"
qemu_arch="${arch}"
;;
*) die "unsupported architecture: ${arch}" ;;
esac
dpkg_arch=$(dpkg --print-architecture)
# golang
#
# We need the O/S name in golang format, but since we don't
# know if the godeps part has run, we don't know if golang is
# available yet, hence fall back to a standard system command.
goos="$(go env GOOS &>/dev/null || true)"
[ -z "$goos" ] && goos=$(uname -s|tr '[A-Z]' '[a-z]')
export GOROOT="${SNAPCRAFT_STAGE}"
export GOPATH="${GOROOT}/gopath"
export GO111MODULE="auto"
mkdir -p "${GOPATH}/bin"
export PATH="${GOPATH}/bin:${PATH}"
# Proxy
export http_proxy="${http_proxy:-}"
export https_proxy="${https_proxy:-}"
# Binaries
mkdir -p "${SNAPCRAFT_STAGE}/bin"
export PATH="$PATH:${SNAPCRAFT_STAGE}/bin"
# YAML query tool
export yq="${SNAPCRAFT_STAGE}/bin/yq"
# Kata paths
export kata_dir=$(printf "%s/src/github.com/%s/%s" \
"${GOPATH}" \
"${SNAPCRAFT_PROJECT_NAME}" \
"${SNAPCRAFT_PROJECT_NAME}")
export versions_file="${kata_dir}/versions.yaml"
[ -n "${yq:-}" ] && [ -x "${yq:-}" ] || snap_common_install_yq
}
snap_common_main

View File

@@ -1,170 +0,0 @@
name: kata-containers
website: https://github.com/kata-containers/kata-containers
summary: Build lightweight VMs that seamlessly plug into the containers ecosystem
description: |
Kata Containers is an open source project and community working to build a
standard implementation of lightweight Virtual Machines (VMs) that feel and
perform like containers, but provide the workload isolation and security
advantages of VMs
confinement: classic
adopt-info: metadata
base: core20
parts:
metadata:
plugin: nil
prime:
- -*
build-packages:
- git
- git-extras
override-pull: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
version="9999"
if echo "${GITHUB_REF:-}" | grep -q -E "^refs/tags"; then
version=$(echo ${GITHUB_REF:-} | cut -d/ -f3)
git checkout ${version}
fi
snapcraftctl set-grade "stable"
snapcraftctl set-version "${version}"
mkdir -p $(dirname ${kata_dir})
ln -sf $(realpath "${SNAPCRAFT_STAGE}/..") ${kata_dir}
docker:
after: [metadata]
plugin: nil
prime:
- -*
build-packages:
- ca-certificates
- containerd
- curl
- gnupg
- lsb-release
- runc
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
curl -fsSL https://download.docker.com/linux/ubuntu/gpg |\
sudo gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
distro_codename=$(lsb_release -cs)
echo "deb [arch=${dpkg_arch} signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu ${distro_codename} stable" |\
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get -y update
sudo apt-get -y install docker-ce docker-ce-cli containerd.io
echo "Unmasking docker service"
sudo -E systemctl unmask docker.service || true
sudo -E systemctl unmask docker.socket || true
echo "Adding $USER into docker group"
sudo -E gpasswd -a $USER docker
echo "Starting docker"
# docker may fail to start using "fd://" in docker.service
sudo sed -i 's/fd:\/\//unix:\/\//g' /lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo -E systemctl start docker || true
image:
after: [docker]
plugin: nil
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
cd "${SNAPCRAFT_PROJECT_DIR}"
sudo -E NO_TTY=true make rootfs-image-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-rootfs-image.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
sudo -E NO_TTY=true make rootfs-initrd-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-rootfs-initrd.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
runtime:
after: [docker]
plugin: nil
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
cd "${SNAPCRAFT_PROJECT_DIR}"
sudo -E NO_TTY=true make shim-v2-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-shim-v2.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
mkdir -p "${SNAPCRAFT_PART_INSTALL}/usr/bin"
ln -sf "${SNAPCRAFT_PART_INSTALL}/opt/kata/bin/containerd-shim-kata-v2" "${SNAPCRAFT_PART_INSTALL}/usr/bin/containerd-shim-kata-v2"
ln -sf "${SNAPCRAFT_PART_INSTALL}/opt/kata/bin/kata-runtime" "${SNAPCRAFT_PART_INSTALL}/usr/bin/kata-runtime"
ln -sf "${SNAPCRAFT_PART_INSTALL}/opt/kata/bin/kata-collect-data.sh" "${SNAPCRAFT_PART_INSTALL}/usr/bin/kata-collect-data.sh"
kernel:
after: [docker]
plugin: nil
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
cd "${SNAPCRAFT_PROJECT_DIR}"
sudo -E NO_TTY=true make kernel-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-kernel.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
qemu:
plugin: make
after: [docker]
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
cd "${SNAPCRAFT_PROJECT_DIR}"
sudo -E NO_TTY=true make qemu-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-qemu.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
virtiofsd:
plugin: nil
after: [docker]
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
cd "${SNAPCRAFT_PROJECT_DIR}"
sudo -E NO_TTY=true make virtiofsd-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-virtiofsd.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
cloud-hypervisor:
plugin: nil
after: [docker]
override-build: |
source "${SNAPCRAFT_PROJECT_DIR}/snap/local/snap-common.sh"
if [ "${arch}" == "aarch64" ] || [ "${arch}" == "x86_64" ]; then
cd "${SNAPCRAFT_PROJECT_DIR}"
sudo -E NO_TTY=true make cloud-hypervisor-tarball
tarfile="${SNAPCRAFT_PROJECT_DIR}/tools/packaging/kata-deploy/local-build/build/kata-static-cloud-hypervisor.tar.xz"
tar -xvJpf "${tarfile}" -C "${SNAPCRAFT_PART_INSTALL}"
fi
apps:
runtime:
command: usr/bin/kata-runtime
shim:
command: usr/bin/containerd-shim-kata-v2
collect-data:
command: usr/bin/kata-collect-data.sh

src/agent/Cargo.lock (generated)
View File

@@ -2146,6 +2146,7 @@ dependencies = [
"num_cpus",
"oci",
"regex",
"safe-path",
"serde",
"serde_json",
"slog",
@@ -2285,6 +2286,7 @@ dependencies = [
"slog-async",
"slog-json",
"slog-scope",
"slog-term",
]
[[package]]
@@ -2645,6 +2647,15 @@ dependencies = [
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
[[package]]
name = "objc"
version = "0.2.7"
@@ -3887,6 +3898,13 @@ version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
[[package]]
name = "safe-path"
version = "0.1.0"
dependencies = [
"libc",
]
[[package]]
name = "salsa20"
version = "0.10.2"
@@ -4384,6 +4402,19 @@ dependencies = [
"slog-scope",
]
[[package]]
name = "slog-term"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87d29185c55b7b258b4f120eab00f48557d4d9bc814f41713f449d35b0f8977c"
dependencies = [
"atty",
"slog",
"term",
"thread_local",
"time 0.3.20",
]
[[package]]
name = "smallvec"
version = "1.10.0"
@@ -4646,6 +4677,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890"
dependencies = [
"itoa",
"libc",
"num_threads",
"serde",
"time-core",
"time-macros",

View File

@@ -161,7 +161,7 @@ impl Process {
pub fn notify_term_close(&mut self) {
let notify = self.term_exit_notifier.clone();
notify.notify_one();
notify.notify_waiters();
}
pub fn close_stdin(&mut self) {

View File

@@ -81,7 +81,8 @@ cfg_if! {
// sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00
pub const ACPI_DEV_PATH: &str = "/devices/LNXSYSTM";
pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu";
pub const SYSFS_CPU_PATH: &str = "/sys/devices/system/cpu";
pub const SYSFS_CPU_ONLINE_PATH: &str = "/sys/devices/system/cpu/online";
pub const SYSFS_MEMORY_BLOCK_SIZE_PATH: &str = "/sys/devices/system/memory/block_size_bytes";
pub const SYSFS_MEMORY_HOTPLUG_PROBE_PATH: &str = "/sys/devices/system/memory/probe";

View File

@@ -89,6 +89,27 @@ impl Handle {
.await?;
}
// We need to update the link's interface name, so we should first rename any existing link whose
// name matches the requested name; otherwise the link update would fail because of the name
// conflict.
let mut new_link = None;
if link.name() != iface.name {
if let Ok(link) = self.find_link(LinkFilter::Name(iface.name.as_str())).await {
// rename the existing interface to a temporary name, otherwise
// updating this interface to an already-existing name would fail.
let mut request = self.handle.link().set(link.index());
request.message_mut().header = link.header.clone();
request
.name(format!("{}_temp", link.name()))
.up()
.execute()
.await?;
new_link = Some(link);
}
}
// Update link
let mut request = self.handle.link().set(link.index());
request.message_mut().header = link.header.clone();
@@ -101,6 +122,14 @@ impl Handle {
.execute()
.await?;
// swap the updated iface's name.
if let Some(nlink) = new_link {
let mut request = self.handle.link().set(nlink.index());
request.message_mut().header = nlink.header.clone();
request.name(link.name()).up().execute().await?;
}
Ok(())
}

View File

@@ -673,15 +673,16 @@ impl AgentService {
let cid = req.container_id;
let eid = req.exec_id;
let mut term_exit_notifier = Arc::new(tokio::sync::Notify::new());
let term_exit_notifier;
let reader = {
let s = self.sandbox.clone();
let mut sandbox = s.lock().await;
let p = sandbox.find_container_process(cid.as_str(), eid.as_str())?;
term_exit_notifier = p.term_exit_notifier.clone();
if p.term_master.is_some() {
term_exit_notifier = p.term_exit_notifier.clone();
p.get_reader(StreamType::TermMaster)
} else if stdout {
if p.parent_stdout.is_some() {

View File

@@ -12,6 +12,7 @@ use crate::pci;
use crate::uevent::{Uevent, UeventMatcher};
use crate::watcher::BindWatcher;
use anyhow::{anyhow, Context, Result};
use kata_types::cpu::CpuSet;
use libc::pid_t;
use oci::{Hook, Hooks};
use protocols::agent::OnlineCPUMemRequest;
@@ -25,6 +26,7 @@ use std::collections::HashMap;
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::str::FromStr;
use std::sync::Arc;
use std::{thread, time};
use tokio::sync::mpsc::{channel, Receiver, Sender};
@@ -265,12 +267,12 @@ impl Sandbox {
pub fn online_cpu_memory(&self, req: &OnlineCPUMemRequest) -> Result<()> {
if req.nb_cpus > 0 {
// online cpus
online_cpus(&self.logger, req.nb_cpus as i32)?;
online_cpus(&self.logger, req.nb_cpus as i32).context("online cpus")?;
}
if !req.cpu_only {
// online memory
online_memory(&self.logger)?;
online_memory(&self.logger).context("online memory")?;
}
if req.nb_cpus == 0 {
@@ -434,23 +436,33 @@ fn online_resources(logger: &Logger, path: &str, pattern: &str, num: i32) -> Res
// max wait for all CPUs to online will use 50 * 100 = 5 seconds.
const ONLINE_CPUMEM_WATI_MILLIS: u64 = 50;
const ONLINE_CPUMEM_MAX_RETRIES: u32 = 100;
const ONLINE_CPUMEM_MAX_RETRIES: i32 = 100;
#[instrument]
fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
let mut onlined_count: i32 = 0;
let mut onlined_cpu_count = onlined_cpus().context("onlined cpu count")?;
// For some VMMs, like Dragonball, the CPUs are onlined for us,
// so first check whether the agent needs to do the online operation.
if onlined_cpu_count >= num {
return Ok(num);
}
for i in 0..ONLINE_CPUMEM_MAX_RETRIES {
let r = online_resources(
// online num resources
online_resources(
logger,
SYSFS_CPU_ONLINE_PATH,
SYSFS_CPU_PATH,
r"cpu[0-9]+",
num - onlined_count,
);
num - onlined_cpu_count,
)
.context("online cpu resource")?;
onlined_count += r?;
if onlined_count == num {
info!(logger, "online {} CPU(s) after {} retries", num, i);
onlined_cpu_count = onlined_cpus().context("onlined cpu count")?;
if onlined_cpu_count >= num {
info!(
logger,
"Currently {} onlined CPU(s) after {} retries", onlined_cpu_count, i
);
return Ok(num);
}
thread::sleep(time::Duration::from_millis(ONLINE_CPUMEM_WATI_MILLIS));
@@ -465,10 +477,18 @@ fn online_cpus(logger: &Logger, num: i32) -> Result<i32> {
#[instrument]
fn online_memory(logger: &Logger) -> Result<()> {
online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)?;
online_resources(logger, SYSFS_MEMORY_ONLINE_PATH, r"memory[0-9]+", -1)
.context("online memory resource")?;
Ok(())
}
fn onlined_cpus() -> Result<i32> {
let content =
fs::read_to_string(SYSFS_CPU_ONLINE_PATH).context("read sysfs cpu online file")?;
let online_cpu_set = CpuSet::from_str(content.trim())?;
Ok(online_cpu_set.len() as i32)
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -180,9 +180,9 @@ dependencies = [
[[package]]
name = "crossbeam-channel"
version = "0.5.7"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf2b3e8478797446514c91ef04bafcb59faba183e621ad488df88983cc14128c"
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
dependencies = [
"cfg-if",
"crossbeam-utils",
@@ -209,11 +209,12 @@ dependencies = [
[[package]]
name = "dbs-address-space"
version = "0.2.2"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bcc37dc0b8ffae1c5911d13ae630dc7a9020fa0de0edd178d6ab71daf56c8fc"
checksum = "95e20d28a9cd13bf00d0ecd1bd073d242242b04f0acb663d7adfc659f8879322"
dependencies = [
"arc-swap",
"lazy_static",
"libc",
"nix 0.23.2",
"thiserror",
@@ -247,9 +248,9 @@ dependencies = [
[[package]]
name = "dbs-boot"
version = "0.3.1"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a74a8c05a1674d3032e610b4f201c7440c345559bad3dfe6b455ce195785108"
checksum = "5466a92f75aa928a9103dcb2088f6d1638ef9da8945fad7389a73864dfa0182c"
dependencies = [
"dbs-arch",
"kvm-bindings",
@@ -300,9 +301,9 @@ dependencies = [
[[package]]
name = "dbs-upcall"
version = "0.2.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "699e62afa444ae4b00d474fd91bc37785ba050acdfbe179731c81898e32efc3f"
checksum = "ea3a78128fd0be8b8b10257675c262b378dc5d00b1e18157736a6c27e45ce4fb"
dependencies = [
"anyhow",
"dbs-utils",
@@ -330,9 +331,9 @@ dependencies = [
[[package]]
name = "dbs-virtio-devices"
version = "0.2.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88e5c6c48b766afb95851b04b6b193871a59d0b2a3ed19990d4f8f651ae5c668"
checksum = "24d671cc3e5f98b84ef6b6bed007d28f72f16d3aea8eb38e2d42b00b2973c1d8"
dependencies = [
"byteorder",
"caps",
@@ -346,7 +347,7 @@ dependencies = [
"kvm-ioctls",
"libc",
"log",
"nix 0.23.2",
"nix 0.24.3",
"nydus-api",
"nydus-blobfs",
"nydus-rafs",
@@ -444,13 +445,13 @@ dependencies = [
[[package]]
name = "errno"
version = "0.2.8"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a"
dependencies = [
"errno-dragonfly",
"libc",
"winapi",
"windows-sys 0.48.0",
]
[[package]]
@@ -482,7 +483,7 @@ dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"windows-sys",
"windows-sys 0.45.0",
]
[[package]]
@@ -618,9 +619,9 @@ dependencies = [
[[package]]
name = "getrandom"
version = "0.2.8"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
dependencies = [
"cfg-if",
"libc",
@@ -670,7 +671,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
dependencies = [
"libc",
"windows-sys",
"windows-sys 0.45.0",
]
[[package]]
@@ -841,7 +842,7 @@ dependencies = [
"libc",
"log",
"wasi",
"windows-sys",
"windows-sys 0.45.0",
]
[[package]]
@@ -1044,7 +1045,7 @@ dependencies = [
"libc",
"redox_syscall",
"smallvec",
"windows-sys",
"windows-sys 0.45.0",
]
[[package]]
@@ -1120,16 +1121,16 @@ checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342"
[[package]]
name = "rustix"
version = "0.36.8"
version = "0.36.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62"
dependencies = [
"bitflags",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"windows-sys",
"windows-sys 0.45.0",
]
[[package]]
@@ -1167,18 +1168,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.152"
version = "1.0.156"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
checksum = "314b5b092c0ade17c00142951e50ced110ec27cea304b1037c6969246c2469a4"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.152"
version = "1.0.156"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
checksum = "d7e29c4601e36bcec74a223228dce795f4cd3616341a4af93520ca1a837c087d"
dependencies = [
"proc-macro2",
"quote",
@@ -1187,9 +1188,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.93"
version = "1.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cad406b69c91885b5107daf2c29572f6c8cdb3c66826821e286c533490c0bc76"
checksum = "057d394a50403bcac12672b2b18fb387ab6d289d957dab67dd201875391e52f1"
dependencies = [
"itoa",
"ryu",
@@ -1422,7 +1423,7 @@ dependencies = [
"pin-project-lite",
"socket2",
"tokio-macros",
"windows-sys",
"windows-sys 0.45.0",
]
[[package]]
@@ -1610,7 +1611,16 @@ version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [
"windows-targets",
"windows-targets 0.42.1",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.0",
]
[[package]]
@@ -1619,13 +1629,28 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_aarch64_gnullvm 0.42.1",
"windows_aarch64_msvc 0.42.1",
"windows_i686_gnu 0.42.1",
"windows_i686_msvc 0.42.1",
"windows_x86_64_gnu 0.42.1",
"windows_x86_64_gnullvm 0.42.1",
"windows_x86_64_msvc 0.42.1",
]
[[package]]
name = "windows-targets"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
dependencies = [
"windows_aarch64_gnullvm 0.48.0",
"windows_aarch64_msvc 0.48.0",
"windows_i686_gnu 0.48.0",
"windows_i686_msvc 0.48.0",
"windows_x86_64_gnu 0.48.0",
"windows_x86_64_gnullvm 0.48.0",
"windows_x86_64_msvc 0.48.0",
]
[[package]]
@@ -1634,42 +1659,84 @@ version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
[[package]]
name = "windows_i686_gnu"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
[[package]]
name = "windows_i686_msvc"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
[[package]]
name = "xattr"
version = "0.2.3"

View File

@@ -12,16 +12,16 @@ edition = "2018"
[dependencies]
arc-swap = "1.5.0"
bytes = "1.1.0"
dbs-address-space = "0.2.0"
dbs-address-space = "0.3.0"
dbs-allocator = "0.1.0"
dbs-arch = "0.2.0"
dbs-boot = "0.3.0"
dbs-boot = "0.4.0"
dbs-device = "0.2.0"
dbs-interrupt = { version = "0.2.0", features = ["kvm-irq"] }
dbs-legacy-devices = "0.1.0"
dbs-upcall = { version = "0.2.0", optional = true }
dbs-upcall = { version = "0.3.0", optional = true }
dbs-utils = "0.2.0"
dbs-virtio-devices = { version = "0.2.0", optional = true, features = ["virtio-mmio"] }
dbs-virtio-devices = { version = "0.3.1", optional = true, features = ["virtio-mmio"] }
kvm-bindings = "0.6.0"
kvm-ioctls = "0.12.0"
lazy_static = "1.2"
@@ -55,3 +55,5 @@ virtio-blk = ["dbs-virtio-devices/virtio-blk", "virtio-queue"]
virtio-net = ["dbs-virtio-devices/virtio-net", "virtio-queue"]
# virtio-fs only works on atomic-guest-memory
virtio-fs = ["dbs-virtio-devices/virtio-fs", "virtio-queue", "atomic-guest-memory"]
virtio-mem = ["dbs-virtio-devices/virtio-mem", "virtio-queue", "atomic-guest-memory"]
virtio-balloon = ["dbs-virtio-devices/virtio-balloon", "virtio-queue"]

View File

@@ -19,6 +19,8 @@ use crate::vmm::Vmm;
use self::VmConfigError::*;
use self::VmmActionError::MachineConfig;
#[cfg(feature = "virtio-balloon")]
pub use crate::device_manager::balloon_dev_mgr::{BalloonDeviceConfigInfo, BalloonDeviceError};
#[cfg(feature = "virtio-blk")]
pub use crate::device_manager::blk_dev_mgr::{
BlockDeviceConfigInfo, BlockDeviceConfigUpdateInfo, BlockDeviceError, BlockDeviceMgr,
@@ -27,6 +29,8 @@ pub use crate::device_manager::blk_dev_mgr::{
pub use crate::device_manager::fs_dev_mgr::{
FsDeviceConfigInfo, FsDeviceConfigUpdateInfo, FsDeviceError, FsDeviceMgr, FsMountConfigInfo,
};
#[cfg(feature = "virtio-mem")]
pub use crate::device_manager::mem_dev_mgr::{MemDeviceConfigInfo, MemDeviceError};
#[cfg(feature = "virtio-net")]
pub use crate::device_manager::virtio_net_dev_mgr::{
VirtioNetDeviceConfigInfo, VirtioNetDeviceConfigUpdateInfo, VirtioNetDeviceError,
@@ -34,7 +38,6 @@ pub use crate::device_manager::virtio_net_dev_mgr::{
};
#[cfg(feature = "virtio-vsock")]
pub use crate::device_manager::vsock_dev_mgr::{VsockDeviceConfigInfo, VsockDeviceError};
#[cfg(feature = "hotplug")]
pub use crate::vcpu::{VcpuResizeError, VcpuResizeInfo};
@@ -97,6 +100,20 @@ pub enum VmmActionError {
/// The action `ResizeVcpu` Failed
#[error("vcpu resize error : {0}")]
ResizeVcpu(#[source] VcpuResizeError),
/// Cannot access address space.
#[error("Cannot access address space.")]
AddressSpaceNotInitialized,
#[cfg(feature = "virtio-mem")]
/// Mem device related errors.
#[error("virtio-mem device error: {0}")]
Mem(#[source] MemDeviceError),
#[cfg(feature = "virtio-balloon")]
/// Balloon device related errors.
#[error("virtio-balloon device error: {0}")]
Balloon(#[source] BalloonDeviceError),
}
/// This enum represents the public interface of the VMM. Each action contains various
@@ -172,6 +189,15 @@ pub enum VmmAction {
#[cfg(feature = "hotplug")]
/// Resize Vcpu number in the guest.
ResizeVcpu(VcpuResizeInfo),
#[cfg(feature = "virtio-mem")]
/// Add a new mem device or update one that already exists using the `MemDeviceConfigInfo` as input.
InsertMemDevice(MemDeviceConfigInfo),
#[cfg(feature = "virtio-balloon")]
/// Add a new balloon device or update one that already exists using the `BalloonDeviceConfigInfo`
/// as input.
InsertBalloonDevice(BalloonDeviceConfigInfo),
}
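To make the new API surface concrete, here is a minimal sketch of how a control-plane caller might build the two new hotplug actions. The import path and all field values are assumptions for illustration (and the `virtio-mem`/`virtio-balloon` features must be enabled); the struct fields match the config types added elsewhere in this PR:

```rust
// Sketch only: construct the new hotplug requests. The module path below is
// an assumed re-export location, not confirmed by this diff.
use dragonball::api::v1::{BalloonDeviceConfigInfo, MemDeviceConfigInfo, VmmAction};

fn hotplug_requests() -> Vec<VmmAction> {
    let balloon = BalloonDeviceConfigInfo {
        balloon_id: "balloon0".to_string(),
        size_mib: 128,        // example target balloon size
        use_shared_irq: None, // fall back to the manager defaults
        use_generic_irq: None,
        f_deflate_on_oom: true,
        f_reporting: false,
    };
    let mem = MemDeviceConfigInfo {
        mem_id: "mem0".to_string(),
        size_mib: 512,      // requested size
        capacity_mib: 2048, // upper bound for later resizes
        multi_region: true,
        host_numa_node_id: None,
        guest_numa_node_id: None,
        use_shared_irq: None,
        use_generic_irq: None,
    };
    vec![
        VmmAction::InsertBalloonDevice(balloon),
        VmmAction::InsertMemDevice(mem),
    ]
}
```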
/// The enum represents the response sent by the VMM in case of success. The response is either
@@ -274,6 +300,12 @@ impl VmmService {
}
#[cfg(feature = "hotplug")]
VmmAction::ResizeVcpu(vcpu_resize_cfg) => self.resize_vcpu(vmm, vcpu_resize_cfg),
#[cfg(feature = "virtio-mem")]
VmmAction::InsertMemDevice(mem_cfg) => self.add_mem_device(vmm, event_mgr, mem_cfg),
#[cfg(feature = "virtio-balloon")]
VmmAction::InsertBalloonDevice(balloon_cfg) => {
self.add_balloon_device(vmm, event_mgr, balloon_cfg)
}
};
debug!("send vmm response: {:?}", response);
@@ -626,12 +658,6 @@ impl VmmService {
#[cfg(feature = "hotplug")]
fn resize_vcpu(&mut self, vmm: &mut Vmm, config: VcpuResizeInfo) -> VmmRequestResult {
if !cfg!(target_arch = "x86_64") {
// TODO: Arm need to support vcpu hotplug. issue: #6010
warn!("This arch do not support vm resize!");
return Ok(VmmData::Empty);
}
if !cfg!(feature = "dbs-upcall") {
warn!("We only support cpu resize through upcall server in the guest kernel now, please enable dbs-upcall feature.");
return Ok(VmmData::Empty);
@@ -654,6 +680,62 @@ impl VmmService {
Ok(VmmData::Empty)
}
#[cfg(feature = "virtio-mem")]
fn add_mem_device(
&mut self,
vmm: &mut Vmm,
event_mgr: &mut EventManager,
config: MemDeviceConfigInfo,
) -> VmmRequestResult {
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
let ctx = vm
.create_device_op_context(Some(event_mgr.epoll_manager()))
.map_err(|e| {
if let StartMicroVmError::UpcallServerNotReady = e {
VmmActionError::UpcallServerNotReady
} else {
VmmActionError::StartMicroVm(e)
}
})?;
vm.device_manager_mut()
.mem_manager
.insert_or_update_device(ctx, config)
.map(|_| VmmData::Empty)
.map_err(VmmActionError::Mem)
}
#[cfg(feature = "virtio-balloon")]
fn add_balloon_device(
&mut self,
vmm: &mut Vmm,
event_mgr: &mut EventManager,
config: BalloonDeviceConfigInfo,
) -> VmmRequestResult {
let vm = vmm.get_vm_mut().ok_or(VmmActionError::InvalidVMID)?;
if config.size_mib != 0 {
info!("add_balloon_device: wait prealloc");
vm.stop_prealloc().map_err(VmmActionError::StartMicroVm)?;
}
let ctx = vm
.create_device_op_context(Some(event_mgr.epoll_manager()))
.map_err(|e| {
if let StartMicroVmError::UpcallServerNotReady = e {
VmmActionError::UpcallServerNotReady
} else {
VmmActionError::StartMicroVm(e)
}
})?;
vm.device_manager_mut()
.balloon_manager
.insert_or_update_device(ctx, config)
.map(|_| VmmData::Empty)
.map_err(VmmActionError::Balloon)
}
}
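Both new handlers repeat the same closure that maps `StartMicroVmError::UpcallServerNotReady` to `VmmActionError::UpcallServerNotReady` and wraps everything else in `StartMicroVm`. If this pattern spreads further, it could be factored into a helper; a possible sketch (hypothetical, living inside this module, not part of this PR):

```rust
// Hypothetical helper, not in this PR: shared error mapping for the
// create_device_op_context call sites in add_mem_device/add_balloon_device.
fn map_device_op_error(e: StartMicroVmError) -> VmmActionError {
    match e {
        StartMicroVmError::UpcallServerNotReady => VmmActionError::UpcallServerNotReady,
        other => VmmActionError::StartMicroVm(other),
    }
}
```

The call sites would then shrink to `.map_err(map_device_op_error)?`.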
fn handle_cpu_topology(
@@ -1462,4 +1544,84 @@ mod tests {
t.check_request();
}
}
#[cfg(feature = "virtio-mem")]
#[test]
fn test_vmm_action_insert_mem_device() {
skip_if_not_root!();
let tests = &mut [
// hotplug unready
TestData::new(
VmmAction::InsertMemDevice(MemDeviceConfigInfo::default()),
InstanceState::Running,
&|result| {
assert!(matches!(
result,
Err(VmmActionError::StartMicroVm(
StartMicroVmError::UpcallMissVsock
))
));
let err_string = format!("{}", result.unwrap_err());
let expected_err = String::from(
"failed to boot the VM: \
the upcall client needs a virtio-vsock device for communication",
);
assert_eq!(err_string, expected_err);
},
),
// success
TestData::new(
VmmAction::InsertMemDevice(MemDeviceConfigInfo::default()),
InstanceState::Uninitialized,
&|result| {
assert!(result.is_ok());
},
),
];
for t in tests.iter_mut() {
t.check_request();
}
}
#[cfg(feature = "virtio-balloon")]
#[test]
fn test_vmm_action_insert_balloon_device() {
skip_if_not_root!();
let tests = &mut [
// hotplug unready
TestData::new(
VmmAction::InsertBalloonDevice(BalloonDeviceConfigInfo::default()),
InstanceState::Running,
&|result| {
assert!(matches!(
result,
Err(VmmActionError::StartMicroVm(
StartMicroVmError::UpcallMissVsock
))
));
let err_string = format!("{}", result.unwrap_err());
let expected_err = String::from(
"failed to boot the VM: \
the upcall client needs a virtio-vsock device for communication",
);
assert_eq!(err_string, expected_err);
},
),
// success
TestData::new(
VmmAction::InsertBalloonDevice(BalloonDeviceConfigInfo::default()),
InstanceState::Uninitialized,
&|result| {
assert!(result.is_ok());
},
),
];
for t in tests.iter_mut() {
t.check_request();
}
}
}

View File

@@ -0,0 +1,419 @@
// Copyright 2020 Alibaba Cloud. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use dbs_virtio_devices as virtio;
use serde_derive::{Deserialize, Serialize};
use slog::{error, info};
use virtio::balloon::{Balloon, BalloonConfig};
use virtio::Error as VirtIoError;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::config_manager::{ConfigItem, DeviceConfigInfo, DeviceConfigInfos};
use crate::device_manager::DbsMmioV2Device;
use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext};
// The flag of whether to use the shared irq.
const USE_SHARED_IRQ: bool = true;
// The flag of whether to use the generic irq.
const USE_GENERIC_IRQ: bool = false;
/// Errors associated with `BalloonDeviceConfig`.
#[derive(Debug, thiserror::Error)]
pub enum BalloonDeviceError {
/// The balloon device was already used.
#[error("the virtio-balloon ID was already added to a different device")]
BalloonDeviceAlreadyExists,
/// Cannot perform the requested operation after booting the microVM.
#[error("the update operation is not allowed after boot")]
UpdateNotAllowedPostBoot,
/// guest memory error
#[error("failed to access guest memory, {0}")]
GuestMemoryError(#[source] vm_memory::mmap::Error),
/// create balloon device error
#[error("failed to create virtio-balloon device, {0}")]
CreateBalloonDevice(#[source] virtio::Error),
/// hotplug balloon device error
#[error("cannot hotplug virtio-balloon device, {0}")]
HotplugDeviceFailed(#[source] DeviceMgrError),
/// create mmio device error
#[error("cannot create virtio-balloon mmio device, {0}")]
CreateMmioDevice(#[source] DeviceMgrError),
/// Cannot initialize a balloon device or add a device to the MMIO Bus.
#[error("failure while registering balloon device: {0}")]
RegisterBalloonDevice(#[source] DeviceMgrError),
/// resize balloon device error
#[error("failure while resizing virtio-balloon device, {0}")]
ResizeFailed(#[source] VirtIoError),
/// The balloon device id doesn't exist.
#[error("invalid balloon device id '{0}'")]
InvalidDeviceId(String),
/// balloon device does not exist
#[error("balloon device does not exist")]
NotExist,
/// The device manager errors.
#[error("DeviceManager error: {0}")]
DeviceManager(#[source] DeviceMgrError),
}
/// Configuration information for a virtio-balloon device.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub struct BalloonDeviceConfigInfo {
/// Unique identifier of the balloon device
pub balloon_id: String,
/// Resize balloon size in mib
pub size_mib: u64,
/// Use shared irq
pub use_shared_irq: Option<bool>,
/// Use generic irq
pub use_generic_irq: Option<bool>,
/// VIRTIO_BALLOON_F_DEFLATE_ON_OOM
pub f_deflate_on_oom: bool,
/// VIRTIO_BALLOON_F_REPORTING
pub f_reporting: bool,
}
impl ConfigItem for BalloonDeviceConfigInfo {
type Err = BalloonDeviceError;
fn id(&self) -> &str {
&self.balloon_id
}
fn check_conflicts(&self, other: &Self) -> Result<(), BalloonDeviceError> {
if self.balloon_id.as_str() == other.balloon_id.as_str() {
Err(BalloonDeviceError::BalloonDeviceAlreadyExists)
} else {
Ok(())
}
}
}
/// Balloon Device Info
pub type BalloonDeviceInfo = DeviceConfigInfo<BalloonDeviceConfigInfo>;
impl ConfigItem for BalloonDeviceInfo {
type Err = BalloonDeviceError;
fn id(&self) -> &str {
&self.config.balloon_id
}
fn check_conflicts(&self, other: &Self) -> Result<(), BalloonDeviceError> {
if self.config.balloon_id.as_str() == other.config.balloon_id.as_str() {
Err(BalloonDeviceError::BalloonDeviceAlreadyExists)
} else {
Ok(())
}
}
}
/// Wrapper for the collection that holds all the Balloon Devices Configs
#[derive(Clone)]
pub struct BalloonDeviceMgr {
/// A list of `BalloonDeviceConfig` objects.
info_list: DeviceConfigInfos<BalloonDeviceConfigInfo>,
pub(crate) use_shared_irq: bool,
}
impl BalloonDeviceMgr {
/// Inserts `balloon_cfg` in the virtio-balloon device configuration list.
/// If an entry with the same id already exists, it will attempt to update
/// the existing entry.
pub fn insert_or_update_device(
&mut self,
mut ctx: DeviceOpContext,
balloon_cfg: BalloonDeviceConfigInfo,
) -> std::result::Result<(), BalloonDeviceError> {
if !cfg!(feature = "hotplug") && ctx.is_hotplug {
error!(ctx.logger(), "hotplug feature has been disabled.";
"subsystem" => "balloon_dev_mgr",);
return Err(BalloonDeviceError::UpdateNotAllowedPostBoot);
}
let epoll_mgr = ctx
.get_epoll_mgr()
.map_err(BalloonDeviceError::DeviceManager)?;
// If the id of the device already exists in the list, the operation is an update.
if let Some(index) = self.get_index_of_balloon_dev(&balloon_cfg.balloon_id) {
// Update an existing balloon device
if ctx.is_hotplug {
info!(ctx.logger(), "resize virtio balloon size to {:?}", balloon_cfg.size_mib; "subsystem" => "balloon_dev_mgr");
self.update_balloon_size(index, balloon_cfg.size_mib)?;
}
self.info_list.insert_or_update(&balloon_cfg)?;
} else {
// Create a new balloon device
if !self.info_list.is_empty() {
error!(ctx.logger(), "only support one balloon device!"; "subsystem" => "balloon_dev_mgr");
return Err(BalloonDeviceError::BalloonDeviceAlreadyExists);
}
if !ctx.is_hotplug {
self.info_list.insert_or_update(&balloon_cfg)?;
return Ok(());
}
info!(ctx.logger(), "hotplug balloon device: {}", balloon_cfg.balloon_id; "subsystem" => "balloon_dev_mgr");
let device = Box::new(
virtio::balloon::Balloon::new(
epoll_mgr,
BalloonConfig {
f_deflate_on_oom: balloon_cfg.f_deflate_on_oom,
f_reporting: balloon_cfg.f_reporting,
},
)
.map_err(BalloonDeviceError::CreateBalloonDevice)?,
);
let mmio_dev =
DeviceManager::create_mmio_virtio_device_with_device_change_notification(
device,
&mut ctx,
balloon_cfg.use_shared_irq.unwrap_or(self.use_shared_irq),
balloon_cfg.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
)
.map_err(BalloonDeviceError::CreateMmioDevice)?;
ctx.insert_hotplug_mmio_device(&mmio_dev, None)
.map_err(|e| {
error!(
ctx.logger(),
"hotplug balloon device {} error: {}",
&balloon_cfg.balloon_id, e;
"subsystem" => "balloon_dev_mgr"
);
BalloonDeviceError::HotplugDeviceFailed(e)
})?;
let index = self.info_list.insert_or_update(&balloon_cfg)?;
self.info_list[index].set_device(mmio_dev);
}
Ok(())
}
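The branching in `insert_or_update_device` is easy to misread, so the following self-contained sketch restates it as a small decision function (illustrative only). Note also the constraint enforced above: only one balloon device is supported, so a second, different id is rejected.

```rust
// Illustrative restatement of the balloon insert/update flow, reduced to the
// two booleans that drive it.
#[derive(Debug, PartialEq)]
enum BalloonOp {
    UpdateExisting,  // known id + hotplug context: resize the live device
    StoreConfigOnly, // any id before boot: just record the config
    HotplugNew,      // new id + hotplug context: create and attach the device
}

fn balloon_op(id_already_known: bool, is_hotplug: bool) -> BalloonOp {
    match (id_already_known, is_hotplug) {
        (true, true) => BalloonOp::UpdateExisting,
        (_, false) => BalloonOp::StoreConfigOnly,
        (false, true) => BalloonOp::HotplugNew,
    }
}

fn main() {
    assert_eq!(balloon_op(true, true), BalloonOp::UpdateExisting);
    assert_eq!(balloon_op(false, false), BalloonOp::StoreConfigOnly);
    assert_eq!(balloon_op(false, true), BalloonOp::HotplugNew);
}
```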
/// Attaches all virtio-balloon devices from the BalloonDevicesConfig.
pub fn attach_devices(
&mut self,
ctx: &mut DeviceOpContext,
) -> std::result::Result<(), BalloonDeviceError> {
let epoll_mgr = ctx
.get_epoll_mgr()
.map_err(BalloonDeviceError::DeviceManager)?;
for info in self.info_list.iter_mut() {
info!(ctx.logger(), "attach balloon device: {}", info.config.balloon_id; "subsystem" => "balloon_dev_mgr");
let device = Balloon::new(
epoll_mgr.clone(),
BalloonConfig {
f_deflate_on_oom: info.config.f_deflate_on_oom,
f_reporting: info.config.f_reporting,
},
)
.map_err(BalloonDeviceError::CreateBalloonDevice)?;
let mmio_dev =
DeviceManager::create_mmio_virtio_device_with_device_change_notification(
Box::new(device),
ctx,
info.config.use_shared_irq.unwrap_or(self.use_shared_irq),
info.config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
)
.map_err(BalloonDeviceError::RegisterBalloonDevice)?;
info.set_device(mmio_dev);
}
Ok(())
}
fn update_balloon_size(
&self,
index: usize,
size_mib: u64,
) -> std::result::Result<(), BalloonDeviceError> {
let device = self.info_list[index]
.device
.as_ref()
.ok_or_else(|| BalloonDeviceError::NotExist)?;
if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
let guard = mmio_dev.state();
let inner_dev = guard.get_inner_device();
if let Some(balloon_dev) = inner_dev
.as_any()
.downcast_ref::<Balloon<GuestAddressSpaceImpl>>()
{
return balloon_dev
.set_size(size_mib)
.map_err(BalloonDeviceError::ResizeFailed);
}
}
Ok(())
}
fn get_index_of_balloon_dev(&self, balloon_id: &str) -> Option<usize> {
self.info_list
.iter()
.position(|info| info.config.balloon_id.eq(balloon_id))
}
}
impl Default for BalloonDeviceMgr {
/// Create a new `BalloonDeviceMgr` object.
fn default() -> Self {
BalloonDeviceMgr {
info_list: DeviceConfigInfos::new(),
use_shared_irq: USE_SHARED_IRQ,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::test_utils::tests::create_vm_for_test;
impl Default for BalloonDeviceConfigInfo {
fn default() -> Self {
BalloonDeviceConfigInfo {
balloon_id: "".to_string(),
size_mib: 0,
use_generic_irq: None,
use_shared_irq: None,
f_deflate_on_oom: false,
f_reporting: false,
}
}
}
#[test]
fn test_balloon_config_check_conflicts() {
let config = BalloonDeviceConfigInfo::default();
let mut config2 = BalloonDeviceConfigInfo::default();
assert!(config.check_conflicts(&config2).is_err());
config2.balloon_id = "dummy_balloon".to_string();
assert!(config.check_conflicts(&config2).is_ok());
}
#[test]
fn test_create_balloon_devices_configs() {
let mgr = BalloonDeviceMgr::default();
assert_eq!(mgr.info_list.len(), 0);
assert_eq!(mgr.get_index_of_balloon_dev(""), None);
}
#[test]
fn test_balloon_insert_or_update_device() {
//Init vm for test.
let mut vm = create_vm_for_test();
// Test for standard config
let device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
let dummy_balloon_device = BalloonDeviceConfigInfo::default();
vm.device_manager_mut()
.balloon_manager
.insert_or_update_device(device_op_ctx, dummy_balloon_device)
.unwrap();
assert_eq!(vm.device_manager().balloon_manager.info_list.len(), 1);
}
#[test]
fn test_balloon_attach_device() {
//Init vm and insert balloon config for test.
let mut vm = create_vm_for_test();
let device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
let dummy_balloon_device = BalloonDeviceConfigInfo::default();
vm.device_manager_mut()
.balloon_manager
.insert_or_update_device(device_op_ctx, dummy_balloon_device)
.unwrap();
assert_eq!(vm.device_manager().balloon_manager.info_list.len(), 1);
// Test for standard config
let mut device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
assert!(vm
.device_manager_mut()
.balloon_manager
.attach_devices(&mut device_op_ctx)
.is_ok());
assert_eq!(vm.device_manager().balloon_manager.info_list.len(), 1);
}
#[test]
fn test_balloon_update_device() {
//Init vm for test.
let mut vm = create_vm_for_test();
let device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
let dummy_balloon_device = BalloonDeviceConfigInfo::default();
vm.device_manager_mut()
.balloon_manager
.insert_or_update_device(device_op_ctx, dummy_balloon_device)
.unwrap();
assert_eq!(vm.device_manager().balloon_manager.info_list.len(), 1);
let mut device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
assert!(vm
.device_manager_mut()
.balloon_manager
.attach_devices(&mut device_op_ctx)
.is_ok());
assert_eq!(vm.device_manager().balloon_manager.info_list.len(), 1);
assert!(vm
.device_manager()
.balloon_manager
.update_balloon_size(0, 200)
.is_ok());
}
}

View File

@@ -871,6 +871,8 @@ mod tests {
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
let dummy_file = TempFile::new().unwrap();
@@ -907,6 +909,8 @@ mod tests {
Some(vm.vm_as().unwrap().clone()),
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
vm.device_manager_mut()

View File

@@ -0,0 +1,733 @@
// Copyright 2020 Alibaba Cloud. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use std::io;
use std::sync::{Arc, Mutex};
use dbs_address_space::{
AddressSpace, AddressSpaceError, AddressSpaceRegion, MPOL_MF_MOVE, MPOL_PREFERRED, USABLE_END,
};
use dbs_utils::epoll_manager::EpollManager;
use dbs_virtio_devices as virtio;
use kvm_bindings::kvm_userspace_memory_region;
use kvm_ioctls::VmFd;
use nix::sys::mman;
use serde_derive::{Deserialize, Serialize};
use slog::{debug, error, info, warn};
use virtio::mem::{Mem, MemRegionFactory};
use virtio::Error as VirtIoError;
use vm_memory::{
Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestRegionMmap, GuestUsize, MmapRegion,
};
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::config_manager::{ConfigItem, DeviceConfigInfo, DeviceConfigInfos};
use crate::device_manager::DbsMmioV2Device;
use crate::device_manager::{DeviceManager, DeviceMgrError, DeviceOpContext};
use crate::vm::VmConfigInfo;
// The flag of whether to use the shared irq.
const USE_SHARED_IRQ: bool = true;
// The flag of whether to use the generic irq.
const USE_GENERIC_IRQ: bool = false;
const HUGE_PAGE_2M: usize = 0x200000;
// max numa node ids on host
const MAX_NODE: u32 = 64;
/// Errors associated with `MemDeviceConfig`.
#[derive(Debug, thiserror::Error)]
pub enum MemDeviceError {
/// The mem device was already used.
#[error("the virtio-mem ID was already added to a different device")]
MemDeviceAlreadyExists,
/// Cannot perform the requested operation after booting the microVM.
#[error("the update operation is not allowed after boot")]
UpdateNotAllowedPostBoot,
/// insert mem device error
#[error("cannot add virtio-mem device, {0}")]
InsertDeviceFailed(#[source] DeviceMgrError),
/// create mem device error
#[error("cannot create virito-mem device, {0}")]
CreateMemDevice(#[source] DeviceMgrError),
/// create mmio device error
#[error("cannot create virito-mem mmio device, {0}")]
CreateMmioDevice(#[source] DeviceMgrError),
/// resize mem device error
#[error("failure while resizing virtio-mem device, {0}")]
ResizeFailed(#[source] VirtIoError),
/// mem device does not exist
#[error("mem device does not exist")]
DeviceNotExist,
/// address space region error
#[error("address space region error, {0}")]
AddressSpaceRegion(#[source] AddressSpaceError),
/// Cannot initialize a mem device or add a device to the MMIO Bus.
#[error("failure while registering mem device: {0}")]
RegisterMemDevice(#[source] DeviceMgrError),
/// The mem device id doesn't exist.
#[error("invalid mem device id '{0}'")]
InvalidDeviceId(String),
/// The device manager errors.
#[error("DeviceManager error: {0}")]
DeviceManager(#[source] DeviceMgrError),
}
/// Configuration information for a virtio-mem device.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub struct MemDeviceConfigInfo {
/// Unique identifier of the mem device
pub mem_id: String,
/// Memory size mib
pub size_mib: u64,
/// Memory capacity mib
pub capacity_mib: u64,
/// Use multi_region or not
pub multi_region: bool,
/// host numa node id
pub host_numa_node_id: Option<u32>,
/// guest numa node id
pub guest_numa_node_id: Option<u16>,
/// Use shared irq
pub use_shared_irq: Option<bool>,
/// Use generic irq
pub use_generic_irq: Option<bool>,
}
impl ConfigItem for MemDeviceConfigInfo {
type Err = MemDeviceError;
fn id(&self) -> &str {
&self.mem_id
}
fn check_conflicts(&self, other: &Self) -> Result<(), MemDeviceError> {
if self.mem_id.as_str() == other.mem_id.as_str() {
Err(MemDeviceError::MemDeviceAlreadyExists)
} else {
Ok(())
}
}
}
/// Mem Device Info
pub type MemDeviceInfo = DeviceConfigInfo<MemDeviceConfigInfo>;
impl ConfigItem for MemDeviceInfo {
type Err = MemDeviceError;
fn id(&self) -> &str {
&self.config.mem_id
}
fn check_conflicts(&self, other: &Self) -> Result<(), MemDeviceError> {
if self.config.mem_id.as_str() == other.config.mem_id.as_str() {
Err(MemDeviceError::MemDeviceAlreadyExists)
} else {
Ok(())
}
}
}
/// Wrapper for the collection that holds all the Mem Devices Configs
#[derive(Clone)]
pub struct MemDeviceMgr {
/// A list of `MemDeviceConfig` objects.
info_list: DeviceConfigInfos<MemDeviceConfigInfo>,
pub(crate) use_shared_irq: bool,
}
impl MemDeviceMgr {
/// Inserts `mem_cfg` in the virtio-mem device configuration list.
/// If an entry with the same id already exists, it will attempt to update
/// the existing entry.
pub fn insert_or_update_device(
&mut self,
mut ctx: DeviceOpContext,
mem_cfg: MemDeviceConfigInfo,
) -> std::result::Result<(), MemDeviceError> {
if !cfg!(feature = "hotplug") && ctx.is_hotplug {
error!(ctx.logger(), "hotplug feature has been disabled.";
"subsystem" => "virito-mem");
return Err(MemDeviceError::UpdateNotAllowedPostBoot);
}
let epoll_mgr = ctx.get_epoll_mgr().map_err(MemDeviceError::DeviceManager)?;
// If the id of the device already exists in the list, the operation is an update.
if let Some(index) = self.get_index_of_mem_dev(&mem_cfg.mem_id) {
// Update an existing memory device
if ctx.is_hotplug {
info!(
ctx.logger(),
"update memory device: {}, size: 0x{:x}MB.",
mem_cfg.mem_id,
mem_cfg.size_mib;
"subsystem" => "virito-mem"
);
self.update_memory_size(index, mem_cfg.size_mib)?;
}
self.info_list.insert_or_update(&mem_cfg)?;
} else {
// Create a new memory device
if !ctx.is_hotplug {
self.info_list.insert_or_update(&mem_cfg)?;
return Ok(());
}
info!(
ctx.logger(),
"hot-add memory device: {}, size: 0x{:x}MB.", mem_cfg.mem_id, mem_cfg.size_mib;
"subsystem" => "virito-mem"
);
let device = Self::create_memory_device(&mem_cfg, &ctx, &epoll_mgr)
.map_err(MemDeviceError::CreateMemDevice)?;
let mmio_device =
DeviceManager::create_mmio_virtio_device_with_device_change_notification(
Box::new(device),
&mut ctx,
mem_cfg.use_shared_irq.unwrap_or(self.use_shared_irq),
mem_cfg.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
)
.map_err(MemDeviceError::CreateMmioDevice)?;
#[cfg(not(test))]
ctx.insert_hotplug_mmio_device(&mmio_device, None)
.map_err(|e| {
error!(
ctx.logger(),
"failed to hot-add virtio-mem device {}, {}", &mem_cfg.mem_id, e;
"subsystem" => "virito-mem"
);
MemDeviceError::InsertDeviceFailed(e)
})?;
let index = self.info_list.insert_or_update(&mem_cfg)?;
self.info_list[index].set_device(mmio_device);
}
Ok(())
}
/// Attaches all virtio-mem devices from the MemDevicesConfig.
pub fn attach_devices(
&mut self,
ctx: &mut DeviceOpContext,
) -> std::result::Result<(), MemDeviceError> {
let epoll_mgr = ctx.get_epoll_mgr().map_err(MemDeviceError::DeviceManager)?;
for info in self.info_list.iter_mut() {
let config = &info.config;
info!(
ctx.logger(),
"attach virtio-mem device {}, size 0x{:x}.", config.mem_id, config.size_mib;
"subsystem" => "virito-mem"
);
// Ignore virtio-mem devices with zero requested size.
if config.size_mib == 0 {
debug!(
ctx.logger(),
"ignore zero-sizing memory device {}.", config.mem_id;
"subsystem" => "virito-mem"
);
continue;
}
let device = Self::create_memory_device(config, ctx, &epoll_mgr)
.map_err(MemDeviceError::CreateMemDevice)?;
let mmio_device =
DeviceManager::create_mmio_virtio_device_with_device_change_notification(
Box::new(device),
ctx,
config.use_shared_irq.unwrap_or(self.use_shared_irq),
config.use_generic_irq.unwrap_or(USE_GENERIC_IRQ),
)
.map_err(MemDeviceError::RegisterMemDevice)?;
info.set_device(mmio_device);
}
Ok(())
}
fn get_index_of_mem_dev(&self, mem_id: &str) -> Option<usize> {
self.info_list
.iter()
.position(|info| info.config.mem_id.eq(mem_id))
}
fn create_memory_device(
config: &MemDeviceConfigInfo,
ctx: &DeviceOpContext,
epoll_mgr: &EpollManager,
) -> std::result::Result<virtio::mem::Mem<GuestAddressSpaceImpl>, DeviceMgrError> {
let factory = Arc::new(Mutex::new(MemoryRegionFactory::new(
ctx,
config.mem_id.clone(),
config.host_numa_node_id,
)?));
let mut capacity_mib = config.capacity_mib;
if capacity_mib == 0 {
capacity_mib = *USABLE_END >> 20;
}
// get the boot memory size to calculate alignment
let boot_mem_size = {
let boot_size = (ctx.get_vm_config()?.mem_size_mib << 20) as u64;
// add 1G of memory to avoid the MMIO hole
match boot_size {
x if x > dbs_boot::layout::MMIO_LOW_START => x + (1 << 30),
_ => boot_size,
}
};
virtio::mem::Mem::new(
config.mem_id.clone(),
capacity_mib,
config.size_mib,
config.multi_region,
config.guest_numa_node_id,
epoll_mgr.clone(),
factory,
boot_mem_size,
)
.map_err(DeviceMgrError::Virtio)
}
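Two details in `create_memory_device` are worth spelling out: a zero `capacity_mib` defaults to the usable address-space end (`*USABLE_END >> 20` MiB), and 1 GiB is added to the boot memory size when guest RAM crosses the low MMIO hole. A standalone arithmetic sketch of the second rule (the `MMIO_LOW_START` value below is assumed purely for illustration):

```rust
// Illustrative arithmetic only; the real MMIO_LOW_START comes from
// dbs_boot::layout and is treated as a parameter here.
fn boot_mem_size(mem_size_mib: u64, mmio_low_start: u64) -> u64 {
    let boot_size = mem_size_mib << 20; // MiB -> bytes
    if boot_size > mmio_low_start {
        // Guest RAM crosses the low MMIO hole, so 1 GiB is added when
        // computing the alignment base for virtio-mem.
        boot_size + (1 << 30)
    } else {
        boot_size
    }
}

fn main() {
    let mmio_low_start = 3u64 << 30; // assumed 3 GiB, illustration only
    assert_eq!(boot_mem_size(1024, mmio_low_start), 1 << 30); // 1 GiB guest: unchanged
    assert_eq!(boot_mem_size(4096, mmio_low_start), 5u64 << 30); // 4 GiB guest: +1 GiB
}
```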
/// Removes all virtio-mem devices
pub fn remove_devices(&self, ctx: &mut DeviceOpContext) -> Result<(), DeviceMgrError> {
for info in self.info_list.iter() {
if let Some(device) = &info.device {
DeviceManager::destroy_mmio_virtio_device(device.clone(), ctx)?;
}
}
Ok(())
}
fn update_memory_size(
&self,
index: usize,
size_mib: u64,
) -> std::result::Result<(), MemDeviceError> {
let device = self.info_list[index]
.device
.as_ref()
.ok_or_else(|| MemDeviceError::DeviceNotExist)?;
if let Some(mmio_dev) = device.as_any().downcast_ref::<DbsMmioV2Device>() {
let guard = mmio_dev.state();
let inner_dev = guard.get_inner_device();
if let Some(mem_dev) = inner_dev
.as_any()
.downcast_ref::<Mem<GuestAddressSpaceImpl>>()
{
return mem_dev
.set_requested_size(size_mib)
.map_err(MemDeviceError::ResizeFailed);
}
}
Ok(())
}
}
impl Default for MemDeviceMgr {
/// Create a new `MemDeviceMgr` object.
fn default() -> Self {
MemDeviceMgr {
info_list: DeviceConfigInfos::new(),
use_shared_irq: USE_SHARED_IRQ,
}
}
}
struct MemoryRegionFactory {
mem_id: String,
vm_as: GuestAddressSpaceImpl,
address_space: AddressSpace,
vm_config: VmConfigInfo,
vm_fd: Arc<VmFd>,
logger: Arc<slog::Logger>,
host_numa_node_id: Option<u32>,
instance_id: String,
}
impl MemoryRegionFactory {
fn new(
ctx: &DeviceOpContext,
mem_id: String,
host_numa_node_id: Option<u32>,
) -> Result<Self, DeviceMgrError> {
let vm_as = ctx.get_vm_as()?;
let address_space = ctx.get_address_space()?;
let vm_config = ctx.get_vm_config()?;
let logger = Arc::new(ctx.logger().new(slog::o!()));
let shared_info = ctx.shared_info.read().unwrap();
let instance_id = shared_info.id.clone();
Ok(MemoryRegionFactory {
mem_id,
vm_as,
address_space,
vm_config,
vm_fd: ctx.vm_fd.clone(),
logger,
host_numa_node_id,
instance_id,
})
}
fn configure_anon_mem(&self, mmap_reg: &MmapRegion) -> Result<(), VirtIoError> {
unsafe {
mman::madvise(
mmap_reg.as_ptr() as *mut libc::c_void,
mmap_reg.size(),
mman::MmapAdvise::MADV_DONTFORK,
)
}
.map_err(VirtIoError::Madvise)?;
Ok(())
}
fn configure_numa(&self, mmap_reg: &MmapRegion, node_id: u32) -> Result<(), VirtIoError> {
let nodemask = 1_u64
.checked_shl(node_id)
.ok_or(VirtIoError::InvalidInput)?;
let res = unsafe {
libc::syscall(
libc::SYS_mbind,
mmap_reg.as_ptr() as *mut libc::c_void,
mmap_reg.size(),
MPOL_PREFERRED,
&nodemask as *const u64,
MAX_NODE,
MPOL_MF_MOVE,
)
};
if res < 0 {
warn!(
self.logger,
"failed to mbind memory to host_numa_node_id {}: this may affect performance",
node_id;
"subsystem" => "virito-mem"
);
}
Ok(())
}
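`configure_numa` builds the `mbind(2)` nodemask by setting a single bit for the requested host node; node ids of 64 or more overflow the 64-bit mask and are rejected as `InvalidInput` (consistent with the `MAX_NODE` limit above). A tiny runnable sketch of that computation:

```rust
// Illustrative only: one bit per allowed NUMA node, as passed to mbind(2).
fn nodemask_for(node_id: u32) -> Option<u64> {
    1u64.checked_shl(node_id) // None if node_id >= 64
}

fn main() {
    assert_eq!(nodemask_for(0), Some(0b1));
    assert_eq!(nodemask_for(3), Some(0b1000));
    assert_eq!(nodemask_for(64), None); // out of range for a 64-bit mask
}
```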
fn configure_thp(&mut self, mmap_reg: &MmapRegion) -> Result<(), VirtIoError> {
debug!(
self.logger,
"Setting MADV_HUGEPAGE on AddressSpaceRegion addr {:x?} len {:x?}",
mmap_reg.as_ptr(),
mmap_reg.size();
"subsystem" => "virito-mem"
);
// Safe because we just created the MmapRegion
unsafe {
mman::madvise(
mmap_reg.as_ptr() as *mut libc::c_void,
mmap_reg.size(),
mman::MmapAdvise::MADV_HUGEPAGE,
)
}
.map_err(VirtIoError::Madvise)?;
Ok(())
}
fn map_to_kvm(
&mut self,
slot: u32,
reg: &Arc<AddressSpaceRegion>,
mmap_reg: &MmapRegion,
) -> Result<(), VirtIoError> {
let host_addr = mmap_reg.as_ptr() as u64;
let flags = 0u32;
let mem_region = kvm_userspace_memory_region {
slot,
guest_phys_addr: reg.start_addr().raw_value(),
memory_size: reg.len(),
userspace_addr: host_addr,
flags,
};
// Safe because the user memory region has just been created, and the KVM slot
// is allocated by the resource allocator.
unsafe { self.vm_fd.set_user_memory_region(mem_region) }
.map_err(VirtIoError::SetUserMemoryRegion)?;
Ok(())
}
}
impl MemRegionFactory for MemoryRegionFactory {
fn create_region(
&mut self,
guest_addr: GuestAddress,
region_len: GuestUsize,
kvm_slot: u32,
) -> std::result::Result<Arc<GuestRegionMmap>, VirtIoError> {
// create address space region
let mem_type = self.vm_config.mem_type.as_str();
let mut mem_file_path = self.vm_config.mem_file_path.clone();
let mem_file_name = format!(
"/virtiomem_{}_{}",
self.instance_id.as_str(),
self.mem_id.as_str()
);
mem_file_path.push_str(mem_file_name.as_str());
let region = Arc::new(
AddressSpaceRegion::create_default_memory_region(
guest_addr,
region_len,
self.host_numa_node_id,
mem_type,
mem_file_path.as_str(),
false,
true,
)
.map_err(|e| {
error!(self.logger, "failed to insert address space region: {}", e);
// dbs-virtio-devices should not depend on dbs-address-space.
// So here io::Error is used instead of AddressSpaceError directly.
VirtIoError::IOError(io::Error::new(
io::ErrorKind::Other,
format!(
"invalid address space region ({0:#x}, {1:#x})",
guest_addr.0, region_len
),
))
})?,
);
info!(
self.logger,
"VM: mem_type: {} mem_file_path: {}, numa_node_id: {:?} file_offset: {:?}",
mem_type,
mem_file_path,
self.host_numa_node_id,
region.file_offset();
"subsystem" => "virito-mem"
);
let mmap_region = MmapRegion::build(
region.file_offset().cloned(),
region_len as usize,
region.prot_flags(),
region.perm_flags(),
)
.map_err(VirtIoError::NewMmapRegion)?;
let host_addr: u64 = mmap_region.as_ptr() as u64;
// thp
if mem_type == "hugeanon" || mem_type == "hugeshmem" {
self.configure_thp(&mmap_region)?;
}
// Handle numa
if let Some(numa_node_id) = self.host_numa_node_id {
self.configure_numa(&mmap_region, numa_node_id)?;
}
// add to guest memory mapping
self.map_to_kvm(kvm_slot, &region, &mmap_region)?;
info!(
self.logger,
"kvm set user memory region: slot: {}, flags: {}, guest_phys_addr: {:X}, memory_size: {}, userspace_addr: {:X}",
kvm_slot,
0,
guest_addr.raw_value(),
region_len,
host_addr;
"subsystem" => "virito-mem"
);
// All values should be valid.
let memory_region = Arc::new(
GuestRegionMmap::new(mmap_region, guest_addr).map_err(VirtIoError::InsertMmap)?,
);
let vm_as_new = self
.vm_as
.memory()
.insert_region(memory_region.clone())
.map_err(VirtIoError::InsertMmap)?;
self.vm_as.lock().unwrap().replace(vm_as_new);
self.address_space.insert_region(region).map_err(|e| {
error!(self.logger, "failed to insert address space region: {}", e);
// dbs-virtio-devices should not depend on dbs-address-space.
// So here io::Error is used instead of AddressSpaceError directly.
VirtIoError::IOError(io::Error::new(
io::ErrorKind::Other,
format!(
"invalid address space region ({0:#x}, {1:#x})",
guest_addr.0, region_len
),
))
})?;
Ok(memory_region)
}
fn restore_region_addr(
&self,
guest_addr: GuestAddress,
) -> std::result::Result<*mut u8, VirtIoError> {
let memory = self.vm_as.memory();
// NOTE: We can't clone the `GuestRegionMmap` reference directly!
//
// An important role of the `mapping` member (of type `MmapRegion`) in
// `GuestRegionMmap` is to mmap the memory during construction and
// munmap it during drop. If the region were cloned, the clone would
// munmap the original mapping when it is dropped, leaving some memory
// in Dragonball inaccessible. And since the original is still alive at
// that point, it would munmap the same range again when its own
// lifetime ends, causing a memory exception!
memory
.get_host_address(guest_addr)
.map_err(VirtIoError::GuestMemory)
}
fn get_host_numa_node_id(&self) -> Option<u32> {
self.host_numa_node_id
}
fn set_host_numa_node_id(&mut self, host_numa_node_id: Option<u32>) {
self.host_numa_node_id = host_numa_node_id;
}
}
#[cfg(test)]
mod tests {
use vm_memory::GuestMemoryRegion;
use super::*;
use crate::test_utils::tests::create_vm_for_test;
impl Default for MemDeviceConfigInfo {
fn default() -> Self {
MemDeviceConfigInfo {
mem_id: "".to_string(),
size_mib: 0,
capacity_mib: 1024,
multi_region: true,
host_numa_node_id: None,
guest_numa_node_id: None,
use_generic_irq: None,
use_shared_irq: None,
}
}
}
#[test]
fn test_mem_config_check_conflicts() {
let config = MemDeviceConfigInfo::default();
let mut config2 = MemDeviceConfigInfo::default();
assert!(config.check_conflicts(&config2).is_err());
config2.mem_id = "dummy_mem".to_string();
assert!(config.check_conflicts(&config2).is_ok());
}
#[test]
fn test_create_mem_devices_configs() {
let mgr = MemDeviceMgr::default();
assert_eq!(mgr.info_list.len(), 0);
assert_eq!(mgr.get_index_of_mem_dev(""), None);
}
#[test]
fn test_mem_insert_or_update_device() {
// Init vm for test.
let mut vm = create_vm_for_test();
// We don't need to use virtio-mem before starting the VM
// Test for standard config with hotplug
let device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
vm.vm_address_space().cloned(),
true,
Some(VmConfigInfo::default()),
vm.shared_info().clone(),
);
let dummy_mem_device = MemDeviceConfigInfo::default();
vm.device_manager_mut()
.mem_manager
.insert_or_update_device(device_op_ctx, dummy_mem_device)
.unwrap();
assert_eq!(vm.device_manager().mem_manager.info_list.len(), 1);
}
#[test]
fn test_mem_attach_device() {
// Init vm and insert mem config for test.
let mut vm = create_vm_for_test();
let dummy_mem_device = MemDeviceConfigInfo::default();
vm.device_manager_mut()
.mem_manager
.info_list
.insert_or_update(&dummy_mem_device)
.unwrap();
assert_eq!(vm.device_manager().mem_manager.info_list.len(), 1);
// Test for standard config
let mut device_op_ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
vm.vm_address_space().cloned(),
false,
Some(VmConfigInfo::default()),
vm.shared_info().clone(),
);
vm.device_manager_mut()
.mem_manager
.attach_devices(&mut device_op_ctx)
.unwrap();
assert_eq!(vm.device_manager().mem_manager.info_list.len(), 1);
}
#[test]
fn test_mem_create_region() {
let vm = create_vm_for_test();
let ctx = DeviceOpContext::new(
Some(vm.epoll_manager().clone()),
vm.device_manager(),
Some(vm.vm_as().unwrap().clone()),
vm.vm_address_space().cloned(),
true,
Some(VmConfigInfo::default()),
vm.shared_info().clone(),
);
let mem_id = String::from("mem0");
let guest_addr = GuestAddress(0x1_0000_0000);
let region_len = 0x1000_0000;
let kvm_slot = 2;
// no vfio manager, no numa node
let mut factory = MemoryRegionFactory::new(&ctx, mem_id, None).unwrap();
let region_opt = factory.create_region(guest_addr, region_len, kvm_slot);
assert_eq!(region_opt.unwrap().len(), region_len);
}
}

View File

@@ -55,6 +55,7 @@ impl DeviceVirtioRegionHandler {
None,
file_offset,
region.flags(),
region.prot(),
false,
));

View File

@@ -7,7 +7,7 @@
use std::collections::HashMap;
use std::io;
use std::sync::{Arc, Mutex, MutexGuard};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use arc_swap::ArcSwap;
use dbs_address_space::AddressSpace;
@@ -45,9 +45,10 @@ use dbs_upcall::{
use dbs_virtio_devices::vsock::backend::VsockInnerConnector;
use crate::address_space_manager::GuestAddressSpaceImpl;
use crate::api::v1::InstanceInfo;
use crate::error::StartMicroVmError;
use crate::resource_manager::ResourceManager;
use crate::vm::{KernelConfigInfo, Vm};
use crate::vm::{KernelConfigInfo, Vm, VmConfigInfo};
use crate::IoManagerCached;
/// Virtual machine console device manager.
@@ -89,6 +90,18 @@ mod memory_region_handler;
#[cfg(feature = "virtio-fs")]
pub use self::memory_region_handler::*;
#[cfg(feature = "virtio-mem")]
/// Device manager for virtio-mem devices.
pub mod mem_dev_mgr;
#[cfg(feature = "virtio-mem")]
use self::mem_dev_mgr::MemDeviceMgr;
#[cfg(feature = "virtio-balloon")]
/// Device manager for virtio-balloon devices.
pub mod balloon_dev_mgr;
#[cfg(feature = "virtio-balloon")]
use self::balloon_dev_mgr::BalloonDeviceMgr;
macro_rules! info(
($l:expr, $($args:tt)+) => {
slog::info!($l, $($args)+; slog::o!("subsystem" => "device_manager"))
@@ -248,6 +261,8 @@ pub struct DeviceOpContext {
upcall_client: Option<Arc<UpcallClient<DevMgrService>>>,
#[cfg(feature = "dbs-virtio-devices")]
virtio_devices: Vec<Arc<DbsMmioV2Device>>,
vm_config: Option<VmConfigInfo>,
shared_info: Arc<RwLock<InstanceInfo>>,
}
impl DeviceOpContext {
@@ -257,6 +272,8 @@ impl DeviceOpContext {
vm_as: Option<GuestAddressSpaceImpl>,
address_space: Option<AddressSpace>,
is_hotplug: bool,
vm_config: Option<VmConfigInfo>,
shared_info: Arc<RwLock<InstanceInfo>>,
) -> Self {
let irq_manager = device_mgr.irq_manager.clone();
let res_manager = device_mgr.res_manager.clone();
@@ -282,11 +299,21 @@ impl DeviceOpContext {
upcall_client: None,
#[cfg(feature = "dbs-virtio-devices")]
virtio_devices: Vec::new(),
vm_config,
shared_info,
}
}
pub(crate) fn create_boot_ctx(vm: &Vm, epoll_mgr: Option<EpollManager>) -> Self {
Self::new(epoll_mgr, vm.device_manager(), None, None, false)
Self::new(
epoll_mgr,
vm.device_manager(),
None,
None,
false,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
)
}
pub(crate) fn get_vm_as(&self) -> Result<GuestAddressSpaceImpl> {
@@ -296,6 +323,27 @@ impl DeviceOpContext {
}
}
pub(crate) fn get_vm_config(&self) -> Result<VmConfigInfo> {
match self.vm_config.as_ref() {
Some(v) => Ok(v.clone()),
None => Err(DeviceMgrError::InvalidOperation),
}
}
pub(crate) fn get_address_space(&self) -> Result<AddressSpace> {
match self.address_space.as_ref() {
Some(v) => Ok(v.clone()),
None => Err(DeviceMgrError::InvalidOperation),
}
}
pub(crate) fn get_epoll_mgr(&self) -> Result<EpollManager> {
match self.epoll_mgr.as_ref() {
Some(v) => Ok(v.clone()),
None => Err(DeviceMgrError::InvalidOperation),
}
}
pub(crate) fn logger(&self) -> &slog::Logger {
&self.logger
}
@@ -386,6 +434,8 @@ impl DeviceOpContext {
Some(vm_as),
vm.vm_address_space().cloned(),
true,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
ctx.upcall_client = vm.upcall_client().clone();
ctx
@@ -463,7 +513,7 @@ pub struct DeviceManager {
res_manager: Arc<ResourceManager>,
vm_fd: Arc<VmFd>,
pub(crate) logger: slog::Logger,
pub(crate) shared_info: Arc<RwLock<InstanceInfo>>,
pub(crate) con_manager: ConsoleManager,
pub(crate) legacy_manager: Option<LegacyDeviceManager>,
#[cfg(target_arch = "aarch64")]
@@ -481,6 +531,12 @@ pub struct DeviceManager {
#[cfg(feature = "virtio-fs")]
fs_manager: Arc<Mutex<FsDeviceMgr>>,
#[cfg(feature = "virtio-mem")]
pub(crate) mem_manager: MemDeviceMgr,
#[cfg(feature = "virtio-balloon")]
pub(crate) balloon_manager: BalloonDeviceMgr,
}
impl DeviceManager {
@@ -490,6 +546,7 @@ impl DeviceManager {
res_manager: Arc<ResourceManager>,
epoll_manager: EpollManager,
logger: &slog::Logger,
shared_info: Arc<RwLock<InstanceInfo>>,
) -> Self {
DeviceManager {
io_manager: Arc::new(ArcSwap::new(Arc::new(IoManager::new()))),
@@ -498,6 +555,7 @@ impl DeviceManager {
res_manager,
vm_fd,
logger: logger.new(slog::o!()),
shared_info,
con_manager: ConsoleManager::new(epoll_manager, logger),
legacy_manager: None,
@@ -511,6 +569,10 @@ impl DeviceManager {
virtio_net_manager: VirtioNetDeviceMgr::default(),
#[cfg(feature = "virtio-fs")]
fs_manager: Arc::new(Mutex::new(FsDeviceMgr::default())),
#[cfg(feature = "virtio-mem")]
mem_manager: MemDeviceMgr::default(),
#[cfg(feature = "virtio-balloon")]
balloon_manager: BalloonDeviceMgr::default(),
}
}
@@ -636,9 +698,9 @@ impl DeviceManager {
vm_as: GuestAddressSpaceImpl,
epoll_mgr: EpollManager,
kernel_config: &mut KernelConfigInfo,
com1_sock_path: Option<String>,
dmesg_fifo: Option<Box<dyn io::Write + Send>>,
address_space: Option<&AddressSpace>,
vm_config: &VmConfigInfo,
) -> std::result::Result<(), StartMicroVmError> {
let mut ctx = DeviceOpContext::new(
Some(epoll_mgr),
@@ -646,8 +708,12 @@ impl DeviceManager {
Some(vm_as),
address_space.cloned(),
false,
Some(vm_config.clone()),
self.shared_info.clone(),
);
let com1_sock_path = vm_config.serial_path.clone();
self.create_legacy_devices(&mut ctx)?;
self.init_legacy_devices(dmesg_fifo, com1_sock_path, &mut ctx)?;
@@ -710,6 +776,8 @@ impl DeviceManager {
Some(vm_as),
address_space.cloned(),
true,
None,
self.shared_info.clone(),
);
#[cfg(feature = "virtio-blk")]
@@ -874,6 +942,24 @@ impl DeviceManager {
)
}
/// Create a Virtio MMIO transport layer device for the virtio backend device with configuration
/// change notification enabled.
pub fn create_mmio_virtio_device_with_device_change_notification(
device: DbsVirtioDevice,
ctx: &mut DeviceOpContext,
use_shared_irq: bool,
use_generic_irq: bool,
) -> std::result::Result<Arc<DbsMmioV2Device>, DeviceMgrError> {
let features = DRAGONBALL_FEATURE_PER_QUEUE_NOTIFY;
DeviceManager::create_mmio_virtio_device_with_features(
device,
ctx,
Some(features),
use_shared_irq,
use_generic_irq,
)
}
/// Create a Virtio MMIO transport layer device for the virtio backend device with the specified
/// features.
pub fn create_mmio_virtio_device_with_features(
@@ -1019,6 +1105,7 @@ mod tests {
use vm_memory::{GuestAddress, MmapRegion};
use super::*;
#[cfg(target_arch = "x86_64")]
use crate::vm::CpuTopology;
impl DeviceManager {
@@ -1029,6 +1116,10 @@ mod tests {
let epoll_manager = EpollManager::default();
let res_manager = Arc::new(ResourceManager::new(None));
let logger = slog_scope::logger().new(slog::o!());
let shared_info = Arc::new(RwLock::new(InstanceInfo::new(
String::from("dragonball"),
String::from("1"),
)));
DeviceManager {
vm_fd: Arc::clone(&vm_fd),
@@ -1047,10 +1138,15 @@ mod tests {
virtio_net_manager: VirtioNetDeviceMgr::default(),
#[cfg(feature = "virtio-vsock")]
vsock_manager: VsockDeviceMgr::default(),
#[cfg(feature = "virtio-mem")]
mem_manager: MemDeviceMgr::default(),
#[cfg(feature = "virtio-balloon")]
balloon_manager: BalloonDeviceMgr::default(),
#[cfg(target_arch = "aarch64")]
mmio_device_info: HashMap::new(),
logger,
shared_info,
}
}
}
@@ -1090,7 +1186,7 @@ mod tests {
},
vpmu_feature: 0,
};
vm.set_vm_config(vm_config);
vm.set_vm_config(vm_config.clone());
vm.init_guest_memory().unwrap();
vm.setup_interrupt_controller().unwrap();
let vm_as = vm.vm_as().cloned().unwrap();
@@ -1116,8 +1212,8 @@ mod tests {
event_mgr.epoll_manager(),
&mut cmdline,
None,
None,
address_space.as_ref(),
&vm_config,
)
.unwrap();
let guard = mgr.io_manager.load();
@@ -1141,6 +1237,8 @@ mod tests {
Some(vm.vm_as().unwrap().clone()),
vm.vm_address_space().cloned(),
true,
Some(vm.vm_config().clone()),
vm.shared_info().clone(),
);
let guest_addr = GuestAddress(0x200000000000);

View File

@@ -192,6 +192,11 @@ pub enum StartMicroVmError {
/// Virtio-fs errors.
#[error("virtio-fs errors: {0}")]
FsDeviceError(#[source] device_manager::fs_dev_mgr::FsDeviceError),
#[cfg(feature = "virtio-balloon")]
/// Virtio-balloon errors.
#[error("virtio-balloon errors: {0}")]
BalloonDeviceError(#[source] device_manager::balloon_dev_mgr::BalloonDeviceError),
}
/// Errors associated with starting the instance.

View File

@@ -374,11 +374,17 @@ impl VcpuManager {
entry_addr: GuestAddress,
) -> Result<()> {
info!("create boot vcpus");
self.create_vcpus(
self.vcpu_config.boot_vcpu_count,
Some(request_ts),
Some(entry_addr),
)?;
let boot_vcpu_count = if cfg!(target_arch = "aarch64") {
// On aarch64, KVM doesn't allow calling the KVM_CREATE_VCPU ioctl after the VM has
// booted because of the vgic check. To support the vcpu hotplug/hotunplug feature,
// we create all the vcpu fds during the boot procedure.
// The SetVmConfiguration API ensures max_vcpu_count >= boot_vcpu_count, so it is
// safe to use max_vcpu_count directly here.
self.vcpu_config.max_vcpu_count
} else {
self.vcpu_config.boot_vcpu_count
};
self.create_vcpus(boot_vcpu_count, Some(request_ts), Some(entry_addr))?;
Ok(())
}
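A compact restatement of the new selection logic, with the counts implied by the updated test assertions below (boot = 1, max = 3), purely as an illustration:

```rust
// Illustrative sketch: on aarch64 every vcpu fd must exist before boot
// (KVM_CREATE_VCPU is rejected once the vgic is set up), so the max count is
// created up front; other architectures create only the boot set.
fn vcpus_to_create(boot_vcpu_count: u8, max_vcpu_count: u8, is_aarch64: bool) -> u8 {
    if is_aarch64 {
        max_vcpu_count
    } else {
        boot_vcpu_count
    }
}

fn main() {
    // With boot = 1 and max = 3: aarch64 creates 3 vcpus up front, so 3 are
    // present-but-unstarted before start_boot_vcpus and 2 remain afterwards,
    // matching the updated assertions in the tests.
    assert_eq!(vcpus_to_create(1, 3, true), 3);
    assert_eq!(vcpus_to_create(1, 3, false), 1);
}
```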
@@ -1213,7 +1219,10 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
// test start boot vcpus
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
@@ -1267,8 +1276,14 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 2);
// invalid cpuid for pause
let cpu_indexes = vec![2];
@@ -1304,9 +1319,14 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 2);
// invalid cpuid for exit
let cpu_indexes = vec![2];
@@ -1330,9 +1350,14 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 2);
// exit all success
assert!(vcpu_manager.exit_all_vcpus().is_ok());
@@ -1351,9 +1376,14 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 2);
// invalid cpuid for exit
let cpu_indexes = vec![2];
@@ -1377,9 +1407,14 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 2);
// revalidate all success
assert!(vcpu_manager.revalidate_all_vcpus_cache().is_ok());
@@ -1395,9 +1430,14 @@ mod tests {
assert!(vcpu_manager
.create_boot_vcpus(TimestampUs::default(), GuestAddress(0))
.is_ok());
#[cfg(target_arch = "x86_64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 1);
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 3);
assert!(vcpu_manager.start_boot_vcpus(BpfProgram::default()).is_ok());
#[cfg(target_arch = "aarch64")]
assert_eq!(get_present_unstart_vcpus(&vcpu_manager), 2);
// set vcpus in hotplug action
let cpu_ids = vec![0];
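Purely as an illustration of the boot-vCPU selection introduced above, a minimal standalone Rust sketch (values are hypothetical, not taken from the diff): on aarch64 every vCPU fd up to the maximum is created at boot so that hotplug never needs KVM_CREATE_VCPU after the vGIC is set up.

// Standalone sketch of the boot-vCPU-count rule from the hunk above.
// Values are illustrative; the real ones come from the vcpu configuration.
fn boot_vcpu_count(boot_vcpus: u8, max_vcpus: u8) -> u8 {
    if cfg!(target_arch = "aarch64") {
        // Create every vCPU fd up front: KVM_CREATE_VCPU is rejected
        // once the vGIC has been initialized.
        max_vcpus
    } else {
        boot_vcpus
    }
}

fn main() {
    println!("{}", boot_vcpu_count(1, 4)); // 4 on aarch64, 1 elsewhere
}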

View File

@@ -6,70 +6,35 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.
use std::collections::HashMap;
use std::fmt::Debug;
use std::ops::Deref;
use std::sync::MutexGuard;
use dbs_arch::gic::GICDevice;
use dbs_arch::pmu::initialize_pmu;
use dbs_arch::{DeviceInfoForFDT, DeviceType, VpmuFeatureLevel};
use dbs_arch::{MMIODeviceInfo, VpmuFeatureLevel};
use dbs_boot::fdt_utils::*;
use dbs_boot::InitrdConfig;
use dbs_utils::epoll_manager::EpollManager;
use dbs_utils::time::TimestampUs;
use linux_loader::loader::Cmdline;
use vm_memory::{GuestAddressSpace, GuestMemory};
use linux_loader::cmdline::{Cmdline, Error as CmdlineError};
use vm_memory::GuestAddressSpace;
use vmm_sys_util::eventfd::EventFd;
use super::{Vm, VmError};
use crate::address_space_manager::{GuestAddressSpaceImpl, GuestMemoryImpl};
use crate::error::{Error, StartMicroVmError};
use crate::event_manager::EventManager;
use crate::vcpu::VcpuManager;
/// Configures the system and should be called once per vm before starting vcpu threads.
/// For aarch64, we only setup the FDT.
///
/// # Arguments
///
/// * `guest_mem` - The memory to be used by the guest.
/// * `cmdline` - The kernel commandline.
/// * `vcpu_mpidr` - Array of MPIDR register values per vcpu.
/// * `device_info` - A hashmap containing the attached devices for building FDT device nodes.
/// * `gic_device` - The GIC device.
/// * `initrd` - Information about an optional initrd.
#[allow(clippy::borrowed_box)]
fn configure_system<T: DeviceInfoForFDT + Clone + Debug, M: GuestMemory>(
guest_mem: &M,
cmdline: &str,
vcpu_mpidr: Vec<u64>,
device_info: Option<&HashMap<(DeviceType, String), T>>,
gic_device: &Box<dyn GICDevice>,
initrd: &Option<super::InitrdConfig>,
vpmu_feature: &VpmuFeatureLevel,
) -> super::Result<()> {
dbs_boot::fdt::create_fdt(
guest_mem,
vcpu_mpidr,
cmdline,
device_info,
gic_device,
initrd,
vpmu_feature,
)
.map_err(Error::BootSystem)?;
Ok(())
}
#[cfg(target_arch = "aarch64")]
impl Vm {
/// Gets a reference to the irqchip of the VM
#[allow(clippy::borrowed_box)]
pub fn get_irqchip(&self) -> &Box<dyn GICDevice> {
self.irqchip_handle.as_ref().unwrap()
pub fn get_irqchip(&self) -> &dyn GICDevice {
self.irqchip_handle.as_ref().unwrap().as_ref()
}
/// Creates the irq chip in-kernel device model.
pub fn setup_interrupt_controller(&mut self) -> std::result::Result<(), StartMicroVmError> {
let vcpu_count = self.vm_config.vcpu_count;
let vcpu_count = self.vm_config.max_vcpu_count;
self.irqchip_handle = Some(
dbs_arch::gic::create_gic(&self.vm_fd, vcpu_count.into())
@@ -99,12 +64,11 @@ impl Vm {
/// Initialize the virtual machine instance.
///
/// It initializes the virtual machine instance by:
/// 1) initialize virtual machine global state and configuration.
/// 2) create system devices, such as interrupt controller.
/// 3) create and start IO devices, such as serial, console, block, net, vsock etc.
/// 4) create and initialize vCPUs.
/// 5) configure CPU power management features.
/// 6) load guest kernel image.
/// 1) Initialize virtual machine reset event fd.
/// 2) Create and initialize vCPUs.
/// 3) Create and initialize interrupt controller.
/// 4) Create and initialize vPMU device.
/// 5) Create and initialize devices, such as virtio, block, net, vsock, vfio etc.
pub fn init_microvm(
&mut self,
epoll_mgr: EpollManager,
@@ -139,6 +103,52 @@ impl Vm {
Ok(())
}
/// Generate fdt information about VM.
fn get_fdt_vm_info<'a>(
&'a self,
vm_memory: &'a GuestMemoryImpl,
cmdline: &'a str,
initrd_config: Option<&'a InitrdConfig>,
vcpu_manager: &'a MutexGuard<VcpuManager>,
) -> FdtVmInfo {
let guest_memory = vm_memory.memory();
let vcpu_mpidr = vcpu_manager
.vcpus()
.into_iter()
.map(|cpu| cpu.get_mpidr())
.collect();
let vm_config = self.vm_config();
let mut vcpu_boot_onlined = vec![1; vm_config.vcpu_count as usize];
vcpu_boot_onlined.resize(vm_config.max_vcpu_count as usize, 0);
let vpmu_feature = vcpu_manager.vpmu_feature();
// This configuration is used for passing cache information into guest.
// TODO: dragonball-sandbox #274; kata-containers #6969
let cache_passthrough_enabled = false;
let fdt_vcpu_info = FdtVcpuInfo::new(
vcpu_mpidr,
vcpu_boot_onlined,
vpmu_feature,
cache_passthrough_enabled,
);
FdtVmInfo::new(guest_memory, cmdline, initrd_config, fdt_vcpu_info)
}
// This method is used for passing cache/numa information into guest
// TODO: dragonball-sandbox #274,#275; kata-containers #6969
/// Generate fdt information about cache/numa
fn get_fdt_numa_info(&self) -> FdtNumaInfo {
FdtNumaInfo::default()
}
/// Generate fdt information about devices
fn get_fdt_device_info(&self) -> FdtDeviceInfo<MMIODeviceInfo> {
FdtDeviceInfo::new(
self.device_manager().get_mmio_device_info(),
self.get_irqchip(),
)
}
/// Execute system architecture specific configurations.
///
/// 1) set guest kernel boot parameters
@@ -150,24 +160,23 @@ impl Vm {
initrd: Option<InitrdConfig>,
) -> std::result::Result<(), StartMicroVmError> {
let vcpu_manager = self.vcpu_manager().map_err(StartMicroVmError::Vcpu)?;
let vpmu_feature = vcpu_manager.vpmu_feature();
let vcpu_mpidr = vcpu_manager
.vcpus()
.into_iter()
.map(|cpu| cpu.get_mpidr())
.collect();
let guest_memory = vm_memory.memory();
let cmdline_cstring = cmdline
.as_cstring()
.map_err(StartMicroVmError::ProcessCommandlne)?;
let fdt_vm_info = self.get_fdt_vm_info(
vm_memory,
cmdline_cstring
.to_str()
.map_err(|_| StartMicroVmError::ProcessCommandlne(CmdlineError::InvalidAscii))?,
initrd.as_ref(),
&vcpu_manager,
);
let fdt_numa_info = self.get_fdt_numa_info();
let fdt_device_info = self.get_fdt_device_info();
configure_system(
guest_memory,
cmdline.as_cstring().unwrap().to_str().unwrap(),
vcpu_mpidr,
self.device_manager.get_mmio_device_info(),
self.get_irqchip(),
&initrd,
&vpmu_feature,
)
.map_err(StartMicroVmError::ConfigureSystem)
dbs_boot::fdt::create_fdt(fdt_vm_info, fdt_numa_info, fdt_device_info)
.map(|_| ())
.map_err(|e| StartMicroVmError::ConfigureSystem(Error::BootSystem(e)))
}
pub(crate) fn register_events(

View File

@@ -241,6 +241,7 @@ impl Vm {
resource_manager.clone(),
epoll_manager.clone(),
&logger,
api_shared_info.clone(),
);
Ok(Vm {
@@ -413,6 +414,19 @@ impl Vm {
(dragonball_version, instance_id)
}
pub(crate) fn stop_prealloc(&mut self) -> std::result::Result<(), StartMicroVmError> {
if self.address_space.is_initialized() {
return self
.address_space
.wait_prealloc(true)
.map_err(StartMicroVmError::AddressManagerError);
}
Err(StartMicroVmError::AddressManagerError(
AddressManagerError::GuestMemoryNotInitialized,
))
}
}
impl Vm {
@@ -481,7 +495,6 @@ impl Vm {
) -> std::result::Result<(), StartMicroVmError> {
info!(self.logger, "VM: initializing devices ...");
let com1_sock_path = self.vm_config.serial_path.clone();
let kernel_config = self
.kernel_config
.as_mut()
@@ -503,9 +516,9 @@ impl Vm {
vm_as.clone(),
epoll_manager,
kernel_config,
com1_sock_path,
self.dmesg_fifo.take(),
self.address_space.address_space(),
&self.vm_config,
)?;
info!(self.logger, "VM: start devices");
@@ -888,6 +901,7 @@ impl Vm {
#[cfg(test)]
pub mod tests {
#[cfg(target_arch = "x86_64")]
use kvm_ioctls::VcpuExit;
use linux_loader::cmdline::Cmdline;
use test_utils::skip_if_not_root;

src/libs/Cargo.lock generated
View File

@@ -508,6 +508,7 @@ dependencies = [
"num_cpus",
"oci",
"regex",
"safe-path",
"serde",
"serde_json",
"slog",

View File

@@ -27,6 +27,7 @@ thiserror = "1.0"
toml = "0.5.8"
oci = { path = "../oci" }
safe-path = { path = "../safe-path" }
[dev-dependencies]
tempfile = "3"

View File

@@ -316,6 +316,10 @@ pub const KATA_ANNO_CFG_VFIO_MODE: &str = "io.katacontainers.config.runtime.vfio
pub const KATA_ANNO_CFG_HYPERVISOR_PREFETCH_FILES_LIST: &str =
"io.katacontainers.config.hypervisor.prefetch_files.list";
/// A sandbox annotation for sandbox level volume sharing with host.
pub const KATA_ANNO_CFG_SANDBOX_BIND_MOUNTS: &str =
"io.katacontainers.config.runtime.sandbox_bind_mounts";
/// A helper structure to query configuration information by check annotations.
#[derive(Debug, Default, Deserialize)]
pub struct Annotation {
@@ -950,6 +954,16 @@ impl Annotation {
KATA_ANNO_CFG_VFIO_MODE => {
config.runtime.vfio_mode = value.to_string();
}
KATA_ANNO_CFG_SANDBOX_BIND_MOUNTS => {
let args: Vec<String> = value
.to_string()
.split_ascii_whitespace()
.map(str::to_string)
.collect();
for arg in args {
config.runtime.sandbox_bind_mounts.push(arg.to_string());
}
}
_ => {
warn!(sl!(), "Annotation {} not enabled", key);
}
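A minimal standalone sketch of how a sandbox_bind_mounts annotation value is split on whitespace; the annotation key and the split rule come from the hunk above, while the example value is hypothetical.

fn main() {
    // Hypothetical annotation value; entries are whitespace separated.
    let value = "/mnt/shared:ro /mnt/scratch:rw /mnt/data";
    let mounts: Vec<String> = value
        .split_ascii_whitespace()
        .map(str::to_string)
        .collect();
    assert_eq!(mounts.len(), 3);
    println!("{:?}", mounts);
}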

View File

@@ -8,7 +8,8 @@ use std::path::Path;
use super::default;
use crate::config::{ConfigOps, TomlConfig};
use crate::{eother, resolve_path, validate_path};
use crate::mount::split_bind_mounts;
use crate::{eother, validate_path};
/// Type of runtime VirtContainer.
pub const RUNTIME_NAME_VIRTCONTAINER: &str = "virt_container";
@@ -146,7 +147,14 @@ impl ConfigOps for Runtime {
}
for bind in conf.runtime.sandbox_bind_mounts.iter_mut() {
resolve_path!(*bind, "sandbox bind mount `{}` is invalid: {}")?;
// Split the bind mount, canonicalize the path and then append rw mode to it.
let (real_path, mode) = split_bind_mounts(bind);
match Path::new(real_path).canonicalize() {
Err(e) => return Err(eother!("sandbox bind mount `{}` is invalid: {}", bind, e)),
Ok(path) => {
*bind = format!("{}{}", path.display(), mode);
}
}
}
Ok(())
@@ -176,7 +184,12 @@ impl ConfigOps for Runtime {
}
for bind in conf.runtime.sandbox_bind_mounts.iter() {
validate_path!(*bind, "sandbox bind mount `{}` is invalid: {}")?;
// Just validate the real_path.
let (real_path, _mode) = split_bind_mounts(bind);
validate_path!(
real_path.to_owned(),
"sandbox bind mount `{}` is invalid: {}"
)?;
}
Ok(())
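A rough standalone sketch of the resolve step above, assuming a path that exists on the host (/tmp): strip the ":ro"/":rw" suffix, canonicalize only the real path, then re-append the mode. split() here is a local stand-in for kata-types' split_bind_mounts.

use std::path::Path;

// Local stand-in for kata-types' split_bind_mounts(), so the sketch is self-contained.
fn split(bind: &str) -> (&str, &str) {
    for mode in [":ro", ":rw"] {
        if let Some(real) = bind.strip_suffix(mode) {
            return (real, mode);
        }
    }
    (bind, "")
}

fn resolve(bind: &str) -> std::io::Result<String> {
    let (real, mode) = split(bind);
    let canon = Path::new(real).canonicalize()?;
    Ok(format!("{}{}", canon.display(), mode))
}

fn main() {
    // "/tmp" exists on most hosts; the mode suffix survives canonicalization.
    println!("{:?}", resolve("/tmp:ro"));
}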

View File

@@ -26,7 +26,7 @@ pub enum Error {
}
/// Assigned CPU resources for a Linux container.
#[derive(Default, Debug)]
#[derive(Clone, Default, Debug)]
pub struct LinuxContainerCpuResources {
shares: u64,
period: u64,

View File

@@ -5,7 +5,7 @@
//
use anyhow::{anyhow, Context, Result};
use std::{collections::HashMap, path::PathBuf};
use std::{collections::HashMap, fs, path::PathBuf};
/// Prefix to mark a volume as Kata special.
pub const KATA_VOLUME_TYPE_PREFIX: &str = "kata:";
@@ -25,6 +25,15 @@ pub const KATA_MOUNT_INFO_FILE_NAME: &str = "mountInfo.json";
/// KATA_DIRECT_VOLUME_ROOT_PATH is the root path used for concatenating with the direct-volume mount info file path
pub const KATA_DIRECT_VOLUME_ROOT_PATH: &str = "/run/kata-containers/shared/direct-volumes";
/// SANDBOX_BIND_MOUNTS_DIR is for sandbox bindmounts
pub const SANDBOX_BIND_MOUNTS_DIR: &str = "sandbox-mounts";
/// SANDBOX_BIND_MOUNTS_RO is for sandbox bindmounts with readonly
pub const SANDBOX_BIND_MOUNTS_RO: &str = ":ro";
/// SANDBOX_BIND_MOUNTS_RW is for sandbox bindmounts with readwrite
pub const SANDBOX_BIND_MOUNTS_RW: &str = ":rw";
/// Information about a mount.
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct Mount {
@@ -71,6 +80,27 @@ pub struct DirectVolumeMountInfo {
pub options: Vec<String>,
}
/// join_path joins user provided volumepath with kata direct-volume root path
/// the volume_path is base64-encoded and then safely joined to the end of path prefix
pub fn join_path(prefix: &str, volume_path: &str) -> Result<PathBuf> {
if volume_path.is_empty() {
return Err(anyhow!("volume path must not be empty"));
}
let b64_encoded_path = base64::encode(volume_path.as_bytes());
Ok(safe_path::scoped_join(prefix, b64_encoded_path)?)
}
/// get DirectVolume mountInfo from mountinfo.json.
pub fn get_volume_mount_info(volume_path: &str) -> Result<DirectVolumeMountInfo> {
let mount_info_file_path =
join_path(KATA_DIRECT_VOLUME_ROOT_PATH, volume_path)?.join(KATA_MOUNT_INFO_FILE_NAME);
let mount_info_file = fs::read_to_string(mount_info_file_path)?;
let mount_info: DirectVolumeMountInfo = serde_json::from_str(&mount_info_file)?;
Ok(mount_info)
}
/// Check whether a mount type is a marker for Kata specific volume.
pub fn is_kata_special_volume(ty: &str) -> bool {
ty.len() > KATA_VOLUME_TYPE_PREFIX.len() && ty.starts_with(KATA_VOLUME_TYPE_PREFIX)
@@ -128,6 +158,28 @@ impl NydusExtraOptions {
serde_json::from_slice(&extra_options_buf).context("deserialize nydus's extraoption")
}
}
/// sandbox bindmount format: /path/to/dir, /path/to/dir:ro, or /path/to/dir:rw
/// the real path is the part without the ":ro" or ":rw" suffix.
pub fn split_bind_mounts(bindmount: &str) -> (&str, &str) {
let (real_path, mode) = if bindmount.ends_with(SANDBOX_BIND_MOUNTS_RO) {
(
bindmount.trim_end_matches(SANDBOX_BIND_MOUNTS_RO),
SANDBOX_BIND_MOUNTS_RO,
)
} else if bindmount.ends_with(SANDBOX_BIND_MOUNTS_RW) {
(
bindmount.trim_end_matches(SANDBOX_BIND_MOUNTS_RW),
SANDBOX_BIND_MOUNTS_RW,
)
} else {
// default bindmount format
(bindmount, "")
};
(real_path, mode)
}
#[cfg(test)]
mod tests {
use super::*;
@@ -137,6 +189,18 @@ mod tests {
assert!(!is_kata_special_volume("kata:"));
}
#[test]
fn test_split_bind_mounts() {
let test01 = "xxx0:ro";
let test02 = "xxx2:rw";
let test03 = "xxx3:is";
let test04 = "xxx4";
assert_eq!(split_bind_mounts(test01), ("xxx0", ":ro"));
assert_eq!(split_bind_mounts(test02), ("xxx2", ":rw"));
assert_eq!(split_bind_mounts(test03), ("xxx3:is", ""));
assert_eq!(split_bind_mounts(test04), ("xxx4", ""));
}
#[test]
fn test_is_kata_guest_mount_volume() {
assert!(is_kata_guest_mount_volume("kata:guest-mount:nfs"));
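To illustrate the direct-volume lookup above, a hedged standalone sketch of how the mountInfo.json path is derived; it assumes the base64 crate's 0.13-style encode helper and omits the safe_path::scoped_join containment check used by the real join_path.

use std::path::PathBuf;

const ROOT: &str = "/run/kata-containers/shared/direct-volumes";

// Sketch of the path derivation mirrored from join_path() / get_volume_mount_info().
fn mount_info_path(volume_path: &str) -> PathBuf {
    let encoded = base64::encode(volume_path.as_bytes());
    PathBuf::from(ROOT).join(encoded).join("mountInfo.json")
}

fn main() {
    // Hypothetical volume path published by a CSI driver.
    println!("{}", mount_info_path("/var/lib/vol001").display());
}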

View File

@@ -13,7 +13,7 @@ use crate::Error;
///
/// The `U32Set` may be used to save CPUs parsed from a CPU list file or NUMA nodes parsed from
/// a NUMA node list file.
#[derive(Default, Debug)]
#[derive(Clone, Default, Debug)]
pub struct U32Set(Vec<u32>);
impl U32Set {

View File

@@ -15,6 +15,7 @@ serde_json = "1.0.73"
# (by stopping the compiler from removing log calls).
slog = { version = "2.5.2", features = ["dynamic-keys", "max_level_trace", "release_max_level_debug"] }
slog-json = "2.4.0"
slog-term = "2.9.0"
slog-async = "2.7.0"
slog-scope = "4.4.0"

View File

@@ -39,6 +39,28 @@ const LOG_LEVELS: &[(&str, slog::Level)] = &[
const DEFAULT_SUBSYSTEM: &str = "root";
// Creates a logger which prints output as human readable text to the terminal
pub fn create_term_logger(level: slog::Level) -> (slog::Logger, slog_async::AsyncGuard) {
let term_drain = slog_term::term_compact().fuse();
// Ensure only a unique set of key/value fields is logged
let unique_drain = UniqueDrain::new(term_drain).fuse();
// Allow runtime filtering of records by log level
let filter_drain = RuntimeLevelFilter::new(unique_drain, level).fuse();
// Ensure the logger is thread-safe
let (async_drain, guard) = slog_async::Async::new(filter_drain)
.thread_name("slog-async-logger".into())
.build_with_guard();
// Add some "standard" fields
let logger = slog::Logger::root(async_drain.fuse(), o!("subsystem" => DEFAULT_SUBSYSTEM));
(logger, guard)
}
// Creates a logger which prints output as JSON
// XXX: 'writer' param used to make testing possible.
pub fn create_logger<W>(
name: &str,

View File

@@ -366,7 +366,8 @@ message OnlineCPUMemRequest {
// resources are connected asynchronously and the agent returns immediately.
bool wait = 1;
// NbCpus specifies the number of CPUs that were added and the agent has to online.
// NbCpus specifies the number of CPUs that should be onlined in the guest.
// Special value 0 means agent will skip this check.
uint32 nb_cpus = 2;
// CpuOnly specifies whether only online CPU or not.

View File

@@ -241,6 +241,17 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "debc29dde2e69f9e47506b525f639ed42300fc014a3e007832592448fa8e4599"
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi 0.1.19",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -644,11 +655,12 @@ dependencies = [
[[package]]
name = "dbs-address-space"
version = "0.2.2"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bcc37dc0b8ffae1c5911d13ae630dc7a9020fa0de0edd178d6ab71daf56c8fc"
checksum = "95e20d28a9cd13bf00d0ecd1bd073d242242b04f0acb663d7adfc659f8879322"
dependencies = [
"arc-swap",
"lazy_static",
"libc",
"nix 0.23.2",
"thiserror",
@@ -682,9 +694,9 @@ dependencies = [
[[package]]
name = "dbs-boot"
version = "0.3.1"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a74a8c05a1674d3032e610b4f201c7440c345559bad3dfe6b455ce195785108"
checksum = "5466a92f75aa928a9103dcb2088f6d1638ef9da8945fad7389a73864dfa0182c"
dependencies = [
"dbs-arch",
"kvm-bindings",
@@ -735,9 +747,9 @@ dependencies = [
[[package]]
name = "dbs-upcall"
version = "0.2.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "699e62afa444ae4b00d474fd91bc37785ba050acdfbe179731c81898e32efc3f"
checksum = "ea3a78128fd0be8b8b10257675c262b378dc5d00b1e18157736a6c27e45ce4fb"
dependencies = [
"anyhow",
"dbs-utils",
@@ -765,9 +777,9 @@ dependencies = [
[[package]]
name = "dbs-virtio-devices"
version = "0.2.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88e5c6c48b766afb95851b04b6b193871a59d0b2a3ed19990d4f8f651ae5c668"
checksum = "24d671cc3e5f98b84ef6b6bed007d28f72f16d3aea8eb38e2d42b00b2973c1d8"
dependencies = [
"byteorder",
"caps",
@@ -781,7 +793,7 @@ dependencies = [
"kvm-ioctls",
"libc",
"log",
"nix 0.23.2",
"nix 0.24.3",
"nydus-api",
"nydus-blobfs",
"nydus-rafs",
@@ -827,6 +839,27 @@ dependencies = [
"subtle",
]
[[package]]
name = "dirs-next"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
dependencies = [
"cfg-if 1.0.0",
"dirs-sys-next",
]
[[package]]
name = "dirs-sys-next"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
dependencies = [
"libc",
"redox_users",
"winapi",
]
[[package]]
name = "dlv-list"
version = "0.3.0"
@@ -1212,6 +1245,15 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "hermit-abi"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.2.6"
@@ -1481,6 +1523,7 @@ dependencies = [
"num_cpus",
"oci",
"regex",
"safe-path 0.1.0",
"serde",
"serde_json",
"slog",
@@ -1618,6 +1661,7 @@ dependencies = [
"slog-async",
"slog-json",
"slog-scope",
"slog-term",
]
[[package]]
@@ -1845,7 +1889,16 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
dependencies = [
"hermit-abi",
"hermit-abi 0.2.6",
"libc",
]
[[package]]
name = "num_threads"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
dependencies = [
"libc",
]
@@ -2433,6 +2486,17 @@ dependencies = [
"bitflags",
]
[[package]]
name = "redox_users"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b"
dependencies = [
"getrandom 0.2.8",
"redox_syscall",
"thiserror",
]
[[package]]
name = "regex"
version = "1.7.1"
@@ -2844,6 +2908,19 @@ dependencies = [
"slog-scope",
]
[[package]]
name = "slog-term"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87d29185c55b7b258b4f120eab00f48557d4d9bc814f41713f449d35b0f8977c"
dependencies = [
"atty",
"slog",
"term",
"thread_local",
"time 0.3.20",
]
[[package]]
name = "smallvec"
version = "1.10.0"
@@ -2962,6 +3039,17 @@ dependencies = [
"windows-sys 0.42.0",
]
[[package]]
name = "term"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
dependencies = [
"dirs-next",
"rustversion",
"winapi",
]
[[package]]
name = "termcolor"
version = "1.2.0"
@@ -3042,6 +3130,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890"
dependencies = [
"itoa",
"libc",
"num_threads",
"serde",
"time-core",
"time-macros",

View File

@@ -3,3 +3,4 @@ members = [
"crates/shim",
"crates/shim-ctl",
]

View File

@@ -309,3 +309,12 @@ experimental=@DEFAULTEXPFEATURES@
# - When running single containers using a tool like ctr, container sizing information will be available.
static_sandbox_resource_mgmt=@DEFSTATICRESOURCEMGMT_DB@
# If specified, sandbox_bind_mounts identifies host paths to be mounted (ro, rw) into the sandbox's shared path.
# This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory.
# If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts`
# These will not be exposed to the container workloads, and are only provided for potential guest services.
# Three bind mount formats are supported:
# - "/path/to", default readonly mode.
# - "/path/to:ro", readonly mode.
# - "/path/to:rw", readwrite mode.
sandbox_bind_mounts=@DEFBINDMOUNTS@

View File

@@ -117,5 +117,6 @@ impl_agent!(
get_ip_tables | crate::GetIPTablesRequest | crate::GetIPTablesResponse | None,
set_ip_tables | crate::SetIPTablesRequest | crate::SetIPTablesResponse | None,
get_volume_stats | crate::VolumeStatsRequest | crate::VolumeStatsResponse | None,
resize_volume | crate::ResizeVolumeRequest | crate::Empty | None
resize_volume | crate::ResizeVolumeRequest | crate::Empty | None,
online_cpu_mem | crate::OnlineCPUMemRequest | crate::Empty | None
);

View File

@@ -336,7 +336,7 @@ impl From<UpdateContainerRequest> for agent::UpdateContainerRequest {
fn from(from: UpdateContainerRequest) -> Self {
Self {
container_id: from.container_id,
resources: from_option(Some(from.resources)),
resources: from_option(from.resources),
..Default::default()
}
}

View File

@@ -54,6 +54,7 @@ pub trait Agent: AgentManager + HealthService + Send + Sync {
// sandbox
async fn create_sandbox(&self, req: CreateSandboxRequest) -> Result<Empty>;
async fn destroy_sandbox(&self, req: Empty) -> Result<Empty>;
async fn online_cpu_mem(&self, req: OnlineCPUMemRequest) -> Result<Empty>;
// network
async fn add_arp_neighbors(&self, req: AddArpNeighborRequest) -> Result<Empty>;

View File

@@ -201,7 +201,7 @@ pub struct ListProcessesRequest {
#[derive(PartialEq, Clone, Default)]
pub struct UpdateContainerRequest {
pub container_id: String,
pub resources: oci::LinuxResources,
pub resources: Option<oci::LinuxResources>,
pub mounts: Vec<oci::Mount>,
}

View File

@@ -32,7 +32,7 @@ kata-types = { path = "../../../libs/kata-types" }
logging = { path = "../../../libs/logging" }
shim-interface = { path = "../../../libs/shim-interface" }
dragonball = { path = "../../../dragonball", features = ["atomic-guest-memory", "virtio-vsock", "hotplug", "virtio-blk", "virtio-net", "virtio-fs","dbs-upcall"] }
dragonball = { path = "../../../dragonball", features = ["atomic-guest-memory", "virtio-vsock", "hotplug", "virtio-blk", "virtio-net", "virtio-fs", "dbs-upcall"] }
ch-config = { path = "ch-config", optional = true }

View File

@@ -494,6 +494,10 @@ impl CloudHypervisorInner {
Ok(())
}
pub(crate) async fn resize_vcpu(&self, old_vcpu: u32, new_vcpu: u32) -> Result<(u32, u32)> {
Ok((old_vcpu, new_vcpu))
}
pub(crate) async fn get_pids(&self) -> Result<Vec<u32>> {
Ok(Vec::<u32>::new())
}

View File

@@ -114,6 +114,11 @@ impl Hypervisor for CloudHypervisor {
inner.cleanup().await
}
async fn resize_vcpu(&self, old_vcpu: u32, new_vcpu: u32) -> Result<(u32, u32)> {
let inner = self.inner.read().await;
inner.resize_vcpu(old_vcpu, new_vcpu).await
}
async fn get_pids(&self) -> Result<Vec<u32>> {
let inner = self.inner.read().await;
inner.get_pids().await

View File

@@ -8,19 +8,21 @@ use std::{collections::HashMap, sync::Arc};
use anyhow::{anyhow, Context, Result};
use kata_sys_util::rand::RandomBytes;
use tokio::sync::Mutex;
use tokio::sync::{Mutex, RwLock};
use super::{
util::{get_host_path, get_virt_drive_name},
Device, DeviceConfig, DeviceType,
};
use crate::{
BlockConfig, BlockDevice, Hypervisor, KATA_BLK_DEV_TYPE, KATA_MMIO_BLK_DEV_TYPE,
VIRTIO_BLOCK_MMIO, VIRTIO_BLOCK_PCI,
};
use super::{
util::{get_host_path, get_virt_drive_name},
Device, DeviceConfig,
};
pub type ArcMutexDevice = Arc<Mutex<dyn Device>>;
const DEVICE_TYPE_BLOCK: &str = "b";
/// block_index and released_block_index are used to search an available block index
/// in Sandbox.
///
@@ -75,35 +77,27 @@ impl DeviceManager {
})
}
pub async fn new_device(&mut self, device_config: &DeviceConfig) -> Result<String> {
let device_id = if let Some(dev) = self.find_device(device_config).await {
dev
} else {
self.create_device(device_config)
.await
.context("failed to create device")?
};
Ok(device_id)
}
pub async fn try_add_device(&mut self, device_id: &str) -> Result<()> {
async fn try_add_device(&mut self, device_id: &str) -> Result<()> {
// find the device
let device = self
.devices
.get(device_id)
.context("failed to find device")?;
let mut device_guard = device.lock().await;
// attach device
let mut device_guard = device.lock().await;
let result = device_guard.attach(self.hypervisor.as_ref()).await;
// handle attach error
if let Err(e) = result {
if let DeviceConfig::BlockCfg(config) = device_guard.get_device_info().await {
self.shared_info.release_device_index(config.index);
if let DeviceType::Block(device) = device_guard.get_device_info().await {
self.shared_info.release_device_index(device.config.index);
};
drop(device_guard);
self.devices.remove(device_id);
return Err(e);
}
Ok(())
}
@@ -120,66 +114,97 @@ impl DeviceManager {
}
Err(e) => Err(e),
};
// if detach success, remove it from device manager
if result.is_ok() {
drop(device_guard);
// if detach success, remove it from device manager
self.devices.remove(device_id);
}
return result;
}
Err(anyhow!(
"device with specified ID hasn't been created. {}",
device_id
))
}
pub async fn get_device_info(&self, device_id: &str) -> Result<DeviceConfig> {
async fn get_device_info(&self, device_id: &str) -> Result<DeviceType> {
if let Some(dev) = self.devices.get(device_id) {
return Ok(dev.lock().await.get_device_info().await);
}
Err(anyhow!(
"device with specified ID hasn't been created. {}",
device_id
))
}
async fn find_device(&self, device_config: &DeviceConfig) -> Option<String> {
async fn find_device(&self, host_path: String) -> Option<String> {
for (device_id, dev) in &self.devices {
match dev.lock().await.get_device_info().await {
DeviceConfig::BlockCfg(config) => match device_config {
DeviceConfig::BlockCfg(ref config_new) => {
if config_new.path_on_host == config.path_on_host {
return Some(device_id.to_string());
}
DeviceType::Block(device) => {
if device.config.path_on_host == host_path {
return Some(device_id.to_string());
}
_ => {
continue;
}
},
}
_ => {
// TODO: support find other device type
continue;
}
}
}
None
}
async fn create_device(&mut self, device_config: &DeviceConfig) -> Result<String> {
fn get_dev_virt_path(&mut self, dev_type: &str) -> Result<Option<(u64, String)>> {
let virt_path = if dev_type == DEVICE_TYPE_BLOCK {
// generate virt path
let current_index = self.shared_info.declare_device_index()?;
let drive_name = get_virt_drive_name(current_index as i32)?;
let virt_path_name = format!("/dev/{}", drive_name);
Some((current_index, virt_path_name))
} else {
// only needed when dev_type is block; otherwise it's unused.
None
};
Ok(virt_path)
}
async fn new_device(&mut self, device_config: &DeviceConfig) -> Result<String> {
// device ID must be generated by manager instead of device itself
// in case of ID collision
let device_id = self.new_device_id()?;
let dev: ArcMutexDevice = match device_config {
DeviceConfig::BlockCfg(config) => self
.create_block_device(config, device_id.clone())
.await
.context("failed to create device")?,
DeviceConfig::BlockCfg(config) => {
// try to find the device, found and just return id.
if let Some(dev_id_matched) = self.find_device(config.path_on_host.clone()).await {
info!(
sl!(),
"device with host path:{:?} found. just return device id: {:?}",
config.path_on_host.clone(),
dev_id_matched
);
return Ok(dev_id_matched);
}
self.create_block_device(config, device_id.clone())
.await
.context("failed to create device")?
}
_ => {
return Err(anyhow!("invliad device type"));
}
};
// register device to devices
self.devices.insert(device_id.clone(), dev.clone());
Ok(device_id)
}
@@ -204,17 +229,23 @@ impl DeviceManager {
_ => "".to_string(),
};
block_config.driver_option = block_driver;
// generate virt path
let current_index = self.shared_info.declare_device_index()?;
block_config.index = current_index;
let drive_name = get_virt_drive_name(current_index as i32)?;
block_config.virt_path = format!("/dev/{}", drive_name);
// if the path on host is empty, we need to get device host path from the device major and minor number
// Otherwise, it might be rawfile based block device, the host path is already passed from the runtime, so we don't need to do anything here
if block_config.path_on_host.is_empty() {
block_config.path_on_host = get_host_path("b".to_owned(), config.major, config.minor)
.context("failed to get host path")?;
// generate block device index and virt path
// safe here, Block device always has virt_path.
if let Some(virt_path) = self.get_dev_virt_path(DEVICE_TYPE_BLOCK)? {
block_config.index = virt_path.0;
block_config.virt_path = virt_path.1;
}
// if the path on host is empty, we need to get device host path from the device major and minor number
// Otherwise, it might be rawfile based block device, the host path is already passed from the runtime,
// so we don't need to do anything here
if block_config.path_on_host.is_empty() {
block_config.path_on_host =
get_host_path(DEVICE_TYPE_BLOCK.to_owned(), config.major, config.minor)
.context("failed to get host path")?;
}
Ok(Arc::new(Mutex::new(BlockDevice::new(
device_id,
block_config,
@@ -237,3 +268,36 @@ impl DeviceManager {
Err(anyhow!("ID are exhausted"))
}
}
// Many scenarios share the same steps when adding devices, so to reduce duplicated code
// we abstract a common method and use it in those scenarios.
// do_handle_device:
// (1) new_device with DeviceConfig and return device_id;
// (2) try_add_device with device_id and really add the device;
// (3) return the device's info;
pub async fn do_handle_device(
d: &RwLock<DeviceManager>,
dev_info: &DeviceConfig,
) -> Result<DeviceType> {
let device_id = d
.write()
.await
.new_device(dev_info)
.await
.context("failed to create deviec")?;
d.write()
.await
.try_add_device(&device_id)
.await
.context("failed to add deivce")?;
let device_info = d
.read()
.await
.get_device_info(&device_id)
.await
.context("failed to get device info")?;
Ok(device_info)
}

View File

@@ -7,7 +7,7 @@
use std::{fs, path::Path, process::Command};
use crate::device::Device;
use crate::device::DeviceConfig;
use crate::device::DeviceType;
use crate::Hypervisor as hypervisor;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use anyhow::anyhow;
@@ -166,7 +166,7 @@ impl Device for VfioConfig {
todo!()
}
async fn get_device_info(&self) -> DeviceConfig {
async fn get_device_info(&self) -> DeviceType {
todo!()
}

View File

@@ -5,7 +5,7 @@
//
use crate::device::Device;
use crate::device::DeviceConfig;
use crate::device::DeviceType;
use crate::Hypervisor as hypervisor;
use anyhow::Result;
use async_trait::async_trait;
@@ -47,7 +47,7 @@ impl Device for VhostUserConfig {
todo!()
}
async fn get_device_info(&self) -> DeviceConfig {
async fn get_device_info(&self) -> DeviceType {
todo!()
}

View File

@@ -6,7 +6,7 @@
pub const VIRTIO_BLOCK_MMIO: &str = "virtio-blk-mmio";
use crate::device::Device;
use crate::device::{DeviceConfig, DeviceType};
use crate::device::DeviceType;
use crate::Hypervisor as hypervisor;
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
@@ -98,8 +98,8 @@ impl Device for BlockDevice {
Ok(Some(self.config.index))
}
async fn get_device_info(&self) -> DeviceConfig {
DeviceConfig::BlockCfg(self.config.clone())
async fn get_device_info(&self) -> DeviceType {
DeviceType::Block(self.clone())
}
async fn increase_attach_count(&mut self) -> Result<bool> {

View File

@@ -67,6 +67,9 @@ pub struct ShareFsDeviceConfig {
/// queue_num: queue number
pub queue_num: u64,
/// options: virtiofs device's config options.
pub options: Vec<String>,
}
#[derive(Debug, Clone)]

View File

@@ -53,7 +53,7 @@ pub trait Device: Send + Sync {
// detach is to unplug device from VM
async fn detach(&mut self, h: &dyn hypervisor) -> Result<Option<u64>>;
// get_device_info returns device config
async fn get_device_info(&self) -> DeviceConfig;
async fn get_device_info(&self) -> DeviceType;
// increase_attach_count is used to increase the attach count for a device
// return values:
// * true: no need to do real attach when current attach count is zero, skip following actions.

View File

@@ -13,7 +13,7 @@ use crate::{
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use dragonball::{
api::v1::{BlockDeviceConfigInfo, BootSourceConfig},
api::v1::{BlockDeviceConfigInfo, BootSourceConfig, VcpuResizeInfo},
vm::VmConfigInfo,
};
use kata_sys_util::mount;
@@ -327,6 +327,53 @@ impl DragonballInner {
}
}
// check if resizing info is valid
// the error in this function is not ok to be tolerated, the container boot will fail
fn precheck_resize_vcpus(&self, old_vcpus: u32, new_vcpus: u32) -> Result<(u32, u32)> {
// old_vcpus > 0, safe for conversion
let current_vcpus = old_vcpus;
// a non-zero positive is required
if new_vcpus == 0 {
return Err(anyhow!("resize vcpu error: 0 vcpu resizing is invalid"));
}
// cannot exceed maximum value
if new_vcpus > self.config.cpu_info.default_maxvcpus {
return Err(anyhow!("resize vcpu error: cannot greater than maxvcpus"));
}
Ok((current_vcpus, new_vcpus))
}
// do the check before resizing, returns Result<(old, new)>
pub(crate) async fn resize_vcpu(&self, old_vcpus: u32, new_vcpus: u32) -> Result<(u32, u32)> {
if old_vcpus == new_vcpus {
info!(
sl!(),
"resize_vcpu: no need to resize vcpus because old_vcpus is equal to new_vcpus"
);
return Ok((new_vcpus, new_vcpus));
}
let (old_vcpus, new_vcpus) = self.precheck_resize_vcpus(old_vcpus, new_vcpus)?;
info!(
sl!(),
"check_resize_vcpus passed, passing new_vcpus = {:?} to vmm", new_vcpus
);
let cpu_resize_info = VcpuResizeInfo {
vcpu_count: Some(new_vcpus as u8),
};
self.vmm_instance
.resize_vcpu(&cpu_resize_info)
.context(format!(
"failed to do_resize_vcpus on new_vcpus={:?}",
new_vcpus
))?;
Ok((old_vcpus, new_vcpus))
}
pub fn set_hypervisor_config(&mut self, config: HypervisorConfig) {
self.config = config;
}
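A standalone sketch of the resize pre-check above, with the error type simplified to String and an assumed default_maxvcpus of 4.

// Standalone mirror of precheck_resize_vcpus() from the hunk above.
fn precheck_resize_vcpus(old: u32, new: u32, max: u32) -> Result<(u32, u32), String> {
    if new == 0 {
        return Err("resize vcpu error: 0 vcpu resizing is invalid".into());
    }
    if new > max {
        return Err("resize vcpu error: cannot be greater than maxvcpus".into());
    }
    Ok((old, new))
}

fn main() {
    // default_maxvcpus = 4 is an assumed value.
    assert!(precheck_resize_vcpus(1, 3, 4).is_ok());
    assert!(precheck_resize_vcpus(1, 8, 4).is_err());
    assert!(precheck_resize_vcpus(1, 0, 4).is_err());
}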

View File

@@ -157,7 +157,11 @@ impl DragonballInner {
.context("insert vsock")
}
fn parse_inline_virtiofs_args(&self, fs_cfg: &mut FsDeviceConfigInfo) -> Result<()> {
fn parse_inline_virtiofs_args(
&self,
fs_cfg: &mut FsDeviceConfigInfo,
options: &mut Vec<String>,
) -> Result<()> {
let mut debug = false;
let mut opt_list = String::new();
@@ -169,8 +173,8 @@ impl DragonballInner {
sl!(),
"args: {:?}", &self.config.shared_fs.virtio_fs_extra_args
);
let args = &self.config.shared_fs.virtio_fs_extra_args;
let _ = go_flag::parse_args_with_warnings::<String, _, _>(args, None, |flags| {
let mut args = self.config.shared_fs.virtio_fs_extra_args.clone();
let _ = go_flag::parse_args_with_warnings::<String, _, _>(&args, None, |flags| {
flags.add_flag("d", &mut debug);
flags.add_flag("thread-pool-size", &mut fs_cfg.thread_pool_size);
flags.add_flag("drop-sys-resource", &mut fs_cfg.drop_sys_resource);
@@ -178,6 +182,9 @@ impl DragonballInner {
})
.with_context(|| format!("parse args: {:?}", args))?;
// more options parsed for inline virtio-fs' custom config
args.append(options);
if debug {
warn!(
sl!(),
@@ -202,6 +209,7 @@ impl DragonballInner {
"xattr" => fs_cfg.xattr = true,
"no_xattr" => fs_cfg.xattr = false,
"cache_symlinks" => {} // inline virtiofs always cache symlinks
"no_readdir" => fs_cfg.no_readdir = true,
"trace" => warn!(
sl!(),
"Inline virtiofs \"-o trace\" option not supported yet, ignored."
@@ -234,16 +242,25 @@ impl DragonballInner {
xattr: true,
..Default::default()
};
self.do_add_fs_device(&config.fs_type, &mut fs_cfg)
let mut options = config.options.clone();
self.do_add_fs_device(&config.fs_type, &mut fs_cfg, &mut options)
}
fn do_add_fs_device(&self, fs_type: &str, fs_cfg: &mut FsDeviceConfigInfo) -> Result<()> {
fn do_add_fs_device(
&self,
fs_type: &str,
fs_cfg: &mut FsDeviceConfigInfo,
options: &mut Vec<String>,
) -> Result<()> {
match fs_type {
VIRTIO_FS => {
fs_cfg.mode = String::from("vhostuser");
}
INLINE_VIRTIO_FS => {
self.parse_inline_virtiofs_args(fs_cfg)?;
// Parameters starting with --patch-fs are consumed by patch fs and do not need to be processed here
options.retain(|x| !x.starts_with("--patch-fs"));
self.parse_inline_virtiofs_args(fs_cfg, options)?;
}
_ => {
return Err(anyhow!(
@@ -311,8 +328,12 @@ mod tests {
"--drop-sys-resource".to_string(),
"-d".to_string(),
];
let mut options: Vec<String> = Vec::new();
dragonball.config.shared_fs.virtio_fs_cache = "auto".to_string();
dragonball.parse_inline_virtiofs_args(&mut fs_cfg).unwrap();
dragonball
.parse_inline_virtiofs_args(&mut fs_cfg, &mut options)
.unwrap();
assert!(!fs_cfg.no_open);
assert!(fs_cfg.xattr);
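A standalone sketch of the option merging above: drop everything intended for patch fs, then append the remaining per-mount options to the extra args before flag parsing. All values are hypothetical.

fn main() {
    // Hypothetical extra args from the hypervisor config plus per-mount options.
    let mut args = vec!["--thread-pool-size=1".to_string(), "-d".to_string()];
    let mut options = vec![
        "--patch-fs-foo=bar".to_string(), // consumed by patch fs, dropped here
        "no_readdir".to_string(),
    ];

    // Drop everything meant for patch fs, then merge the rest for parsing.
    options.retain(|x| !x.starts_with("--patch-fs"));
    args.append(&mut options);

    assert_eq!(args, vec!["--thread-pool-size=1", "-d", "no_readdir"]);
}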

View File

@@ -77,6 +77,12 @@ impl Hypervisor for Dragonball {
inner.save_vm().await
}
// returns Result<(old_vcpus, new_vcpus)>
async fn resize_vcpu(&self, old_vcpus: u32, new_vcpus: u32) -> Result<(u32, u32)> {
let inner = self.inner.read().await;
inner.resize_vcpu(old_vcpus, new_vcpus).await
}
async fn add_device(&self, device: DeviceType) -> Result<()> {
let mut inner = self.inner.write().await;
inner.add_device(device).await

View File

@@ -16,8 +16,8 @@ use crossbeam_channel::{unbounded, Receiver, Sender};
use dragonball::{
api::v1::{
BlockDeviceConfigInfo, BootSourceConfig, FsDeviceConfigInfo, FsMountConfigInfo,
InstanceInfo, InstanceState, VirtioNetDeviceConfigInfo, VmmAction, VmmActionError, VmmData,
VmmRequest, VmmResponse, VmmService, VsockDeviceConfigInfo,
InstanceInfo, InstanceState, VcpuResizeInfo, VirtioNetDeviceConfigInfo, VmmAction,
VmmActionError, VmmData, VmmRequest, VmmResponse, VmmService, VsockDeviceConfigInfo,
},
vm::VmConfigInfo,
Vmm,
@@ -248,6 +248,12 @@ impl VmmInstance {
Ok(())
}
pub fn resize_vcpu(&self, cfg: &VcpuResizeInfo) -> Result<()> {
self.handle_request(Request::Sync(VmmAction::ResizeVcpu(cfg.clone())))
.with_context(|| format!("Failed to resize_vm(hotplug vcpu), cfg: {:?}", cfg))?;
Ok(())
}
pub fn pause(&self) -> Result<()> {
todo!()
}

View File

@@ -78,6 +78,7 @@ pub trait Hypervisor: Send + Sync {
async fn pause_vm(&self) -> Result<()>;
async fn save_vm(&self) -> Result<()>;
async fn resume_vm(&self) -> Result<()>;
async fn resize_vcpu(&self, old_vcpus: u32, new_vcpus: u32) -> Result<(u32, u32)>; // returns (old_vcpus, new_vcpus)
// device manager
async fn add_device(&self, device: DeviceType) -> Result<()>;

View File

@@ -104,6 +104,11 @@ impl QemuInner {
todo!()
}
pub(crate) async fn resize_vcpu(&self, _old_vcpus: u32, _new_vcpus: u32) -> Result<(u32, u32)> {
info!(sl!(), "QemuInner::resize_vcpu()");
todo!()
}
pub(crate) async fn get_pids(&self) -> Result<Vec<u32>> {
info!(sl!(), "QemuInner::get_pids()");
todo!()

View File

@@ -118,6 +118,11 @@ impl Hypervisor for Qemu {
inner.cleanup().await
}
async fn resize_vcpu(&self, old_vcpus: u32, new_vcpus: u32) -> Result<(u32, u32)> {
let inner = self.inner.read().await;
inner.resize_vcpu(old_vcpus, new_vcpus).await
}
async fn get_pids(&self) -> Result<Vec<u32>> {
let inner = self.inner.read().await;
inner.get_pids().await

View File

@@ -41,4 +41,5 @@ logging = { path = "../../../libs/logging" }
oci = { path = "../../../libs/oci" }
actix-rt = "2.7.0"
persist = { path = "../persist"}
[features]

View File

@@ -26,6 +26,8 @@ use oci::LinuxResources;
use persist::sandbox_persist::Persist;
use tokio::sync::RwLock;
use crate::ResourceUpdateOp;
const OS_ERROR_NO_SUCH_PROCESS: i32 = 3;
pub struct CgroupArgs {
@@ -149,29 +151,51 @@ impl CgroupsResource {
&self,
cid: &str,
linux_resources: Option<&LinuxResources>,
op: ResourceUpdateOp,
h: &dyn Hypervisor,
) -> Result<()> {
let resource = self.calc_resource(linux_resources);
let changed = self.update_resources(cid, resource).await;
let new_resources = self.calc_resource(linux_resources);
let old_resources = self.update_resources(cid, new_resources.clone(), op).await;
if !changed {
return Ok(());
}
self.do_update_cgroups(h).await
}
async fn update_resources(&self, cid: &str, new_resource: Resources) -> bool {
let mut resources = self.resources.write().await;
let old_resource = resources.insert(cid.to_owned(), new_resource.clone());
if let Some(old_resource) = old_resource {
if old_resource == new_resource {
return false;
if let Some(old_resource) = old_resources.clone() {
if old_resource == new_resources {
return Ok(());
}
}
true
match self.do_update_cgroups(h).await {
Err(e) => {
// if update failed, we should roll back the records in resources
let mut resources = self.resources.write().await;
match op {
ResourceUpdateOp::Add => {
resources.remove(cid);
}
ResourceUpdateOp::Update | ResourceUpdateOp::Del => {
if let Some(old_resource) = old_resources {
resources.insert(cid.to_owned(), old_resource);
}
}
}
Err(e)
}
Ok(()) => Ok(()),
}
}
async fn update_resources(
&self,
cid: &str,
new_resource: Resources,
op: ResourceUpdateOp,
) -> Option<Resources> {
let mut resources = self.resources.write().await;
match op {
ResourceUpdateOp::Add | ResourceUpdateOp::Update => {
resources.insert(cid.to_owned(), new_resource.clone())
}
ResourceUpdateOp::Del => resources.remove(cid),
}
}
async fn do_update_cgroups(&self, h: &dyn Hypervisor) -> Result<()> {
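A simplified, synchronous sketch of the record-then-rollback pattern above, using a plain HashMap in place of the RwLock-guarded resources map; the short-circuit for unchanged resources is omitted.

use std::collections::HashMap;

#[derive(Clone, Copy, Debug)]
enum Op { Add, Update, Del }

// Record the new resource first, then undo the record if applying it fails.
fn update(
    records: &mut HashMap<String, u64>,
    cid: &str,
    new: u64,
    op: Op,
    apply: impl Fn() -> Result<(), String>,
) -> Result<(), String> {
    let old = match op {
        Op::Add | Op::Update => records.insert(cid.to_string(), new),
        Op::Del => records.remove(cid),
    };
    if let Err(e) = apply() {
        // roll back the record on failure
        match op {
            Op::Add => { records.remove(cid); }
            Op::Update | Op::Del => {
                if let Some(old) = old {
                    records.insert(cid.to_string(), old);
                }
            }
        }
        return Err(e);
    }
    Ok(())
}

fn main() {
    let mut records = HashMap::from([("c1".to_string(), 1u64)]);
    // A failing apply rolls the record back to the previous value.
    let _ = update(&mut records, "c1", 2, Op::Update, || Err("cgroup write failed".into()));
    assert_eq!(records["c1"], 1);
}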

View File

@@ -0,0 +1,188 @@
// Copyright (c) 2019-2022 Alibaba Cloud
// Copyright (c) 2019-2022 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
use std::{
cmp,
collections::{HashMap, HashSet},
convert::TryFrom,
sync::Arc,
};
use agent::{Agent, OnlineCPUMemRequest};
use anyhow::{Context, Ok, Result};
use hypervisor::Hypervisor;
use kata_types::{config::TomlConfig, cpu::LinuxContainerCpuResources};
use oci::LinuxCpu;
use tokio::sync::RwLock;
use crate::ResourceUpdateOp;
#[derive(Default, Debug, Clone)]
pub struct CpuResource {
/// Current number of vCPUs
pub(crate) current_vcpu: Arc<RwLock<u32>>,
/// Default number of vCPUs
pub(crate) default_vcpu: u32,
/// CpuResource of each container
pub(crate) container_cpu_resources: Arc<RwLock<HashMap<String, LinuxContainerCpuResources>>>,
}
impl CpuResource {
pub fn new(config: Arc<TomlConfig>) -> Result<Self> {
let hypervisor_name = config.runtime.hypervisor_name.clone();
let hypervisor_config = config
.hypervisor
.get(&hypervisor_name)
.context(format!("failed to get hypervisor {}", hypervisor_name))?;
Ok(Self {
current_vcpu: Arc::new(RwLock::new(hypervisor_config.cpu_info.default_vcpus as u32)),
default_vcpu: hypervisor_config.cpu_info.default_vcpus as u32,
container_cpu_resources: Arc::new(RwLock::new(HashMap::new())),
})
}
pub(crate) async fn update_cpu_resources(
&self,
cid: &str,
linux_cpus: Option<&LinuxCpu>,
op: ResourceUpdateOp,
hypervisor: &dyn Hypervisor,
agent: &dyn Agent,
) -> Result<()> {
self.update_container_cpu_resources(cid, linux_cpus, op)
.await
.context("update container cpu resources")?;
let vcpu_required = self
.calc_cpu_resources()
.await
.context("calculate vcpus required")?;
if vcpu_required == self.current_vcpu().await {
return Ok(());
}
let curr_vcpus = self
.do_update_cpu_resources(vcpu_required, op, hypervisor, agent)
.await?;
self.update_current_vcpu(curr_vcpus).await;
Ok(())
}
async fn current_vcpu(&self) -> u32 {
let current_vcpu = self.current_vcpu.read().await;
*current_vcpu
}
async fn update_current_vcpu(&self, new_vcpus: u32) {
let mut current_vcpu = self.current_vcpu.write().await;
*current_vcpu = new_vcpus;
}
// update container_cpu_resources field
async fn update_container_cpu_resources(
&self,
cid: &str,
linux_cpus: Option<&LinuxCpu>,
op: ResourceUpdateOp,
) -> Result<()> {
if let Some(cpu) = linux_cpus {
let container_resource = LinuxContainerCpuResources::try_from(cpu)?;
let mut resources = self.container_cpu_resources.write().await;
match op {
ResourceUpdateOp::Add => {
resources.insert(cid.to_owned(), container_resource);
}
ResourceUpdateOp::Update => {
let resource = resources.insert(cid.to_owned(), container_resource.clone());
if let Some(old_container_resource) = resource {
// cpu-quota takes priority over cpuset when determining the number of vcpus,
// so ignore an update that changes only the cpuset if a cpu-quota
// has been set previously.
if old_container_resource.quota() > 0 && container_resource.quota() < 0 {
resources.insert(cid.to_owned(), old_container_resource);
}
}
}
ResourceUpdateOp::Del => {
resources.remove(cid);
}
}
}
Ok(())
}
// calculates the total required vcpus by adding each container's requirements within the pod
async fn calc_cpu_resources(&self) -> Result<u32> {
let mut total_vcpu = 0;
let mut cpuset_vcpu: HashSet<u32> = HashSet::new();
let resources = self.container_cpu_resources.read().await;
for (_, cpu_resource) in resources.iter() {
let vcpu = cpu_resource.get_vcpus().unwrap_or(0) as u32;
cpuset_vcpu.extend(cpu_resource.cpuset().iter());
total_vcpu += vcpu;
}
// constrained only by cpuset
if total_vcpu == 0 && !cpuset_vcpu.is_empty() {
info!(sl!(), "(from cpuset)get vcpus # {:?}", cpuset_vcpu);
return Ok(cpuset_vcpu.len() as u32);
}
info!(
sl!(),
"(from cfs_quota&cfs_period)get vcpus count {}", total_vcpu
);
Ok(total_vcpu)
}
// do hotplug and hot-unplug the vcpu
async fn do_update_cpu_resources(
&self,
new_vcpus: u32,
op: ResourceUpdateOp,
hypervisor: &dyn Hypervisor,
agent: &dyn Agent,
) -> Result<u32> {
let old_vcpus = self.current_vcpu().await;
// when adding vcpus, ignore old_vcpus > new_vcpus
// when deleting vcpus, ignore old_vcpus < new_vcpus
if (op == ResourceUpdateOp::Add && old_vcpus > new_vcpus)
|| (op == ResourceUpdateOp::Del && old_vcpus < new_vcpus)
{
return Ok(old_vcpus);
}
// do not reduce computing power
// the number of vcpus would not be lower than the default size
let new_vcpus = cmp::max(new_vcpus, self.default_vcpu);
let (old, new) = hypervisor
.resize_vcpu(old_vcpus, new_vcpus)
.await
.context("resize vcpus")?;
if old < new {
let add = new - old;
info!(sl!(), "request to onlineCpuMem with {:?} cpus", add);
agent
.online_cpu_mem(OnlineCPUMemRequest {
wait: false,
nb_cpus: new,
cpu_only: true,
})
.await
.context("online vcpus")?;
}
Ok(new)
}
}
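A worked standalone sketch of the vCPU sizing above, assuming get_vcpus() is ceil(quota / period): two containers with 2.2 and 1.5 CPUs of quota need 5 vCPUs, while a quota-less pod constrained only by cpuset {0, 1, 2} needs 3.

use std::collections::HashSet;

// Assumed per-container rule (matching the test data in this PR):
// vcpus = ceil(quota / period) when quota > 0, otherwise 0.
fn vcpus_from_quota(quota: i64, period: u64) -> u32 {
    if quota <= 0 || period == 0 {
        return 0;
    }
    ((quota as u64 + period - 1) / period) as u32
}

fn main() {
    // Two containers: 2.2 CPUs and 1.5 CPUs of quota -> 3 + 2 = 5 vCPUs.
    let containers = [(220_000i64, 100_000u64), (150_000, 100_000)];
    let total: u32 = containers.iter().map(|&(q, p)| vcpus_from_quota(q, p)).sum();
    assert_eq!(total, 5);

    // Quota-less pod constrained only by cpuset {0, 1, 2} -> 3 vCPUs.
    let cpuset: HashSet<u32> = [0, 1, 2].into_iter().collect();
    assert_eq!(cpuset.len() as u32, 3);
}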

View File

@@ -13,17 +13,16 @@ use kata_types::{
cpu::LinuxContainerCpuResources, k8s::container_type,
};
// static resource that StaticResourceManager needs, this is the spec for the
// initial resource that InitialSizeManager needs, this is the spec for the
// sandbox/container's workload
#[derive(Clone, Copy, Debug)]
struct StaticResource {
struct InitialSize {
vcpu: u32,
mem_mb: u32,
}
// generate static resource(vcpu and memory in MiB) from spec's information
// used for static resource management
impl TryFrom<&oci::Spec> for StaticResource {
// generate initial resource(vcpu and memory in MiB) from spec's information
impl TryFrom<&oci::Spec> for InitialSize {
type Error = anyhow::Error;
fn try_from(spec: &oci::Spec) -> Result<Self> {
let mut vcpu: u32 = 0;
@@ -65,31 +64,32 @@ impl TryFrom<&oci::Spec> for StaticResource {
}
info!(
sl!(),
"static resource mgmt result: vcpu={}, mem_mb={}", vcpu, mem_mb
"(from PodSandbox's annotation / SingleContainer's spec) initial size: vcpu={}, mem_mb={}", vcpu, mem_mb
);
Ok(Self { vcpu, mem_mb })
}
}
// StaticResourceManager is responsible for static resource management
// InitialSizeManager is responsible for initial vcpu/mem management
//
// static resource management sizing information is optionally provided, either by
// initial vcpu/mem management sizing information is optionally provided, either by
// upper layer runtime (containerd / crio) or by the container spec itself (when it
// is a standalone single container such as the one started with *docker run*)
//
// the sizing information uses three values, cpu quota, cpu period and memory limit,
// and with above values it calculates the # vcpus and memory for the workload and
// add them to default value of the config
// and with above values it calculates the # vcpus and memory for the workload
//
// if the workload # of vcpus and memory is invalid for vmms, we still use default
// value in toml_config
#[derive(Clone, Copy, Debug)]
pub struct StaticResourceManager {
resource: StaticResource,
pub struct InitialSizeManager {
resource: InitialSize,
}
impl StaticResourceManager {
impl InitialSizeManager {
pub fn new(spec: &oci::Spec) -> Result<Self> {
Ok(Self {
resource: StaticResource::try_from(spec)
.context("failed to construct static resource")?,
resource: InitialSize::try_from(spec).context("failed to construct static resource")?,
})
}
@@ -100,8 +100,13 @@ impl StaticResourceManager {
.hypervisor
.get_mut(hypervisor_name)
.context("failed to get hypervisor config")?;
hv.cpu_info.default_vcpus += self.resource.vcpu as i32;
hv.memory_info.default_memory += self.resource.mem_mb;
if self.resource.vcpu > 0 {
hv.cpu_info.default_vcpus = self.resource.vcpu as i32
}
if self.resource.mem_mb > 0 {
hv.memory_info.default_memory = self.resource.mem_mb;
}
Ok(())
}
}
@@ -151,7 +156,7 @@ mod tests {
struct TestData<'a> {
desc: &'a str,
input: InputData,
result: StaticResource,
result: InitialSize,
}
fn get_test_data() -> Vec<TestData<'static>> {
@@ -163,7 +168,7 @@ mod tests {
quota: None,
memory: None,
},
result: StaticResource { vcpu: 0, mem_mb: 0 },
result: InitialSize { vcpu: 0, mem_mb: 0 },
},
TestData {
desc: "normal resource limit",
@@ -173,7 +178,7 @@ mod tests {
quota: Some(220_000),
memory: Some(1024 * 1024 * 512),
},
result: StaticResource {
result: InitialSize {
vcpu: 3,
mem_mb: 512,
},
@@ -183,7 +188,7 @@ mod tests {
}
#[test]
fn test_static_resource_mgmt_sandbox() {
fn test_initial_size_sandbox() {
let tests = get_test_data();
// run tests
@@ -210,22 +215,22 @@ mod tests {
..Default::default()
};
let static_resource = StaticResource::try_from(&spec);
let initial_size = InitialSize::try_from(&spec);
assert!(
static_resource.is_ok(),
initial_size.is_ok(),
"test[{}]: {:?} should be ok",
i,
d.desc
);
let static_resource = static_resource.unwrap();
let initial_size = initial_size.unwrap();
assert_eq!(
static_resource.vcpu, d.result.vcpu,
initial_size.vcpu, d.result.vcpu,
"test[{}]: {:?} vcpu should be {}",
i, d.desc, d.result.vcpu,
);
assert_eq!(
static_resource.mem_mb, d.result.mem_mb,
initial_size.mem_mb, d.result.mem_mb,
"test[{}]: {:?} memory should be {}",
i, d.desc, d.result.mem_mb,
);
@@ -233,7 +238,7 @@ mod tests {
}
#[test]
fn test_static_resource_mgmt_container() {
fn test_initial_size_container() {
let tests = get_test_data();
// run tests
@@ -261,22 +266,22 @@ mod tests {
..Default::default()
};
let static_resource = StaticResource::try_from(&spec);
let initial_size = InitialSize::try_from(&spec);
assert!(
static_resource.is_ok(),
initial_size.is_ok(),
"test[{}]: {:?} should be ok",
i,
d.desc
);
let static_resource = static_resource.unwrap();
let initial_size = initial_size.unwrap();
assert_eq!(
static_resource.vcpu, d.result.vcpu,
initial_size.vcpu, d.result.vcpu,
"test[{}]: {:?} vcpu should be {}",
i, d.desc, d.result.vcpu,
);
assert_eq!(
static_resource.mem_mb, d.result.mem_mb,
initial_size.mem_mb, d.result.mem_mb,
"test[{}]: {:?} memory should be {}",
i, d.desc, d.result.mem_mb,
);
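A small worked example of the initial sizing used in the test data above: ceil(220000 / 100000) = 3 vCPUs and 512 * 1024 * 1024 bytes = 512 MiB.

fn main() {
    let (period, quota) = (100_000u64, 220_000u64);
    let vcpu = (quota + period - 1) / period; // ceil(2.2) = 3
    let mem_bytes = 1024u64 * 1024 * 512;
    let mem_mb = mem_bytes / (1024 * 1024); // 512 MiB
    assert_eq!((vcpu, mem_mb), (3, 512));
}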

View File

@@ -0,0 +1,8 @@
// Copyright (c) 2019-2023 Alibaba Cloud
// Copyright (c) 2019-2023 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
pub mod cpu;
pub mod initial_size;

View File

@@ -22,6 +22,7 @@ pub mod rootfs;
pub mod share_fs;
pub mod volume;
pub use manager::ResourceManager;
pub mod cpu_mem;
use kata_types::config::hypervisor::SharedFsInfo;
@@ -30,3 +31,10 @@ pub enum ResourceConfig {
Network(NetworkConfig),
ShareFs(SharedFsInfo),
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ResourceUpdateOp {
Add,
Del,
Update,
}

View File

@@ -4,9 +4,8 @@
// SPDX-License-Identifier: Apache-2.0
//
use crate::network::NetworkConfig;
use crate::resource_persist::ResourceState;
use crate::{manager_inner::ResourceManagerInner, rootfs::Rootfs, volume::Volume, ResourceConfig};
use std::sync::Arc;
use agent::types::Device;
use agent::{Agent, Storage};
use anyhow::Result;
@@ -17,9 +16,13 @@ use kata_types::config::TomlConfig;
use kata_types::mount::Mount;
use oci::{Linux, LinuxResources};
use persist::sandbox_persist::Persist;
use std::sync::Arc;
use tokio::sync::RwLock;
use crate::network::NetworkConfig;
use crate::resource_persist::ResourceState;
use crate::ResourceUpdateOp;
use crate::{manager_inner::ResourceManagerInner, rootfs::Rootfs, volume::Volume, ResourceConfig};
pub struct ManagerArgs {
pub sid: String,
pub agent: Arc<dyn Agent>,
@@ -110,13 +113,14 @@ impl ResourceManager {
inner.dump().await
}
pub async fn update_cgroups(
pub async fn update_linux_resource(
&self,
cid: &str,
linux_resources: Option<&LinuxResources>,
) -> Result<()> {
op: ResourceUpdateOp,
) -> Result<Option<LinuxResources>> {
let inner = self.inner.read().await;
inner.update_cgroups(cid, linux_resources).await
inner.update_linux_resource(cid, linux_resources, op).await
}
pub async fn cleanup(&self) -> Result<()> {

View File

@@ -4,30 +4,34 @@
// SPDX-License-Identifier: Apache-2.0
//
use std::{sync::Arc, thread, vec};
use std::{sync::Arc, thread};
use crate::{network::NetworkConfig, resource_persist::ResourceState};
use agent::{types::Device, Agent, Storage};
use anyhow::{anyhow, Context, Ok, Result};
use async_trait::async_trait;
use hypervisor::{
device::{device_manager::DeviceManager, DeviceConfig},
device::{
device_manager::{do_handle_device, DeviceManager},
DeviceConfig, DeviceType,
},
BlockConfig, Hypervisor,
};
use kata_types::config::TomlConfig;
use kata_types::mount::Mount;
use oci::{Linux, LinuxResources};
use oci::{Linux, LinuxCpu, LinuxResources};
use persist::sandbox_persist::Persist;
use tokio::{runtime, sync::RwLock};
use crate::{
cgroups::{CgroupArgs, CgroupsResource},
cpu_mem::cpu::CpuResource,
manager::ManagerArgs,
network::{self, Network},
network::{self, Network, NetworkConfig},
resource_persist::ResourceState,
rootfs::{RootFsResource, Rootfs},
share_fs::{self, ShareFs},
share_fs::{self, sandbox_bind_mounts::SandboxBindMounts, ShareFs},
volume::{Volume, VolumeResource},
ResourceConfig,
ResourceConfig, ResourceUpdateOp,
};
pub(crate) struct ResourceManagerInner {
@@ -42,6 +46,7 @@ pub(crate) struct ResourceManagerInner {
pub rootfs_resource: RootFsResource,
pub volume_resource: VolumeResource,
pub cgroups_resource: CgroupsResource,
pub cpu_resource: CpuResource,
}
impl ResourceManagerInner {
@@ -51,12 +56,12 @@ impl ResourceManagerInner {
hypervisor: Arc<dyn Hypervisor>,
toml_config: Arc<TomlConfig>,
) -> Result<Self> {
let cgroups_resource = CgroupsResource::new(sid, &toml_config)?;
// create device manager
let dev_manager =
DeviceManager::new(hypervisor.clone()).context("failed to create device manager")?;
let cgroups_resource = CgroupsResource::new(sid, &toml_config)?;
let cpu_resource = CpuResource::new(toml_config.clone())?;
Ok(Self {
sid: sid.to_string(),
toml_config,
@@ -68,6 +73,7 @@ impl ResourceManagerInner {
rootfs_resource: RootFsResource::new(),
volume_resource: VolumeResource::new(),
cgroups_resource,
cpu_resource,
})
}
@@ -97,6 +103,12 @@ impl ResourceManagerInner {
.setup_device_before_start_vm(self.hypervisor.as_ref())
.await
.context("setup share fs device before start vm")?;
// setup sandbox bind mounts: setup = true
self.handle_sandbox_bindmounts(true)
.await
.context("failed setup sandbox bindmounts")?;
Some(share_fs)
} else {
None
@@ -249,6 +261,7 @@ impl ResourceManagerInner {
spec,
self.device_manager.as_ref(),
&self.sid,
self.agent.clone(),
)
.await
}
@@ -258,42 +271,23 @@ impl ResourceManagerInner {
for d in linux.devices.iter() {
match d.r#type.as_str() {
"b" => {
let device_info = DeviceConfig::BlockCfg(BlockConfig {
let dev_info = DeviceConfig::BlockCfg(BlockConfig {
major: d.major,
minor: d.minor,
..Default::default()
});
let device_id = self
.device_manager
.write()
.await
.new_device(&device_info)
.await
.context("failed to create deviec")?;
self.device_manager
.write()
let device_info = do_handle_device(&self.device_manager, &dev_info)
.await
.try_add_device(&device_id)
.await
.context("failed to add deivce")?;
// get complete device information
let dev_info = self
.device_manager
.read()
.await
.get_device_info(&device_id)
.await
.context("failed to get device info")?;
.context("do handle device")?;
// create agent device
if let DeviceConfig::BlockCfg(config) = dev_info {
if let DeviceType::Block(device) = device_info {
let agent_device = Device {
id: device_id.clone(),
id: device.device_id.clone(),
container_path: d.path.clone(),
field_type: config.driver_option,
vm_path: config.virt_path,
field_type: device.config.driver_option,
vm_path: device.config.virt_path,
..Default::default()
};
devices.push(agent_device);
@@ -308,14 +302,20 @@ impl ResourceManagerInner {
Ok(devices)
}
pub async fn update_cgroups(
&self,
cid: &str,
linux_resources: Option<&LinuxResources>,
) -> Result<()> {
self.cgroups_resource
.update_cgroups(cid, linux_resources, self.hypervisor.as_ref())
.await
async fn handle_sandbox_bindmounts(&self, setup: bool) -> Result<()> {
let bindmounts = self.toml_config.runtime.sandbox_bind_mounts.clone();
if bindmounts.is_empty() {
info!(sl!(), "sandbox bindmounts empty, just skip it.");
return Ok(());
}
let sb_bindmnt = SandboxBindMounts::new(self.sid.clone(), bindmounts)?;
if setup {
sb_bindmnt.setup_sandbox_bind_mounts()
} else {
sb_bindmnt.cleanup_sandbox_bind_mounts()
}
}
pub async fn cleanup(&self) -> Result<()> {
@@ -324,6 +324,12 @@ impl ResourceManagerInner {
.delete()
.await
.context("delete cgroup")?;
// cleanup sandbox bind mounts: setup = false
self.handle_sandbox_bindmounts(false)
.await
.context("failed to cleanup sandbox bindmounts")?;
// clean up share fs mount
if let Some(share_fs) = &self.share_fs {
share_fs
@@ -340,6 +346,57 @@ impl ResourceManagerInner {
self.rootfs_resource.dump().await;
self.volume_resource.dump().await;
}
pub async fn update_linux_resource(
&self,
cid: &str,
linux_resources: Option<&LinuxResources>,
op: ResourceUpdateOp,
) -> Result<Option<LinuxResources>> {
let linux_cpus = || -> Option<&LinuxCpu> { linux_resources.as_ref()?.cpu.as_ref() }();
// if static_sandbox_resource_mgmt is enabled, we do not need to update the sandbox's CPU or memory resources
if !self.toml_config.runtime.static_sandbox_resource_mgmt {
self.cpu_resource
.update_cpu_resources(
cid,
linux_cpus,
op,
self.hypervisor.as_ref(),
self.agent.as_ref(),
)
.await?;
}
// we should first update the vCPUs and memory, and then update the host cgroups
self.cgroups_resource
.update_cgroups(cid, linux_resources, op, self.hypervisor.as_ref())
.await?;
// update the linux resources for agent
self.agent_linux_resources(linux_resources)
}
fn agent_linux_resources(
&self,
linux_resources: Option<&LinuxResources>,
) -> Result<Option<LinuxResources>> {
let mut resources = match linux_resources {
Some(linux_resources) => linux_resources.clone(),
None => {
return Ok(None);
}
};
// clear the cpuset
// For example, if there are only 5 vCPUs now and the cpuset in LinuxResources is 0-2,6, the guest OS will
// report an error when creating the container, so we choose to clear the cpuset here.
if let Some(cpu) = &mut resources.cpu {
cpu.cpus = String::new();
}
Ok(Some(resources))
}
}
#[async_trait]
@@ -386,6 +443,7 @@ impl Persist for ResourceManagerInner {
)
.await?,
toml_config: Arc::new(TomlConfig::default()),
cpu_resource: CpuResource::default(),
})
}
}
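For orientation, a minimal sketch of how a caller might drive the new single entry point. It assumes a ResourceManager handle (ResourceManager and ResourceUpdateOp are the public exports shown earlier) and that oci::LinuxResources/LinuxCpu implement Default; the container id and cpuset value are hypothetical, and the real call sites are not part of this diff:

use anyhow::Result;
use oci::{LinuxCpu, LinuxResources};

// Hypothetical helper: ask the resource manager to apply a new cpuset.
async fn apply_cpu_update(
    mgr: &ResourceManager,
    cid: &str,
) -> Result<Option<LinuxResources>> {
    // Request a new cpuset for the container (value is hypothetical).
    let resources = LinuxResources {
        cpu: Some(LinuxCpu {
            cpus: "0-2".to_string(),
            ..Default::default()
        }),
        ..Default::default()
    };

    // Updates the vCPUs first, then the host cgroups, and returns the
    // resources (with the cpuset cleared) to be forwarded to the agent,
    // matching update_linux_resource() / agent_linux_resources() above.
    mgr.update_linux_resource(cid, Some(&resources), ResourceUpdateOp::Update)
        .await
}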

View File

@@ -10,7 +10,10 @@ use agent::Storage;
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::{
device::{device_manager::DeviceManager, DeviceConfig},
device::{
device_manager::{do_handle_device, DeviceManager},
DeviceConfig, DeviceType,
},
BlockConfig,
};
use kata_types::mount::Mount;
@@ -46,18 +49,10 @@ impl BlockRootfs {
..Default::default()
};
let device_id = d
.write()
// create and insert block device into Kata VM
let device_info = do_handle_device(d, &DeviceConfig::BlockCfg(block_device_config.clone()))
.await
.new_device(&DeviceConfig::BlockCfg(block_device_config.clone()))
.await
.context("failed to create deviec")?;
d.write()
.await
.try_add_device(device_id.as_str())
.await
.context("failed to add deivce")?;
.context("do handle device failed.")?;
let mut storage = Storage {
fs_type: rootfs.fs_type.clone(),
@@ -66,17 +61,11 @@ impl BlockRootfs {
..Default::default()
};
// get complete device information
let dev_info = d
.read()
.await
.get_device_info(device_id.as_str())
.await
.context("failed to get device info")?;
if let DeviceConfig::BlockCfg(config) = dev_info {
storage.driver = config.driver_option;
storage.source = config.virt_path;
let mut device_id: String = "".to_owned();
if let DeviceType::Block(device) = device_info {
storage.driver = device.config.driver_option;
storage.source = device.config.virt_path;
device_id = device.device_id;
}
Ok(Self {
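The recurring change across these files is replacing the earlier three-step sequence (new_device, try_add_device, get_device_info) with a single do_handle_device call. A minimal sketch of the new pattern, assuming a DeviceManager behind a tokio RwLock as in the signatures above; the major/minor numbers are purely hypothetical:

use anyhow::{Context, Result};
use hypervisor::{
    device::{
        device_manager::{do_handle_device, DeviceManager},
        DeviceConfig, DeviceType,
    },
    BlockConfig,
};
use tokio::sync::RwLock;

async fn attach_block_device(d: &RwLock<DeviceManager>) -> Result<String> {
    // Describe the block device the VM should see (hypothetical numbers).
    let cfg = DeviceConfig::BlockCfg(BlockConfig {
        major: 8,
        minor: 0,
        ..Default::default()
    });

    // do_handle_device creates the device, attaches it to the VM and returns
    // the fully populated device description in one call.
    let device_info = do_handle_device(d, &cfg).await.context("do handle device")?;

    // Unwrap the block variant to get the guest-visible details.
    if let DeviceType::Block(device) = device_info {
        // device.config.virt_path is the in-guest path, e.g. /dev/vdX
        return Ok(device.device_id);
    }
    anyhow::bail!("expected a block device")
}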

View File

@@ -18,6 +18,7 @@ pub use utils::{
mod virtio_fs_share_mount;
use virtio_fs_share_mount::VirtiofsShareMount;
pub use virtio_fs_share_mount::EPHEMERAL_PATH;
pub mod sandbox_bind_mounts;
use std::{collections::HashMap, fmt::Debug, path::PathBuf, sync::Arc};

View File

@@ -0,0 +1,155 @@
// Copyright (c) 2023 Alibaba Cloud
// Copyright (c) 2023 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
// Note:
// sandbox_bind_mounts supports several kinds of mount patterns, for example:
// (1) "/path/to", with default readonly mode.
// (2) "/path/to:ro", same as (1).
// (3) "/path/to:rw", with read-write mode.
//
// sandbox_bind_mounts: ["/path/to", "/path/to:rw", "/mnt/to:ro"]
//
use std::{
collections::HashMap,
fs,
path::{Path, PathBuf},
};
use anyhow::{anyhow, Context, Result};
use super::utils::{do_get_host_path, mkdir_with_permissions};
use kata_sys_util::{fs::get_base_name, mount};
use kata_types::mount::{SANDBOX_BIND_MOUNTS_DIR, SANDBOX_BIND_MOUNTS_RO, SANDBOX_BIND_MOUNTS_RW};
#[derive(Clone, Default, Debug)]
pub struct SandboxBindMounts {
sid: String,
host_mounts_path: PathBuf,
sandbox_bindmounts: Vec<String>,
}
impl SandboxBindMounts {
pub fn new(sid: String, sandbox_bindmounts: Vec<String>) -> Result<Self> {
// /run/kata-containers/shared/sandboxes/<sid>/rw/passthrough/sandbox-mounts
let bindmounts_path =
do_get_host_path(SANDBOX_BIND_MOUNTS_DIR, sid.as_str(), "", true, false);
let host_mounts_path = PathBuf::from(bindmounts_path);
Ok(SandboxBindMounts {
sid,
host_mounts_path,
sandbox_bindmounts,
})
}
fn parse_sandbox_bind_mounts<'a>(&self, bindmnt_src: &'a str) -> Result<(&'a str, &'a str)> {
// get the bindmount's r/w mode
let bindmount_mode = if bindmnt_src.ends_with(SANDBOX_BIND_MOUNTS_RW) {
SANDBOX_BIND_MOUNTS_RW
} else {
SANDBOX_BIND_MOUNTS_RO
};
// get the true bindmount from the string
let bindmount = bindmnt_src.trim_end_matches(bindmount_mode);
Ok((bindmount_mode, bindmount))
}
pub fn setup_sandbox_bind_mounts(&self) -> Result<()> {
let mut mounted_list: Vec<PathBuf> = Vec::new();
let mut mounted_map: HashMap<String, bool> = HashMap::new();
for src in &self.sandbox_bindmounts {
let (bindmount_mode, bindmount) = self
.parse_sandbox_bind_mounts(src)
.context("parse sandbox bind mounts failed")?;
// get the base name of the canonicalized mount path as mnt_name, e.g. dirX
let mnt_name = get_base_name(bindmount)?
.into_string()
.map_err(|e| anyhow!("failed to get base name {:?}", e))?;
// if the path is mounted repeatedly, umount the already-mounted paths and return an error
if mounted_map.insert(mnt_name.clone(), true).is_some() {
for p in &mounted_list {
nix::mount::umount(p)
.context("mounted_map insert one repeated mounted, do umount it")?;
}
return Err(anyhow!(
"sandbox-bindmounts: path {} is already specified.",
bindmount
));
}
// mount_dest: /run/kata-containers/shared/sandboxes/<sid>/rw/passthrough/sandbox-mounts/dirX
let mount_dest = self.host_mounts_path.clone().join(mnt_name.as_str());
mkdir_with_permissions(self.host_mounts_path.clone().to_path_buf(), 0o750).context(
format!(
"create host mounts path {:?}",
self.host_mounts_path.clone()
),
)?;
info!(
sl!(),
"sandbox-bindmounts mount_src: {:?} => mount_dest: {:?}", bindmount, &mount_dest
);
// mount -o bind,ro host_shared mount_dest
// host_shared: ${bindmount}
mount::bind_mount_unchecked(Path::new(bindmount), &mount_dest, true).map_err(|e| {
for p in &mounted_list {
nix::mount::umount(p).unwrap_or_else(|x| {
    info!(sl!(), "umount {:?} failed: {:?}", p, x);
});
}
e
})?;
// default sandbox bind mounts mode is ro.
if bindmount_mode == SANDBOX_BIND_MOUNTS_RO {
info!(sl!(), "sandbox readonly bind mount.");
// dest_ro: /run/kata-containers/shared/sandboxes/<sid>/ro/passthrough/sandbox-mounts
let mount_dest_ro =
do_get_host_path(SANDBOX_BIND_MOUNTS_DIR, &self.sid, "", true, true);
let sandbox_bindmounts_ro = [mount_dest_ro, mnt_name.clone()].join("/");
mount::bind_remount(sandbox_bindmounts_ro, true)
.context("remount ro directory with ro permission")?;
}
mounted_list.push(mount_dest);
}
Ok(())
}
pub fn cleanup_sandbox_bind_mounts(&self) -> Result<()> {
for src in &self.sandbox_bindmounts {
let parsed_mnts = self
.parse_sandbox_bind_mounts(src)
.context("parse sandbox bind mounts")?;
let mnt_name = get_base_name(parsed_mnts.1)?
.into_string()
.map_err(|e| anyhow!("failed to convert to string{:?}", e))?;
// /run/kata-containers/shared/sandboxes/<sid>/rw/passthrough/sandbox-mounts/dirX
let mnt_dest = self.host_mounts_path.join(mnt_name.as_str());
mount::umount_timeout(mnt_dest, 0).context("umount bindmount failed")?;
}
if fs::metadata(self.host_mounts_path.clone())?.is_dir() {
fs::remove_dir_all(self.host_mounts_path.clone()).context(format!(
"remove sandbox bindmount point {:?}.",
self.host_mounts_path.clone()
))?;
}
Ok(())
}
}
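A short usage sketch for the new type, with a hypothetical sandbox id and host paths; setting up the mounts needs the usual mount privileges, so this is illustrative only:

use anyhow::Result;

fn demo_sandbox_bind_mounts() -> Result<()> {
    // Entries follow the patterns documented above: a bare path defaults
    // to read-only, ":rw" requests a writable bind mount.
    let mounts = vec![
        "/opt/kata/shared".to_string(),
        "/opt/kata/data:rw".to_string(),
    ];
    let sb = SandboxBindMounts::new("sandbox-1234".to_string(), mounts)?;

    // Bind-mounts each entry under
    // /run/kata-containers/shared/sandboxes/<sid>/rw/passthrough/sandbox-mounts/<basename>,
    // remounting read-only entries through the ro share.
    sb.setup_sandbox_bind_mounts()?;

    // Umounts every entry and removes the sandbox-mounts directory.
    sb.cleanup_sandbox_bind_mounts()?;
    Ok(())
}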

View File

@@ -56,6 +56,7 @@ pub(crate) async fn prepare_virtiofs(
fs_type: fs_type.to_string(),
queue_size: 0,
queue_num: 0,
options: vec![],
},
};
h.add_device(DeviceType::ShareFs(share_fs_device))

View File

@@ -4,13 +4,27 @@
// SPDX-License-Identifier: Apache-2.0
//
use std::path::{Path, PathBuf};
use std::{
os::unix::fs::PermissionsExt,
path::{Path, PathBuf},
};
use anyhow::Result;
use kata_sys_util::mount;
use super::*;
pub(crate) fn mkdir_with_permissions(path_target: PathBuf, mode: u32) -> Result<()> {
let new_path = &path_target;
std::fs::create_dir_all(new_path)
.context(format!("unable to create new path: {:?}", new_path))?;
// mode format: 0o750, ...
std::fs::set_permissions(new_path, std::fs::Permissions::from_mode(mode))?;
Ok(())
}
pub(crate) fn ensure_dir_exist(path: &Path) -> Result<()> {
if !path.exists() {
std::fs::create_dir_all(path).context(format!("failed to create directory {:?}", path))?;
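A tiny usage sketch for the new helper; the path is hypothetical, and 0o750 is the mode the callers above use for passthrough directories:

use std::path::PathBuf;
use anyhow::Result;

fn demo_mkdir() -> Result<()> {
    // Creates the directory (and any parents) and then sets its mode to 0o750.
    let target = PathBuf::from("/run/kata-containers/shared/containers/passthrough/example");
    mkdir_with_permissions(target, 0o750)
}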

View File

@@ -12,7 +12,6 @@ use kata_types::k8s::is_watchable_mount;
use kata_types::mount;
use nix::sys::stat::stat;
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
const WATCHABLE_PATH_NAME: &str = "watchable";
@@ -21,7 +20,10 @@ pub const EPHEMERAL_PATH: &str = "/run/kata-containers/sandbox/ephemeral";
use super::{
get_host_rw_shared_path,
utils::{self, do_get_host_path, get_host_ro_shared_path, get_host_shared_path},
utils::{
self, do_get_host_path, get_host_ro_shared_path, get_host_shared_path,
mkdir_with_permissions,
},
ShareFsMount, ShareFsMountResult, ShareFsRootfsConfig, ShareFsVolumeConfig,
KATA_GUEST_SHARE_DIR, PASSTHROUGH_FS_DIR,
};
@@ -79,13 +81,11 @@ impl ShareFsMount for VirtiofsShareMount {
.join(PASSTHROUGH_FS_DIR)
.join(WATCHABLE_PATH_NAME);
fs::create_dir_all(&watchable_host_path).context(format!(
"unable to create watchable path: {:?}",
&watchable_host_path,
mkdir_with_permissions(watchable_host_path.clone(), 0o750).context(format!(
"unable to create watchable path {:?}",
watchable_host_path
))?;
fs::set_permissions(watchable_host_path, fs::Permissions::from_mode(0o750))?;
// path: /run/kata-containers/shared/containers/passthrough/watchable/config-map-name
let file_name = Path::new(&guest_path)
.file_name()

View File

@@ -4,29 +4,32 @@
// SPDX-License-Identifier: Apache-2.0
//
use anyhow::Result;
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use std::{collections::HashMap, fs, path::Path};
use nix::sys::{stat, stat::SFlag};
use tokio::sync::RwLock;
use crate::share_fs::{do_get_guest_path, do_get_host_path};
use super::{share_fs_volume::generate_mount_path, Volume};
use agent::Storage;
use anyhow::{anyhow, Context};
use super::Volume;
use crate::volume::utils::{
generate_shared_path, volume_mount_info, DEFAULT_VOLUME_FS_TYPE, KATA_DIRECT_VOLUME_TYPE,
KATA_MOUNT_BIND_TYPE,
};
use hypervisor::{
device::{device_manager::DeviceManager, DeviceConfig},
device::{
device_manager::{do_handle_device, DeviceManager},
DeviceConfig, DeviceType,
},
BlockConfig,
};
use nix::sys::stat::{self, SFlag};
use tokio::sync::RwLock;
#[derive(Debug)]
#[derive(Clone)]
pub(crate) struct BlockVolume {
storage: Option<agent::Storage>,
mount: oci::Mount,
device_id: String,
}
/// BlockVolume: block device volume
/// BlockVolume for bind-mount block volume and direct block volume
impl BlockVolume {
pub(crate) async fn new(
d: &RwLock<DeviceManager>,
@@ -35,54 +38,71 @@ impl BlockVolume {
cid: &str,
sid: &str,
) -> Result<Self> {
let fstat = stat::stat(m.source.as_str()).context(format!("stat {}", m.source))?;
info!(sl!(), "device stat: {:?}", fstat);
let mut options = HashMap::new();
if read_only {
options.insert("read_only".to_string(), "true".to_string());
}
let mnt_src: &str = &m.source;
// default block device fs type: ext4.
let mut blk_dev_fstype = DEFAULT_VOLUME_FS_TYPE.to_string();
let block_device_config = &mut BlockConfig {
major: stat::major(fstat.st_rdev) as i64,
minor: stat::minor(fstat.st_rdev) as i64,
..Default::default()
let block_device_config = match m.r#type.as_str() {
KATA_MOUNT_BIND_TYPE => {
let fstat = stat::stat(mnt_src).context(format!("stat {}", m.source))?;
BlockConfig {
major: stat::major(fstat.st_rdev) as i64,
minor: stat::minor(fstat.st_rdev) as i64,
..Default::default()
}
}
KATA_DIRECT_VOLUME_TYPE => {
// get volume mountinfo from mountinfo.json
let v = volume_mount_info(mnt_src)
.context("deserde information from mountinfo.json")?;
// check volume type
if v.volume_type != KATA_DIRECT_VOLUME_TYPE {
return Err(anyhow!("volume type {:?} is invalid", v.volume_type));
}
let fstat = stat::stat(v.device.as_str())
.with_context(|| format!("stat volume device file: {}", v.device.clone()))?;
if SFlag::from_bits_truncate(fstat.st_mode) != SFlag::S_IFREG
&& SFlag::from_bits_truncate(fstat.st_mode) != SFlag::S_IFBLK
{
return Err(anyhow!(
"invalid volume device {:?} for volume type {:?}",
v.device,
v.volume_type
));
}
blk_dev_fstype = v.fs_type.clone();
BlockConfig {
path_on_host: v.device,
..Default::default()
}
}
_ => {
return Err(anyhow!(
"unsupport direct block volume r#type: {:?}",
m.r#type.as_str()
))
}
};
let device_id = d
.write()
// create and insert block device into Kata VM
let device_info = do_handle_device(d, &DeviceConfig::BlockCfg(block_device_config.clone()))
.await
.new_device(&DeviceConfig::BlockCfg(block_device_config.clone()))
.await
.context("failed to create deviec")?;
.context("do handle device failed.")?;
d.write()
// generate host guest shared path
let guest_path = generate_shared_path(m.destination.clone(), read_only, cid, sid)
.await
.try_add_device(device_id.as_str())
.await
.context("failed to add deivce")?;
let file_name = Path::new(&m.source).file_name().unwrap().to_str().unwrap();
let file_name = generate_mount_path(cid, file_name);
let guest_path = do_get_guest_path(&file_name, cid, true, false);
let host_path = do_get_host_path(&file_name, sid, cid, true, read_only);
fs::create_dir_all(&host_path)
.map_err(|e| anyhow!("failed to create rootfs dir {}: {:?}", host_path, e))?;
// get complete device information
let dev_info = d
.read()
.await
.get_device_info(&device_id)
.await
.context("failed to get device info")?;
.context("generate host-guest shared path failed")?;
// storage
let mut storage = Storage::default();
if let DeviceConfig::BlockCfg(config) = dev_info {
storage.driver = config.driver_option;
storage.source = config.virt_path;
}
let mut storage = agent::Storage {
mount_point: guest_path.clone(),
..Default::default()
};
storage.options = if read_only {
vec!["ro".to_string()]
@@ -90,21 +110,32 @@ impl BlockVolume {
Vec::new()
};
storage.mount_point = guest_path.clone();
// If the volume had specified the filesystem type, use it. Otherwise, set it
// to ext4 since right now it is the only one we support.
if m.r#type != "bind" {
storage.fs_type = m.r#type.clone();
} else {
storage.fs_type = "ext4".to_string();
// The real block device is wrapped in DeviceType, so extract it from
// the wrapper; its device_id is what the BlockVolume keeps.
// This is safe here: device_info is known to describe a block device.
let mut device_id = String::new();
if let DeviceType::Block(device) = device_info {
// blk, mmioblk
storage.driver = device.config.driver_option;
// /dev/vdX
storage.source = device.config.virt_path;
device_id = device.device_id;
}
// In some cases, the destination is a device such as /dev/xxx
if m.destination.clone().starts_with("/dev") {
storage.fs_type = "bind".to_string();
storage.options.append(&mut m.options.clone());
} else {
// usually, the destination is a directory.
storage.fs_type = blk_dev_fstype;
}
// mount
let mount = oci::Mount {
destination: m.destination.clone(),
r#type: m.r#type.clone(),
source: guest_path.clone(),
r#type: storage.fs_type.clone(),
source: guest_path,
options: m.options.clone(),
};
@@ -128,6 +159,7 @@ impl Volume for BlockVolume {
} else {
vec![]
};
Ok(s)
}
@@ -144,13 +176,22 @@ impl Volume for BlockVolume {
}
}
pub(crate) fn is_block_volume(m: &oci::Mount) -> bool {
if m.r#type != "bind" {
return false;
pub(crate) fn is_block_volume(m: &oci::Mount) -> Result<bool> {
let vol_types = vec![KATA_MOUNT_BIND_TYPE, KATA_DIRECT_VOLUME_TYPE];
if !vol_types.contains(&m.r#type.as_str()) {
return Ok(false);
}
if let Ok(fstat) = stat::stat(m.source.as_str()).context(format!("stat {}", m.source)) {
info!(sl!(), "device stat: {:?}", fstat);
return SFlag::from_bits_truncate(fstat.st_mode) == SFlag::S_IFBLK;
let fstat =
stat::stat(m.source.as_str()).context(format!("stat mount source {} failed.", m.source))?;
let s_flag = SFlag::from_bits_truncate(fstat.st_mode);
match m.r#type.as_str() {
// case: mount bind and block device
KATA_MOUNT_BIND_TYPE if s_flag == SFlag::S_IFBLK => Ok(true),
// case: directvol and directory
KATA_DIRECT_VOLUME_TYPE if s_flag == SFlag::S_IFDIR => Ok(true),
// else: unsupported or todo for other volume type.
_ => Ok(false),
}
false
}
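A sketch of how the reworked check classifies a mount, using a hypothetical block-device source; "directvol" sources are expected to be directories containing mountinfo.json instead:

use anyhow::Result;

fn demo_is_block_volume() -> Result<()> {
    let m = oci::Mount {
        destination: "/data".to_string(),
        r#type: "bind".to_string(),        // or KATA_DIRECT_VOLUME_TYPE
        source: "/dev/sdb".to_string(),    // hypothetical block device
        options: vec!["rw".to_string()],
    };
    // true only when a "bind" source stats as a block special file (S_IFBLK);
    // for "directvol" the source must stat as a directory (S_IFDIR).
    if is_block_volume(&m)? {
        println!("{} will be handled by BlockVolume", m.source);
    }
    Ok(())
}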

View File

@@ -9,16 +9,18 @@ mod default_volume;
pub mod hugepage;
mod share_fs_volume;
mod shm_volume;
use async_trait::async_trait;
pub mod utils;
use std::{sync::Arc, vec::Vec};
use anyhow::{Context, Result};
use hypervisor::device::device_manager::DeviceManager;
use std::{sync::Arc, vec::Vec};
use async_trait::async_trait;
use tokio::sync::RwLock;
use crate::{share_fs::ShareFs, volume::block_volume::is_block_volume};
use self::hugepage::{get_huge_page_limits_map, get_huge_page_option};
use crate::{share_fs::ShareFs, volume::block_volume::is_block_volume};
use agent::Agent;
use hypervisor::device::device_manager::DeviceManager;
const BIND: &str = "bind";
@@ -52,6 +54,7 @@ impl VolumeResource {
spec: &oci::Spec,
d: &RwLock<DeviceManager>,
sid: &str,
agent: Arc<dyn Agent>,
) -> Result<Vec<Arc<dyn Volume>>> {
let mut volumes: Vec<Arc<dyn Volume>> = vec![];
let oci_mounts = &spec.mounts;
@@ -65,7 +68,7 @@ impl VolumeResource {
shm_volume::ShmVolume::new(m, shm_size)
.with_context(|| format!("new shm volume {:?}", m))?,
)
} else if is_block_volume(m) {
} else if is_block_volume(m).context("block volume type")? {
// handle block volume
Arc::new(
block_volume::BlockVolume::new(d, m, read_only, cid, sid)
@@ -85,7 +88,7 @@ impl VolumeResource {
)
} else if share_fs_volume::is_share_fs_volume(m) {
Arc::new(
share_fs_volume::ShareFsVolume::new(share_fs, m, cid, read_only)
share_fs_volume::ShareFsVolume::new(share_fs, m, cid, read_only, agent.clone())
.await
.with_context(|| format!("new share fs volume {:?}", m))?,
)

View File

@@ -5,11 +5,15 @@
//
use std::{
fs::File,
io::Read,
os::unix::fs::MetadataExt,
path::{Path, PathBuf},
str::FromStr,
sync::Arc,
};
use agent::Agent;
use anyhow::{anyhow, Context, Result};
use async_trait::async_trait;
use hypervisor::device::device_manager::DeviceManager;
@@ -19,6 +23,9 @@ use super::Volume;
use crate::share_fs::{MountedInfo, ShareFs, ShareFsVolumeConfig};
use kata_types::mount;
use crate::share_fs::DEFAULT_KATA_GUEST_SANDBOX_DIR;
use crate::share_fs::PASSTHROUGH_FS_DIR;
const SYS_MOUNT_PREFIX: [&str; 2] = ["/proc", "/sys"];
// copy file to container's rootfs if filesystem sharing is not supported, otherwise
@@ -39,6 +46,7 @@ impl ShareFsVolume {
m: &oci::Mount,
cid: &str,
readonly: bool,
agent: Arc<dyn Agent>,
) -> Result<Self> {
// The file_name is in the format of "sandbox-{uuid}-{file_name}"
let file_name = Path::new(&m.source).file_name().unwrap().to_str().unwrap();
@@ -61,11 +69,67 @@ impl ShareFsVolume {
Ok(src) => src,
};
// If the mount source is a file, we can copy it to the sandbox
if src.is_file() {
// TODO: copy file
debug!(sl!(), "FIXME: copy file {}", &m.source);
} else {
// This is where we set the value for the guest path
let dest = [
DEFAULT_KATA_GUEST_SANDBOX_DIR,
PASSTHROUGH_FS_DIR,
file_name.clone().as_str(),
]
.join("/");
debug!(
sl!(),
"copy local file {:?} to guest {:?}",
&m.source,
dest.clone()
);
// Read file metadata
let file_metadata = std::fs::metadata(src.clone())
.with_context(|| format!("Failed to read metadata from file: {:?}", src))?;
// Open file
let mut file = File::open(&src)
.with_context(|| format!("Failed to open file: {:?}", src))?;
// Read the file contents into a buffer
let mut buffer = Vec::new();
file.read_to_end(&mut buffer)
.with_context(|| format!("Failed to read file: {:?}", src))?;
// Create gRPC request
let r = agent::CopyFileRequest {
path: dest.clone(),
file_size: file_metadata.len() as i64,
uid: file_metadata.uid() as i32,
gid: file_metadata.gid() as i32,
file_mode: file_metadata.mode(),
data: buffer,
..Default::default()
};
debug!(sl!(), "copy_file: {:?} to sandbox {:?}", &src, dest.clone());
// Issue gRPC request to agent
agent.copy_file(r).await.with_context(|| {
format!(
"copy file request failed: src: {:?}, dest: {:?}",
file_name, dest
)
})?;
// append oci::Mount structure to volume mounts
volume.mounts.push(oci::Mount {
destination: m.destination.clone(),
r#type: "bind".to_string(),
source: dest.clone(),
options: m.options.clone(),
})
} else {
// If not, we can ignore it. Let's issue a warning so that the user knows.
warn!(
sl!(),
"Ignoring non-regular file as FS sharing not supported. mount: {:?}", m
);

View File

@@ -0,0 +1,65 @@
// Copyright (c) 2022-2023 Alibaba Cloud
// Copyright (c) 2022-2023 Ant Group
//
// SPDX-License-Identifier: Apache-2.0
//
use std::{fs, path::Path};
use anyhow::{anyhow, Context, Result};
use crate::{
share_fs::{do_get_guest_path, do_get_host_path},
volume::share_fs_volume::generate_mount_path,
};
use kata_sys_util::eother;
use kata_types::mount::{get_volume_mount_info, DirectVolumeMountInfo};
pub const DEFAULT_VOLUME_FS_TYPE: &str = "ext4";
pub const KATA_MOUNT_BIND_TYPE: &str = "bind";
pub const KATA_DIRECT_VOLUME_TYPE: &str = "directvol";
pub const KATA_VFIO_VOLUME_TYPE: &str = "vfiovol";
pub const KATA_SPDK_VOLUME_TYPE: &str = "spdkvol";
// volume_mount_info loads information from mountinfo.json
pub fn volume_mount_info(volume_path: &str) -> Result<DirectVolumeMountInfo> {
get_volume_mount_info(volume_path)
}
pub fn get_file_name<P: AsRef<Path>>(src: P) -> Result<String> {
let file_name = src
.as_ref()
.file_name()
.map(|v| v.to_os_string())
.ok_or_else(|| {
eother!(
"failed to get file name of path {}",
src.as_ref().to_string_lossy()
)
})?
.into_string()
.map_err(|e| anyhow!("failed to convert to string {:?}", e))?;
Ok(file_name)
}
pub(crate) async fn generate_shared_path(
dest: String,
read_only: bool,
cid: &str,
sid: &str,
) -> Result<String> {
let file_name = get_file_name(&dest).context("failed to get file name.")?;
let mount_name = generate_mount_path(cid, file_name.as_str());
let guest_path = do_get_guest_path(&mount_name, cid, true, false);
let host_path = do_get_host_path(&mount_name, sid, cid, true, read_only);
if dest.starts_with("/dev") {
fs::File::create(&host_path).context(format!("failed to create file {:?}", &host_path))?;
} else {
std::fs::create_dir_all(&host_path)
.map_err(|e| anyhow!("failed to create dir {}: {:?}", host_path, e))?;
}
Ok(guest_path)
}
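A usage sketch for the new helper with hypothetical container and sandbox ids; it returns the guest-side path while creating the matching host-side file or directory:

use anyhow::Result;

async fn demo_generate_shared_path() -> Result<()> {
    let guest_path = generate_shared_path(
        "/data".to_string(), // container destination (hypothetical)
        false,               // read_only
        "container-1",       // cid (hypothetical)
        "sandbox-1",         // sid (hypothetical)
    )
    .await?;
    println!("guest path: {}", guest_path);
    Ok(())
}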

View File

@@ -3,6 +3,7 @@
//
// SPDX-License-Identifier: Apache-2.0
//
use std::sync::Arc;
use anyhow::Result;

View File

@@ -26,12 +26,10 @@ pub trait Sandbox: Send + Sync {
async fn cleanup(&self) -> Result<()>;
async fn shutdown(&self) -> Result<()>;
// agent function
async fn agent_sock(&self) -> Result<String>;
// utils
async fn set_iptables(&self, is_ipv6: bool, data: Vec<u8>) -> Result<Vec<u8>>;
async fn get_iptables(&self, is_ipv6: bool) -> Result<Vec<u8>>;
async fn direct_volume_stats(&self, volume_path: &str) -> Result<String>;
async fn direct_volume_resize(&self, resize_req: agent::ResizeVolumeRequest) -> Result<()>;
async fn agent_sock(&self) -> Result<String>;
}

View File

@@ -13,4 +13,3 @@ pub mod manager;
pub use manager::RuntimeHandlerManager;
pub use shim_interface;
mod shim_mgmt;
mod static_resource;

Some files were not shown because too many files have changed in this diff.