From da35241a91ab5856dc5dac8a8671745ac9fc7a30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 12:58:51 +0200 Subject: [PATCH 1/9] tests: k8s: Skip k8s-cpu-ns when testing TDX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TEEs do not support CPU / memory hotplug, thus this test must be skipped. Signed-off-by: Fabiano Fidêncio --- tests/integration/kubernetes/k8s-cpu-ns.bats | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/kubernetes/k8s-cpu-ns.bats b/tests/integration/kubernetes/k8s-cpu-ns.bats index 0089e1c06..4d5f2e883 100644 --- a/tests/integration/kubernetes/k8s-cpu-ns.bats +++ b/tests/integration/kubernetes/k8s-cpu-ns.bats @@ -11,6 +11,7 @@ load "${BATS_TEST_DIRNAME}/tests_common.sh" setup() { [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "test not working see: ${dragonball_limitations}" + [ "${KATA_HYPERVISOR}" == "qemu-tdx" ] && skip "TEEs do not support memory / CPU hotplug" pod_name="constraints-cpu-test" container_name="first-cpu-container" @@ -27,6 +28,7 @@ setup() { @test "Check CPU constraints" { [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "test not working see: ${dragonball_limitations}" + [ "${KATA_HYPERVISOR}" == "qemu-tdx" ] && skip "TEEs do not support memory / CPU hotplug" # Create the pod kubectl create -f "${pod_config_dir}/pod-cpu.yaml" @@ -71,6 +73,7 @@ setup() { teardown() { [ "${KATA_HYPERVISOR}" == "firecracker" ] && skip "test not working see: ${fc_limitations}" [ "${KATA_HYPERVISOR}" == "dragonball" ] && skip "test not working see: ${dragonball_limitations}" + [ "${KATA_HYPERVISOR}" == "qemu-tdx" ] && skip "TEEs do not support memory / CPU hotplug" # Debugging information kubectl describe "pod/$pod_name" From 
d7fdf19e9bdc96bc006030df058d66fad8ec0e1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 13:01:25 +0200 Subject: [PATCH 2/9] gha: tdx: Delete kata-deploy after the tests finish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We must ensure that no kata-deploy is left behind after the tests finish, otherwise it may interfere with the next run. Fixes: #6647 Signed-off-by: Fabiano Fidêncio --- .github/workflows/run-k8s-tests-on-tdx.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml index 78e5d5a89..a842e2659 100644 --- a/.github/workflows/run-k8s-tests-on-tdx.yaml +++ b/.github/workflows/run-k8s-tests-on-tdx.yaml @@ -48,3 +48,21 @@ jobs: env: KATA_HYPERVISOR: ${{ matrix.vmm }} KUBECONFIG: /etc/rancher/k3s/k3s.yaml + + - name: Delete kata-deploy + if: always() + run: | + kubectl delete -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml + kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod + + sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml + cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml + cat tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image" + kubectl apply -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml + sleep 180s + + kubectl delete -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml + kubectl delete -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml + kubectl delete -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml + env: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml From 
542bb0f3f305b7f91f1ec2798fbb1b096508f603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 14:57:03 +0200 Subject: [PATCH 3/9] gha: tdx: Set KUBECONFIG env at the job level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By doing this we avoid having to set it up on every step. Signed-off-by: Fabiano Fidêncio --- .github/workflows/run-k8s-tests-on-tdx.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml index a842e2659..e1d9ba764 100644 --- a/.github/workflows/run-k8s-tests-on-tdx.yaml +++ b/.github/workflows/run-k8s-tests-on-tdx.yaml @@ -20,6 +20,8 @@ jobs: vmm: - qemu-tdx runs-on: tdx + env: + KUBECONFIG: /etc/rancher/k3s/k3s.yaml steps: - uses: actions/checkout@v3 with: @@ -35,8 +37,6 @@ jobs: kubectl apply -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod kubectl apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml - env: - KUBECONFIG: /etc/rancher/k3s/k3s.yaml - name: Run tests timeout-minutes: 30 @@ -47,7 +47,6 @@ jobs: popd env: KATA_HYPERVISOR: ${{ matrix.vmm }} - KUBECONFIG: /etc/rancher/k3s/k3s.yaml - name: Delete kata-deploy if: always() @@ -64,5 +63,3 @@ jobs: kubectl delete -f tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml kubectl delete -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml kubectl delete -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml - env: - KUBECONFIG: /etc/rancher/k3s/k3s.yaml From e31efc861cb347000d4109d53071f0b183caa15e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 19:12:08 +0200 Subject: [PATCH 4/9] gha: tdx: Use the k3s overlay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As 
the TDX machine is using k3s, let's make sure we're deploying kata-deploy using the k3s overlay. Signed-off-by: Fabiano Fidêncio --- .github/workflows/run-k8s-tests-on-tdx.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-k8s-tests-on-tdx.yaml b/.github/workflows/run-k8s-tests-on-tdx.yaml index e1d9ba764..4c38b9d23 100644 --- a/.github/workflows/run-k8s-tests-on-tdx.yaml +++ b/.github/workflows/run-k8s-tests-on-tdx.yaml @@ -34,7 +34,7 @@ jobs: cat tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | grep "${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}" || die "Failed to setup the tests image" kubectl apply -f tools/packaging/kata-deploy/kata-rbac/base/kata-rbac.yaml - kubectl apply -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml + kubectl apply -k tools/packaging/kata-deploy/kata-deploy/overlay/k3s kubectl -n kube-system wait --timeout=10m --for=condition=Ready -l name=kata-deploy pod kubectl apply -f tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml @@ -51,7 +51,7 @@ jobs: - name: Delete kata-deploy if: always() run: | - kubectl delete -f tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml + kubectl delete -k tools/packaging/kata-deploy/kata-deploy/overlay/k3s kubectl -n kube-system wait --timeout=10m --for=delete -l name=kata-deploy pod sed -i -e "s|quay.io/kata-containers/kata-deploy:latest|${{ inputs.registry }}/${{ inputs.repo }}:${{ inputs.tag }}|g" tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml From ea386700fe703da6c4aef81f7291175a98da2b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 21:29:12 +0200 Subject: [PATCH 5/9] kata-deploy: Update podOverhead for TDX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As TEEs cannot hotplug memory / CPU, we *must* consider the default values for those as part of the podOverhead. 
Signed-off-by: Fabiano Fidêncio --- .../kata-deploy/runtimeclasses/kata-runtimeClasses.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml b/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml index daa4d1e2f..dc8644957 100644 --- a/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml +++ b/tools/packaging/kata-deploy/runtimeclasses/kata-runtimeClasses.yaml @@ -19,8 +19,8 @@ metadata: handler: kata-qemu-tdx overhead: podFixed: - memory: "160Mi" - cpu: "250m" + memory: "2048Mi" + cpu: "1.0" scheduling: nodeSelector: katacontainers.io/kata-runtime: "true" From 5ec9ae0f0498b7366fc85ed1448d36e3c9b6ac35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 21:13:41 +0200 Subject: [PATCH 6/9] kata-deploy: Use readinessProbe to ensure everything is ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit readinessProbe will help us to only have the kata-deploy pod marked as Ready when it finishes all the needed configurations in the node. 
Related: #6649 Signed-off-by: Fabiano Fidêncio --- .../packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml | 3 +++ tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | 3 +++ tools/packaging/kata-deploy/scripts/kata-deploy.sh | 2 ++ 3 files changed, 8 insertions(+) diff --git a/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml b/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml index 095876b73..23c3efe02 100644 --- a/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml +++ b/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml @@ -21,6 +21,9 @@ spec: image: quay.io/kata-containers/kata-deploy:latest imagePullPolicy: Always command: [ "bash", "-c", "/opt/kata-artifacts/scripts/kata-deploy.sh reset" ] + readinessProbe: + exec: + command: [ "bash", "-c", "[ -f /opt/kata/kata-deployed ]", "&&", "bash", "-c", "[ $? == 1 ]" ] env: - name: NODE_NAME valueFrom: diff --git a/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml b/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml index 97e98ee74..5b5d835b6 100644 --- a/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml +++ b/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml @@ -18,6 +18,9 @@ spec: - name: kube-kata image: quay.io/kata-containers/kata-deploy:latest imagePullPolicy: Always + readinessProbe: + exec: + command: [ "bash", "-c", "[ -f /opt/kata/kata-deployed ]", "&&", "bash", "-c", "[ $? 
== 0 ]" ] lifecycle: preStop: exec: diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index 8991e04fc..f6206bc96 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -310,11 +310,13 @@ function main() { install_artifacts configure_cri_runtime "$runtime" kubectl label node "$NODE_NAME" --overwrite katacontainers.io/kata-runtime=true + touch /opt/kata/kata-deployed ;; cleanup) cleanup_cri_runtime "$runtime" kubectl label node "$NODE_NAME" --overwrite katacontainers.io/kata-runtime=cleanup remove_artifacts + rm /opt/kata/kata-deployed ;; reset) reset_runtime $runtime From 3b76abb3664980b83b134b490f5ce200af09e49d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 12 Apr 2023 15:39:49 +0200 Subject: [PATCH 7/9] kata-deploy: Ensure node is ready after CRI Engine restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's ensure the node is ready after the CRI Engine restart, otherwise we may proceed and scripts may simply fail if they try to deploy a pod while the CRI Engine is not yet restarted (and, consequently, the node is not Ready). Related: #6649 Signed-off-by: Fabiano Fidêncio --- tools/packaging/kata-deploy/scripts/kata-deploy.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/packaging/kata-deploy/scripts/kata-deploy.sh b/tools/packaging/kata-deploy/scripts/kata-deploy.sh index f6206bc96..cc36e6367 100755 --- a/tools/packaging/kata-deploy/scripts/kata-deploy.sh +++ b/tools/packaging/kata-deploy/scripts/kata-deploy.sh @@ -63,6 +63,15 @@ function install_artifacts() { chmod +x /opt/kata/runtime-rs/bin/* } +function wait_till_node_is_ready() { + local ready="False" + + while ! 
[[ "${ready}" == "True" ]]; do + sleep 2s + ready=$(kubectl get node $NODE_NAME -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') + done +} + function configure_cri_runtime() { configure_different_shims_base @@ -76,6 +85,8 @@ function configure_cri_runtime() { esac systemctl daemon-reload systemctl restart "$1" + + wait_till_node_is_ready } function configure_different_shims_base() { @@ -266,6 +277,8 @@ function reset_runtime() { if [ "$1" == "crio" ] || [ "$1" == "containerd" ]; then systemctl restart kubelet fi + + wait_till_node_is_ready } function main() { From f478b9115ea15f7ddeacf4997a49a669bd784738 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 13 Apr 2023 07:06:36 +0200 Subject: [PATCH 8/9] clh: tdx: Update timeouts for confidential guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Booting up TDX takes more time than booting up a normal VM. Those values are being already used as part of the CCv0 branch, and we're just bringing them to the `main` branch as well. Signed-off-by: Fabiano Fidêncio --- src/runtime/virtcontainers/clh.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 1cf40c4d2..6ae99d673 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -73,12 +73,12 @@ const ( // Values based on: clhTimeout = 10 clhAPITimeout = 1 - clhAPITimeoutConfidentialGuest = 10 + clhAPITimeoutConfidentialGuest = 20 // Timeout for hot-plug - hotplug devices can take more time, than usual API calls // Use longer time timeout for it. 
clhHotPlugAPITimeout = 5 clhStopSandboxTimeout = 3 - clhStopSandboxTimeoutConfidentialGuest = 5 + clhStopSandboxTimeoutConfidentialGuest = 10 clhSocket = "clh.sock" clhAPISocket = "clh-api.sock" virtioFsSocket = "virtiofsd.sock" From dc662333df06646c22ebe747a5f338d579ee11fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Thu, 13 Apr 2023 22:42:52 +0200 Subject: [PATCH 9/9] runtime: Increase the dial_timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When testing on AKS, we've been hitting the dial_timeout every now and then. Let's increase it to 45 seconds (instead of 30) for all the VMMs, and to 60 seconds in case of TEEs. Signed-off-by: Fabiano Fidêncio --- src/runtime-rs/config/configuration-dragonball.toml.in | 4 ++-- src/runtime/config/configuration-acrn.toml.in | 4 ++-- src/runtime/config/configuration-clh.toml.in | 4 ++-- src/runtime/config/configuration-fc.toml.in | 4 ++-- src/runtime/config/configuration-qemu-tdx.toml.in | 4 ++-- src/runtime/config/configuration-qemu.toml.in | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/runtime-rs/config/configuration-dragonball.toml.in b/src/runtime-rs/config/configuration-dragonball.toml.in index 8b963e12d..174f270e7 100644 --- a/src/runtime-rs/config/configuration-dragonball.toml.in +++ b/src/runtime-rs/config/configuration-dragonball.toml.in @@ -206,8 +206,8 @@ container_pipe_size=@PIPESIZE@ #debug_console_enabled = true # Agent connection dialing timeout value in seconds -# (default: 30) -#dial_timeout = 30 +# (default: 45) +dial_timeout = 45 [runtime] # If enabled, the runtime will log additional debug messages to the diff --git a/src/runtime/config/configuration-acrn.toml.in b/src/runtime/config/configuration-acrn.toml.in index 2d2b7065e..ef0207589 100644 --- a/src/runtime/config/configuration-acrn.toml.in +++ b/src/runtime/config/configuration-acrn.toml.in @@ -154,8 +154,8 @@ disable_selinux=@DEFDISABLESELINUX@ 
#debug_console_enabled = true # Agent connection dialing timeout value in seconds -# (default: 30) -#dial_timeout = 30 +# (default: 45) +dial_timeout = 45 [runtime] # If enabled, the runtime will log additional debug messages to the diff --git a/src/runtime/config/configuration-clh.toml.in b/src/runtime/config/configuration-clh.toml.in index d79770487..d6653bce9 100644 --- a/src/runtime/config/configuration-clh.toml.in +++ b/src/runtime/config/configuration-clh.toml.in @@ -305,8 +305,8 @@ block_device_driver = "virtio-blk" #debug_console_enabled = true # Agent connection dialing timeout value in seconds -# (default: 30) -#dial_timeout = 30 +# (default: 45) +dial_timeout = 45 [runtime] # If enabled, the runtime will log additional debug messages to the diff --git a/src/runtime/config/configuration-fc.toml.in b/src/runtime/config/configuration-fc.toml.in index 10dc17700..e28316cfa 100644 --- a/src/runtime/config/configuration-fc.toml.in +++ b/src/runtime/config/configuration-fc.toml.in @@ -284,8 +284,8 @@ kernel_modules=[] #debug_console_enabled = true # Agent connection dialing timeout value in seconds -# (default: 30) -#dial_timeout = 30 +# (default: 45) +dial_timeout = 45 [runtime] # If enabled, the runtime will log additional debug messages to the diff --git a/src/runtime/config/configuration-qemu-tdx.toml.in b/src/runtime/config/configuration-qemu-tdx.toml.in index 6cecabdba..b9c130e65 100644 --- a/src/runtime/config/configuration-qemu-tdx.toml.in +++ b/src/runtime/config/configuration-qemu-tdx.toml.in @@ -529,8 +529,8 @@ kernel_modules=[] #debug_console_enabled = true # Agent connection dialing timeout value in seconds -# (default: 30) -#dial_timeout = 30 +# (default: 60) +dial_timeout = 60 [runtime] # If enabled, the runtime will log additional debug messages to the diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in index 4fb5a8ba0..6446b0d0d 100644 --- a/src/runtime/config/configuration-qemu.toml.in +++ 
b/src/runtime/config/configuration-qemu.toml.in @@ -535,8 +535,8 @@ kernel_modules=[] #debug_console_enabled = true # Agent connection dialing timeout value in seconds -# (default: 30) -#dial_timeout = 30 +# (default: 45) +dial_timeout = 45 [runtime] # If enabled, the runtime will log additional debug messages to the