From 55329301079b1c869b4ea95044f97b7679c66a50 Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Wed, 27 Jul 2022 00:32:03 +0800 Subject: [PATCH 1/5] agent: do some rollback work in case do_create_container fails In some cases do_create_container may return an error, mostly due to the `container.start(process)` call. This commit does some rollback work if this function fails. Fixes: #4749 Signed-off-by: Bin Liu (cherry picked from commit 09672eb2daa4d1e48f99c58521a0bb2809a0feec) --- src/agent/src/rpc.rs | 70 ++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/src/agent/src/rpc.rs b/src/agent/src/rpc.rs index 3004be5c0..f8085bb98 100644 --- a/src/agent/src/rpc.rs +++ b/src/agent/src/rpc.rs @@ -249,7 +249,20 @@ impl AgentService { info!(sl!(), "no process configurations!"); return Err(anyhow!(nix::Error::EINVAL)); }; - ctr.start(p).await?; + + // if starting container failed, we will do some rollback work + // to ensure no resources are leaked. 
+ if let Err(err) = ctr.start(p).await { + error!(sl!(), "failed to start container: {:?}", err); + if let Err(e) = ctr.destroy().await { + error!(sl!(), "failed to destroy container: {:?}", e); + } + if let Err(e) = remove_container_resources(&mut s, &cid) { + error!(sl!(), "failed to remove container resources: {:?}", e); + } + return Err(err); + } + s.update_shared_pidns(&ctr)?; s.add_container(ctr); info!(sl!(), "created container!"); @@ -295,27 +308,6 @@ impl AgentService { req: protocols::agent::RemoveContainerRequest, ) -> Result<()> { let cid = req.container_id.clone(); - let mut cmounts: Vec = vec![]; - - let mut remove_container_resources = |sandbox: &mut Sandbox| -> Result<()> { - // Find the sandbox storage used by this container - let mounts = sandbox.container_mounts.get(&cid); - if let Some(mounts) = mounts { - for m in mounts.iter() { - if sandbox.storages.get(m).is_some() { - cmounts.push(m.to_string()); - } - } - } - - for m in cmounts.iter() { - sandbox.unset_and_remove_sandbox_storage(m)?; - } - - sandbox.container_mounts.remove(cid.as_str()); - sandbox.containers.remove(cid.as_str()); - Ok(()) - }; if req.timeout == 0 { let s = Arc::clone(&self.sandbox); @@ -329,7 +321,7 @@ impl AgentService { .destroy() .await?; - remove_container_resources(&mut sandbox)?; + remove_container_resources(&mut sandbox, &cid)?; return Ok(()); } @@ -361,8 +353,7 @@ impl AgentService { let s = self.sandbox.clone(); let mut sandbox = s.lock().await; - - remove_container_resources(&mut sandbox)?; + remove_container_resources(&mut sandbox, &cid)?; Ok(()) } @@ -1752,6 +1743,35 @@ fn update_container_namespaces( Ok(()) } +fn remove_container_resources(sandbox: &mut Sandbox, cid: &str) -> Result<()> { + let mut cmounts: Vec = vec![]; + + // Find the sandbox storage used by this container + let mounts = sandbox.container_mounts.get(cid); + if let Some(mounts) = mounts { + for m in mounts.iter() { + if sandbox.storages.get(m).is_some() { + cmounts.push(m.to_string()); + } + 
} + } + + for m in cmounts.iter() { + if let Err(err) = sandbox.unset_and_remove_sandbox_storage(m) { + error!( + sl!(), + "failed to unset_and_remove_sandbox_storage for container {}, error: {:?}", + cid, + err + ); + } + } + + sandbox.container_mounts.remove(cid); + sandbox.containers.remove(cid); + Ok(()) +} + fn append_guest_hooks(s: &Sandbox, oci: &mut Spec) -> Result<()> { if let Some(ref guest_hooks) = s.hooks { let mut hooks = oci.hooks.take().unwrap_or_default(); From 25b1a522911b1792b0b641323ef341aa0d119e8a Mon Sep 17 00:00:00 2001 From: Chelsea Mafrica Date: Fri, 12 Aug 2022 10:35:10 -0700 Subject: [PATCH 2/5] runtime: tracing: End root span at end of trace The root span should exist for the duration of the trace. Defer ending the span until the end of the trace instead of the end of the function. Add the span to the service struct to do so. Fixes #4902 Signed-off-by: Chelsea Mafrica (cherry picked from commit fcc1e0c6172de5ed27c7aa69bcb632a7d8aef566) --- src/runtime/pkg/containerd-shim-v2/create.go | 3 ++- src/runtime/pkg/containerd-shim-v2/service.go | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/runtime/pkg/containerd-shim-v2/create.go b/src/runtime/pkg/containerd-shim-v2/create.go index 6b14a94c7..65113ac1b 100644 --- a/src/runtime/pkg/containerd-shim-v2/create.go +++ b/src/runtime/pkg/containerd-shim-v2/create.go @@ -97,9 +97,10 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con } // create root span + // rootSpan will be ended when the entire trace is ended rootSpan, newCtx := katatrace.Trace(s.ctx, shimLog, "rootSpan", shimTracingTags) s.rootCtx = newCtx - defer rootSpan.End() + s.rootSpan = rootSpan // create span span, newCtx := katatrace.Trace(s.rootCtx, shimLog, "create", shimTracingTags) diff --git a/src/runtime/pkg/containerd-shim-v2/service.go b/src/runtime/pkg/containerd-shim-v2/service.go index 9e703c9e2..b9e8460fb 100644 --- a/src/runtime/pkg/containerd-shim-v2/service.go +++ 
b/src/runtime/pkg/containerd-shim-v2/service.go @@ -28,6 +28,7 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" + otelTrace "go.opentelemetry.io/otel/trace" "golang.org/x/sys/unix" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils" @@ -122,8 +123,9 @@ type exit struct { type service struct { sandbox vc.VCSandbox - ctx context.Context - rootCtx context.Context // root context for tracing + ctx context.Context + rootCtx context.Context // root context for tracing + rootSpan otelTrace.Span containers map[string]*container @@ -946,6 +948,7 @@ func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (_ * s.mu.Unlock() span.End() + s.rootSpan.End() katatrace.StopTracing(s.rootCtx) return empty, nil From 8f8b93d75386ada68e57b80597a590c936059142 Mon Sep 17 00:00:00 2001 From: Miao Xia Date: Fri, 19 Aug 2022 09:26:33 +0800 Subject: [PATCH 3/5] kernel: Add CONFIG_CGROUP_HUGETLB=y as part of the cgroup fragments The Kata guest OS hugetlb cgroup does not work properly because the guest kernel config option CONFIG_CGROUP_HUGETLB is not set, leading to: root@clr-b08d402cc29d44719bb582392b7b3466 ls /sys/fs/cgroup/hugetlb/ ls: cannot access '/sys/fs/cgroup/hugetlb/': No such file or directory Fixes: #4953 Signed-off-by: Miao Xia (cherry picked from commit 731d39df452c57b2e182094b2a3eed2ca4238e68) --- tools/packaging/kernel/configs/fragments/common/cgroup.conf | 1 + tools/packaging/kernel/kata_config_version | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/packaging/kernel/configs/fragments/common/cgroup.conf b/tools/packaging/kernel/configs/fragments/common/cgroup.conf index 429983a5f..1976f440d 100644 --- a/tools/packaging/kernel/configs/fragments/common/cgroup.conf +++ b/tools/packaging/kernel/configs/fragments/common/cgroup.conf @@ -12,6 +12,7 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_HUGETLB=y 
CONFIG_CGROUP_PERF=y CONFIG_SOCK_CGROUP_DATA=y diff --git a/tools/packaging/kernel/kata_config_version b/tools/packaging/kernel/kata_config_version index 49541f721..5595fa46c 100644 --- a/tools/packaging/kernel/kata_config_version +++ b/tools/packaging/kernel/kata_config_version @@ -1 +1 @@ -94 +95 From 38801e5bf17a48fc8996e3f652df7e4360735ae5 Mon Sep 17 00:00:00 2001 From: Archana Shinde Date: Tue, 30 Aug 2022 12:48:16 -0700 Subject: [PATCH 4/5] release: Adapt kata-deploy for 2.5.1 kata-deploy files must be adapted to a new release. The cases where it happens are when the release goes from -> to: * main -> stable: * kata-deploy-stable / kata-cleanup-stable: are removed * stable -> stable: * kata-deploy / kata-cleanup: bump the release to the new one. There are no changes when doing an alpha release, as the files on the "main" branch always point to the "latest" and "stable" tags. Signed-off-by: Archana Shinde --- tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml | 2 +- tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml b/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml index 96b01c014..69f042bef 100644 --- a/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml +++ b/tools/packaging/kata-deploy/kata-cleanup/base/kata-cleanup.yaml @@ -18,7 +18,7 @@ spec: katacontainers.io/kata-runtime: cleanup containers: - name: kube-kata-cleanup - image: quay.io/kata-containers/kata-deploy:2.5.0 + image: quay.io/kata-containers/kata-deploy:2.5.1 imagePullPolicy: Always command: [ "bash", "-c", "/opt/kata-artifacts/scripts/kata-deploy.sh reset" ] env: diff --git a/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml b/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml index f1ca83d1a..d629d1d67 100644 --- a/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml +++ 
b/tools/packaging/kata-deploy/kata-deploy/base/kata-deploy.yaml @@ -16,7 +16,7 @@ spec: serviceAccountName: kata-label-node containers: - name: kube-kata - image: quay.io/kata-containers/kata-deploy:2.5.0 + image: quay.io/kata-containers/kata-deploy:2.5.1 imagePullPolicy: Always lifecycle: preStop: From d6437435a29594329be31e1b8980239857022c8d Mon Sep 17 00:00:00 2001 From: Archana Shinde Date: Tue, 30 Aug 2022 12:48:16 -0700 Subject: [PATCH 5/5] release: Kata Containers 2.5.1 f74155cd5 release: Adapt kata-deploy for 2.5.1 cd898d28c runtime: clh: Use the new 'payload' interface e8512320c runtime: clh: Re-generate the client code 2e3ae3f23 agent-ctl: Get rid of compiler warning 14a4551d5 versions: Upgrade rust supported version to 1.59.0 012837260 versions: Update kernel to 5.15.63 69505695b agent-ctl/trace-forwarder: udpate thread_local dependency 48a94f36a agent/runk: update regex dependency 1a396a178 dep: update nix dependency c0b5ba230 versions: Upgrade to Cloud Hypervisor v26.0 Signed-off-by: Archana Shinde --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 437459cd9..73462a5a1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.5.0 +2.5.1