Merge pull request #717 from bergwolf/signal

fix issues with short-lifetime container/exec processes
This commit is contained in:
Fupan Li
2020-09-17 11:00:30 +08:00
committed by GitHub
4 changed files with 43 additions and 25 deletions

View File

@@ -7,6 +7,7 @@
use libc::pid_t;
use std::fs::File;
use std::os::unix::io::RawFd;
use std::sync::mpsc::Sender;
// use crate::configs::{Capabilities, Rlimit};
// use crate::cgroups::Manager as CgroupManager;
@@ -45,6 +46,7 @@ pub struct Process {
pub pid: pid_t,
pub exit_code: i32,
pub exit_watchers: Vec<Sender<i32>>,
pub oci: OCIProcess,
pub logger: Logger,
}
@@ -95,6 +97,7 @@ impl Process {
init,
pid: -1,
exit_code: 0,
exit_watchers: Vec::new(),
oci: ocip.clone(),
logger: logger.clone(),
};

View File

@@ -4,6 +4,7 @@
//
use std::path::Path;
use std::sync::mpsc::{channel, Sender};
use std::sync::{Arc, Mutex};
use ttrpc;
@@ -365,6 +366,7 @@ impl agentService {
let pid: pid_t;
let mut exit_pipe_r: RawFd = -1;
let mut buf: Vec<u8> = vec![0, 1];
let (exit_send, exit_recv) = channel();
info!(
sl!(),
@@ -382,6 +384,7 @@ impl agentService {
exit_pipe_r = p.exit_pipe_r.unwrap();
}
p.exit_watchers.push(exit_send);
pid = p.pid;
}
@@ -398,9 +401,16 @@ impl agentService {
}
};
// need to close all fds
let mut p = ctr.processes.get_mut(&pid).unwrap();
let mut p = match ctr.processes.get_mut(&pid) {
Some(p) => p,
None => {
// Lost race, pick up exit code from channel
resp.status = exit_recv.recv().unwrap();
return Ok(resp);
}
};
// need to close all fds
if p.parent_stdin.is_some() {
let _ = unistd::close(p.parent_stdin.unwrap());
}
@@ -427,6 +437,11 @@ impl agentService {
p.term_master = None;
resp.status = p.exit_code;
// broadcast exit code to all parallel watchers
for s in p.exit_watchers.iter() {
// Just ignore errors in case any watcher quits unexpectedly
let _ = s.send(p.exit_code);
}
ctr.processes.remove(&pid);

View File

@@ -9,25 +9,21 @@ import (
"context"
"path"
"github.com/containerd/containerd/api/types/task"
"github.com/containerd/containerd/mount"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
)
func deleteContainer(ctx context.Context, s *service, c *container) error {
status, err := s.sandbox.StatusContainer(c.id)
if err != nil && !isNotFound(err) {
return err
}
if !c.cType.IsSandbox() && err == nil {
if status.State.State != types.StateStopped {
_, err = s.sandbox.StopContainer(c.id, false)
if !c.cType.IsSandbox() {
if c.status != task.StatusStopped {
_, err := s.sandbox.StopContainer(c.id, false)
if err != nil {
return err
}
}
if _, err = s.sandbox.DeleteContainer(c.id); err != nil {
if _, err := s.sandbox.DeleteContainer(c.id); err != nil {
return err
}
}

View File

@@ -681,20 +681,7 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (_ *ptypes.E
return nil, err
}
// According to the CRI specs, kubelet calls StopPodSandbox()
// at least once before calling RemovePodSandbox, and that call
// is idempotent: it must not return an error if all relevant
// resources have already been reclaimed. StopPodSandbox() first
// sends a SIGKILL signal to try to stop the container, so once
// the container has terminated we should ignore this signal
// and return directly.
if signum == syscall.SIGKILL || signum == syscall.SIGTERM {
if c.status == task.StatusStopped {
shimLog.WithField("sandbox", s.sandbox.ID()).WithField("container", c.id).Debug("Container has already been stopped")
return empty, nil
}
}
processStatus := c.status
processID := c.id
if r.ExecID != "" {
execs, err := c.getExec(r.ExecID)
@@ -710,6 +697,23 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (_ *ptypes.E
}).Debug("Id of exec process to be signalled is empty")
return empty, errors.New("The exec process does not exist")
}
processStatus = execs.status
}
// According to the CRI specs, kubelet calls StopPodSandbox()
// at least once before calling RemovePodSandbox, and that call
// is idempotent: it must not return an error if all relevant
// resources have already been reclaimed. StopPodSandbox() first
// sends a SIGKILL signal to try to stop the container, so once
// the process has terminated we should ignore this signal
// and return directly.
if (signum == syscall.SIGKILL || signum == syscall.SIGTERM) && processStatus == task.StatusStopped {
shimLog.WithFields(logrus.Fields{
"sandbox": s.sandbox.ID(),
"container": c.id,
"exec-id": r.ExecID,
}).Debug("process has already stopped")
return empty, nil
}
return empty, s.sandbox.SignalProcess(c.id, processID, signum, r.All)