Merge pull request #717 from bergwolf/signal

fix issues with short-lifetime container/exec processes
This commit is contained in:
Fupan Li
2020-09-17 11:00:30 +08:00
committed by GitHub
4 changed files with 43 additions and 25 deletions

View File

@@ -7,6 +7,7 @@
use libc::pid_t;
use std::fs::File;
use std::os::unix::io::RawFd;
use std::sync::mpsc::Sender;
// use crate::configs::{Capabilities, Rlimit};
// use crate::cgroups::Manager as CgroupManager;
@@ -45,6 +46,7 @@ pub struct Process {
pub pid: pid_t,
pub exit_code: i32,
pub exit_watchers: Vec<Sender<i32>>,
pub oci: OCIProcess,
pub logger: Logger,
}
@@ -95,6 +97,7 @@ impl Process {
init,
pid: -1,
exit_code: 0,
exit_watchers: Vec::new(),
oci: ocip.clone(),
logger: logger.clone(),
};

View File

@@ -4,6 +4,7 @@
//
use std::path::Path;
use std::sync::mpsc::{channel, Sender};
use std::sync::{Arc, Mutex};
use ttrpc;
@@ -365,6 +366,7 @@ impl agentService {
let pid: pid_t;
let mut exit_pipe_r: RawFd = -1;
let mut buf: Vec<u8> = vec![0, 1];
let (exit_send, exit_recv) = channel();
info!(
sl!(),
@@ -382,6 +384,7 @@ impl agentService {
exit_pipe_r = p.exit_pipe_r.unwrap();
}
p.exit_watchers.push(exit_send);
pid = p.pid;
}
@@ -398,9 +401,16 @@ impl agentService {
}
};
// need to close all fds
let mut p = ctr.processes.get_mut(&pid).unwrap();
let mut p = match ctr.processes.get_mut(&pid) {
Some(p) => p,
None => {
// Lost race, pick up exit code from channel
resp.status = exit_recv.recv().unwrap();
return Ok(resp);
}
};
// need to close all fds
if p.parent_stdin.is_some() {
let _ = unistd::close(p.parent_stdin.unwrap());
}
@@ -427,6 +437,11 @@ impl agentService {
p.term_master = None;
resp.status = p.exit_code;
// broadcast exit code to all parallel watchers
for s in p.exit_watchers.iter() {
// Just ignore errors in case any watcher quits unexpectedly
let _ = s.send(p.exit_code);
}
ctr.processes.remove(&pid);

View File

@@ -9,25 +9,21 @@ import (
"context"
"path"
"github.com/containerd/containerd/api/types/task"
"github.com/containerd/containerd/mount"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
)
func deleteContainer(ctx context.Context, s *service, c *container) error {
status, err := s.sandbox.StatusContainer(c.id)
if err != nil && !isNotFound(err) {
return err
}
if !c.cType.IsSandbox() && err == nil {
if status.State.State != types.StateStopped {
_, err = s.sandbox.StopContainer(c.id, false)
if !c.cType.IsSandbox() {
if c.status != task.StatusStopped {
_, err := s.sandbox.StopContainer(c.id, false)
if err != nil {
return err
}
}
if _, err = s.sandbox.DeleteContainer(c.id); err != nil {
if _, err := s.sandbox.DeleteContainer(c.id); err != nil {
return err
}
}

View File

@@ -681,20 +681,7 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (_ *ptypes.E
return nil, err
}
// According to the CRI specs, kubelet calls StopPodSandbox()
// at least once before calling RemovePodSandbox, and that call
// is idempotent: it must not return an error if all relevant
// resources have already been reclaimed. StopPodSandbox() first
// sends a SIGKILL signal to try to stop the container, so once
// the container has terminated we should ignore this signal
// and return directly.
if signum == syscall.SIGKILL || signum == syscall.SIGTERM {
if c.status == task.StatusStopped {
shimLog.WithField("sandbox", s.sandbox.ID()).WithField("container", c.id).Debug("Container has already been stopped")
return empty, nil
}
}
processStatus := c.status
processID := c.id
if r.ExecID != "" {
execs, err := c.getExec(r.ExecID)
@@ -710,6 +697,23 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (_ *ptypes.E
}).Debug("Id of exec process to be signalled is empty")
return empty, errors.New("The exec process does not exist")
}
processStatus = execs.status
}
// According to the CRI specs, kubelet calls StopPodSandbox()
// at least once before calling RemovePodSandbox, and that call
// is idempotent: it must not return an error if all relevant
// resources have already been reclaimed. StopPodSandbox() first
// sends a SIGKILL signal to try to stop the container, so once
// the process has terminated we should ignore this signal
// and return directly.
if (signum == syscall.SIGKILL || signum == syscall.SIGTERM) && processStatus == task.StatusStopped {
shimLog.WithFields(logrus.Fields{
"sandbox": s.sandbox.ID(),
"container": c.id,
"exec-id": r.ExecID,
}).Debug("process has already stopped")
return empty, nil
}
return empty, s.sandbox.SignalProcess(c.id, processID, signum, r.All)