subd: fix waitpid properly.

lightningd would race with the subd destructor to do the waitpid(),
resulting in UNUSUAL log messages, but also us missing if a plugin
was killed via a signal.

We can also get rid of the gratuitous waitpid() in test_subdaemons.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell
2022-01-22 15:19:22 +10:30
parent beed4fcccc
commit 4a4f85dd3f
4 changed files with 55 additions and 21 deletions

View File

@@ -151,6 +151,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
* This method of manually declaring the list hooks avoids dynamic
* allocations to put things into a list. */
list_head_init(&ld->peers);
list_head_init(&ld->subds);
/*~ These are hash tables of incoming and outgoing HTLCs (contracts),
* defined as `struct htlc_in` and `struct htlc_out` in htlc_end.h.
@@ -409,10 +410,8 @@ void test_subdaemons(const struct lightningd *ld)
|| verstring[strlen(version())] != '\n')
errx(1, "%s: bad version '%s'",
subdaemons[i], verstring);
/*~ finally reap the child process, freeing all OS
* resources that go with it */
waitpid(pid, NULL, 0);
/*~ The child will be reaped by sigchld_rfd_in, so we don't
* need to waitpid() here. */
}
}
@@ -784,9 +783,13 @@ static struct io_plan *sigchld_rfd_in(struct io_conn *conn,
/* We don't actually care what we read, so we stuff things here. */
static u8 ignorebuf;
static size_t len;
pid_t childpid;
int wstatus;
/* Reap the plugins, since we otherwise ignore them. */
while (waitpid(-1, NULL, WNOHANG) != 0);
while ((childpid = waitpid(-1, &wstatus, WNOHANG)) != 0) {
maybe_subd_child(ld, childpid, wstatus);
}
return io_read_partial(conn, &ignorebuf, 1, &len, sigchld_rfd_in, ld);
}

View File

@@ -202,6 +202,9 @@ struct lightningd {
/* RPC response to send once we've shut down. */
const char *stop_response;
/* All the subdaemons. */
struct list_head subds;
/* Used these feerates instead of whatever bcli returns (up to
* FEERATE_PENALTY). */
u32 *force_feerates;

View File

@@ -21,6 +21,16 @@
#include <wire/common_wiregen.h>
#include <wire/wire_io.h>
void maybe_subd_child(struct lightningd *ld, int childpid, int wstatus)
{
struct subd *sd;
list_for_each(&ld->subds, sd, list) {
if (sd->pid == childpid)
sd->wstatus = tal_dup(sd, int, &wstatus);
}
}
/* Carefully move fd *@from to @to: on success *from set to to */
static bool move_fd(int *from, int to)
{
@@ -580,24 +590,30 @@ static void destroy_subd(struct subd *sd)
bool fail_if_subd_fails;
fail_if_subd_fails = IFDEV(sd->ld->dev_subdaemon_fail, false);
list_del_from(&sd->ld->subds, &sd->list);
switch (waitpid(sd->pid, &status, WNOHANG)) {
case 0:
/* If it's an essential daemon, don't kill: we want the
* exit status */
if (!sd->must_not_exit) {
log_debug(sd->log,
"Status closed, but not exited. Killing");
kill(sd->pid, SIGKILL);
/* lightningd may have already done waitpid() */
if (sd->wstatus != NULL) {
status = *sd->wstatus;
} else {
switch (waitpid(sd->pid, &status, WNOHANG)) {
case 0:
/* If it's an essential daemon, don't kill: we want the
* exit status */
if (!sd->must_not_exit) {
log_debug(sd->log,
"Status closed, but not exited. Killing");
kill(sd->pid, SIGKILL);
}
waitpid(sd->pid, &status, 0);
fail_if_subd_fails = false;
break;
case -1:
log_broken(sd->log, "Status closed, but waitpid %i says %s",
sd->pid, strerror(errno));
status = -1;
break;
}
waitpid(sd->pid, &status, 0);
fail_if_subd_fails = false;
break;
case -1:
log_unusual(sd->log, "Status closed, but waitpid %i says %s",
sd->pid, strerror(errno));
status = -1;
break;
}
if (fail_if_subd_fails && WIFSIGNALED(status)) {
@@ -742,6 +758,8 @@ static struct subd *new_subd(struct lightningd *ld,
sd->billboardcb = billboardcb;
sd->fds_in = NULL;
sd->outq = msg_queue_new(sd, true);
sd->wstatus = NULL;
list_add(&ld->subds, &sd->list);
tal_add_destructor(sd, destroy_subd);
list_head_init(&sd->reqs);
sd->channel = channel;
@@ -868,6 +886,7 @@ struct subd *subd_shutdown(struct subd *sd, unsigned int seconds)
if (waitpid(sd->pid, NULL, 0) > 0) {
alarm(0);
sigaction(SIGALRM, &old, NULL);
list_del_from(&sd->ld->subds, &sd->list);
return tal_free(sd);
}

View File

@@ -20,6 +20,9 @@ struct peer_fd;
/* One of our subds. */
struct subd {
/* Inside ld->subds */
struct list_node list;
/* Name, like John, or "lightning_hsmd" */
const char *name;
/* The Big Cheese. */
@@ -74,6 +77,9 @@ struct subd {
/* Callbacks for replies. */
struct list_head reqs;
/* Did lightningd already wait for this pid? */
int *wstatus;
};
/**
@@ -219,6 +225,9 @@ struct subd *subd_shutdown(struct subd *subd, unsigned int seconds);
/* Ugly helper to get full pathname of the current binary. */
const char *find_my_abspath(const tal_t *ctx, const char *argv0);
/* lightningd captures SIGCHLD and waits, but so does subd. */
void maybe_subd_child(struct lightningd *ld, int childpid, int wstatus);
#if DEVELOPER
char *opt_subd_dev_disconnect(const char *optarg, struct lightningd *ld);