mirror of
https://github.com/aljazceru/lightning.git
synced 2026-01-09 09:04:25 +01:00
subd: fix waitpid properly.
lightningd would race with the subd destructor to do the waitpid(), resulting in UNUSUAL log messages, but also us missing if a plugin was killed via a signal. We can also get rid of the gratuitous waitpid() in test_subdaemons. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
@@ -151,6 +151,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
|
||||
* This method of manually declaring the list hooks avoids dynamic
|
||||
* allocations to put things into a list. */
|
||||
list_head_init(&ld->peers);
|
||||
list_head_init(&ld->subds);
|
||||
|
||||
/*~ These are hash tables of incoming and outgoing HTLCs (contracts),
|
||||
* defined as `struct htlc_in` and `struct htlc_out` in htlc_end.h.
|
||||
@@ -409,10 +410,8 @@ void test_subdaemons(const struct lightningd *ld)
|
||||
|| verstring[strlen(version())] != '\n')
|
||||
errx(1, "%s: bad version '%s'",
|
||||
subdaemons[i], verstring);
|
||||
|
||||
/*~ finally reap the child process, freeing all OS
|
||||
* resources that go with it */
|
||||
waitpid(pid, NULL, 0);
|
||||
/*~ The child will be reaped by sigchld_rfd_in, so we don't
|
||||
* need to waitpid() here. */
|
||||
}
|
||||
}
|
||||
|
||||
@@ -784,9 +783,13 @@ static struct io_plan *sigchld_rfd_in(struct io_conn *conn,
|
||||
/* We don't actually care what we read, so we stuff things here. */
|
||||
static u8 ignorebuf;
|
||||
static size_t len;
|
||||
pid_t childpid;
|
||||
int wstatus;
|
||||
|
||||
/* Reap the plugins, since we otherwise ignore them. */
|
||||
while (waitpid(-1, NULL, WNOHANG) != 0);
|
||||
while ((childpid = waitpid(-1, &wstatus, WNOHANG)) != 0) {
|
||||
maybe_subd_child(ld, childpid, wstatus);
|
||||
}
|
||||
|
||||
return io_read_partial(conn, &ignorebuf, 1, &len, sigchld_rfd_in, ld);
|
||||
}
|
||||
|
||||
@@ -202,6 +202,9 @@ struct lightningd {
|
||||
/* RPC response to send once we've shut down. */
|
||||
const char *stop_response;
|
||||
|
||||
/* All the subdaemons. */
|
||||
struct list_head subds;
|
||||
|
||||
/* Used these feerates instead of whatever bcli returns (up to
|
||||
* FEERATE_PENALTY). */
|
||||
u32 *force_feerates;
|
||||
|
||||
@@ -21,6 +21,16 @@
|
||||
#include <wire/common_wiregen.h>
|
||||
#include <wire/wire_io.h>
|
||||
|
||||
void maybe_subd_child(struct lightningd *ld, int childpid, int wstatus)
|
||||
{
|
||||
struct subd *sd;
|
||||
|
||||
list_for_each(&ld->subds, sd, list) {
|
||||
if (sd->pid == childpid)
|
||||
sd->wstatus = tal_dup(sd, int, &wstatus);
|
||||
}
|
||||
}
|
||||
|
||||
/* Carefully move fd *@from to @to: on success *from set to to */
|
||||
static bool move_fd(int *from, int to)
|
||||
{
|
||||
@@ -580,24 +590,30 @@ static void destroy_subd(struct subd *sd)
|
||||
bool fail_if_subd_fails;
|
||||
|
||||
fail_if_subd_fails = IFDEV(sd->ld->dev_subdaemon_fail, false);
|
||||
list_del_from(&sd->ld->subds, &sd->list);
|
||||
|
||||
switch (waitpid(sd->pid, &status, WNOHANG)) {
|
||||
case 0:
|
||||
/* If it's an essential daemon, don't kill: we want the
|
||||
* exit status */
|
||||
if (!sd->must_not_exit) {
|
||||
log_debug(sd->log,
|
||||
"Status closed, but not exited. Killing");
|
||||
kill(sd->pid, SIGKILL);
|
||||
/* lightningd may have already done waitpid() */
|
||||
if (sd->wstatus != NULL) {
|
||||
status = *sd->wstatus;
|
||||
} else {
|
||||
switch (waitpid(sd->pid, &status, WNOHANG)) {
|
||||
case 0:
|
||||
/* If it's an essential daemon, don't kill: we want the
|
||||
* exit status */
|
||||
if (!sd->must_not_exit) {
|
||||
log_debug(sd->log,
|
||||
"Status closed, but not exited. Killing");
|
||||
kill(sd->pid, SIGKILL);
|
||||
}
|
||||
waitpid(sd->pid, &status, 0);
|
||||
fail_if_subd_fails = false;
|
||||
break;
|
||||
case -1:
|
||||
log_broken(sd->log, "Status closed, but waitpid %i says %s",
|
||||
sd->pid, strerror(errno));
|
||||
status = -1;
|
||||
break;
|
||||
}
|
||||
waitpid(sd->pid, &status, 0);
|
||||
fail_if_subd_fails = false;
|
||||
break;
|
||||
case -1:
|
||||
log_unusual(sd->log, "Status closed, but waitpid %i says %s",
|
||||
sd->pid, strerror(errno));
|
||||
status = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (fail_if_subd_fails && WIFSIGNALED(status)) {
|
||||
@@ -742,6 +758,8 @@ static struct subd *new_subd(struct lightningd *ld,
|
||||
sd->billboardcb = billboardcb;
|
||||
sd->fds_in = NULL;
|
||||
sd->outq = msg_queue_new(sd, true);
|
||||
sd->wstatus = NULL;
|
||||
list_add(&ld->subds, &sd->list);
|
||||
tal_add_destructor(sd, destroy_subd);
|
||||
list_head_init(&sd->reqs);
|
||||
sd->channel = channel;
|
||||
@@ -868,6 +886,7 @@ struct subd *subd_shutdown(struct subd *sd, unsigned int seconds)
|
||||
if (waitpid(sd->pid, NULL, 0) > 0) {
|
||||
alarm(0);
|
||||
sigaction(SIGALRM, &old, NULL);
|
||||
list_del_from(&sd->ld->subds, &sd->list);
|
||||
return tal_free(sd);
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,9 @@ struct peer_fd;
|
||||
|
||||
/* One of our subds. */
|
||||
struct subd {
|
||||
/* Inside ld->subds */
|
||||
struct list_node list;
|
||||
|
||||
/* Name, like John, or "lightning_hsmd" */
|
||||
const char *name;
|
||||
/* The Big Cheese. */
|
||||
@@ -74,6 +77,9 @@ struct subd {
|
||||
|
||||
/* Callbacks for replies. */
|
||||
struct list_head reqs;
|
||||
|
||||
/* Did lightningd already wait for this pid? */
|
||||
int *wstatus;
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -219,6 +225,9 @@ struct subd *subd_shutdown(struct subd *subd, unsigned int seconds);
|
||||
/* Ugly helper to get full pathname of the current binary. */
|
||||
const char *find_my_abspath(const tal_t *ctx, const char *argv0);
|
||||
|
||||
/* lightningd captures SIGCHLD and waits, but so does subd. */
|
||||
void maybe_subd_child(struct lightningd *ld, int childpid, int wstatus);
|
||||
|
||||
#if DEVELOPER
|
||||
char *opt_subd_dev_disconnect(const char *optarg, struct lightningd *ld);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user