lightningd: don't run more than one reconnect timer at once.

In various circumstances we can start a reconnection while one is already going on. These can stockpile if the node really is unreachable. Reported-by: @whitslack Fixes: #5654 Changelog-Fixed: lightningd: we no longer stack multiple reconnection attempts if connections fail. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2026-02-21 22:14:20 +01:00 · 2023-02-01 14:49:29 +10:30
parent ff1d537b87
commit 6347ee7308
2 changed files with 38 additions and 0 deletions
--- a/lightningd/connect_control.c
+++ b/lightningd/connect_control.c
@@ -252,6 +252,22 @@ struct delayed_reconnect {
 	bool dns_fallback;
 };

+static const struct node_id *delayed_reconnect_keyof(const struct delayed_reconnect *d)
+{
+	return &d->id;
+}
+
+static bool node_id_delayed_reconnect_eq(const struct delayed_reconnect *d,
+					 const struct node_id *node_id)
+{
+	return node_id_eq(node_id, &d->id);
+}
+
+HTABLE_DEFINE_TYPE(struct delayed_reconnect,
+		   delayed_reconnect_keyof,
+		   node_id_hash, node_id_delayed_reconnect_eq,
+		   delayed_reconnect_map);
+
 static void gossipd_got_addrs(struct subd *subd,
 			      const u8 *msg,
 			      const int *fds,
@@ -281,6 +297,11 @@ static void do_connect(struct delayed_reconnect *d)
 	subd_req(d, d->ld->gossip, take(msg), -1, 0, gossipd_got_addrs, d);
 }

+static void destroy_delayed_reconnect(struct delayed_reconnect *d)
+{
+	delayed_reconnect_map_del(d->ld->delayed_reconnect_map, d);
+}
+
 static void try_connect(const tal_t *ctx,
 			struct lightningd *ld,
 			const struct node_id *id,
@@ -291,11 +312,23 @@ static void try_connect(const tal_t *ctx,
 	struct delayed_reconnect *d;
 	struct peer *peer;

+	/* Don't stack, unless this is an instant reconnect */
+	d = delayed_reconnect_map_get(ld->delayed_reconnect_map, id);
+	if (d) {
+		if (seconds_delay) {
+			log_peer_debug(ld->log, id, "Already reconnecting");
+			return;
+		}
+		tal_free(d);
+	}
+
 	d = tal(ctx, struct delayed_reconnect);
 	d->ld = ld;
 	d->id = *id;
 	d->addrhint = tal_dup_or_null(d, struct wireaddr_internal, addrhint);
 	d->dns_fallback = dns_fallback;
+	delayed_reconnect_map_add(ld->delayed_reconnect_map, d);
+	tal_add_destructor(d, destroy_delayed_reconnect);

 	if (!seconds_delay) {
 		do_connect(d);
@@ -577,6 +610,9 @@ int connectd_init(struct lightningd *ld)
 	const char *websocket_helper_path;
 	void *ret;

+	ld->delayed_reconnect_map = tal(ld, struct delayed_reconnect_map);
+	delayed_reconnect_map_init(ld->delayed_reconnect_map);
+
 	websocket_helper_path = subdaemon_path(tmpctx, ld,
 					       "lightning_websocketd");

--- a/lightningd/lightningd.h
+++ b/lightningd/lightningd.h
@@ -183,6 +183,8 @@ struct lightningd {

 	/* Daemon looking after peers during init / before channel. */
 	struct subd *connectd;
+	/* Reconnection attempts */
+	struct delayed_reconnect_map *delayed_reconnect_map;

 	/* All peers we're tracking (by node_id) */
 	struct peer_node_id_map *peers;