From 22ff007d642d6b716fbbe6c2b210878775a9d760 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 28 Jul 2022 11:00:36 +0930 Subject: [PATCH] connectd: control connect backoff from lightningd. We used to tell connectd to remember our connect delay, and hand it back (increased if necessary). Instead, simply record when we last tried to connect. If it was less than 10 minutes ago, double delay (up to 5 minutes max), otherwise reset delay to 1 second. This covers all scenarios: whether we reconnect then immediately disconnect, or never successfully connect, it doesn't matter. Signed-off-by: Rusty Russell Fixes: #5453 --- connectd/connectd.c | 38 +++-------------- connectd/connectd_wire.csv | 2 - lightningd/channel.c | 10 ++--- lightningd/connect_control.c | 46 +++++++++++---------- lightningd/connect_control.h | 1 - lightningd/dual_open_control.c | 2 +- lightningd/opening_control.c | 2 +- lightningd/options.c | 2 +- lightningd/peer_control.c | 15 ++++--- lightningd/peer_control.h | 6 ++- lightningd/test/run-invoice-select-inchan.c | 1 - tests/test_connection.py | 5 ++- wallet/test/run-wallet.c | 1 - 13 files changed, 54 insertions(+), 77 deletions(-) diff --git a/connectd/connectd.c b/connectd/connectd.c index dbf03d039..ae514cc28 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -56,13 +56,6 @@ #define HSM_FD 3 #define GOSSIPCTL_FD 4 -/*~ In C convention, constants are UPPERCASE macros. Not everything needs to - * be a constant, but it soothes the programmer's conscience to encapsulate - * arbitrary decisions like these in one place. */ -#define MAX_CONNECT_ATTEMPTS 10 -#define INITIAL_WAIT_SECONDS 1 -#define MAX_WAIT_SECONDS 300 - /* Peers we're trying to reach: we iterate through addrs until we succeed * or fail. */ struct connecting { @@ -88,9 +81,6 @@ struct connecting { /* Accumulated errors */ char *errors; - - /* How many seconds did we wait this time? */ - u32 seconds_waited; }; /*~ C programs should generally be written bottom-to-top, with the root @@ -623,15 +613,13 @@ struct io_plan *connection_out(struct io_conn *conn, struct connecting *connect) */ static void connect_failed(struct daemon *daemon, const struct node_id *id, - u32 seconds_waited, const struct wireaddr_internal *addrhint, errcode_t errcode, const char *errfmt, ...) - PRINTF_FMT(6,7); + PRINTF_FMT(5,6); static void connect_failed(struct daemon *daemon, const struct node_id *id, - u32 seconds_waited, const struct wireaddr_internal *addrhint, errcode_t errcode, const char *errfmt, ...) @@ -639,27 +627,18 @@ static void connect_failed(struct daemon *daemon, u8 *msg; va_list ap; char *errmsg; - u32 wait_seconds; va_start(ap, errfmt); errmsg = tal_vfmt(tmpctx, errfmt, ap); va_end(ap); - /* Wait twice as long to reconnect, between min and max. */ - wait_seconds = seconds_waited * 2; - if (wait_seconds > MAX_WAIT_SECONDS) - wait_seconds = MAX_WAIT_SECONDS; - if (wait_seconds < INITIAL_WAIT_SECONDS) - wait_seconds = INITIAL_WAIT_SECONDS; - status_peer_debug(id, "Failed connected out: %s", errmsg); /* lightningd may have a connect command waiting to know what * happened. We leave it to lightningd to decide if it wants to try - * again, with the wait_seconds as a hint of how long before - * asking. */ + * again. */ msg = towire_connectd_connect_failed(NULL, id, errcode, errmsg, - wait_seconds, addrhint); + addrhint); daemon_conn_send(daemon->master, take(msg)); } @@ -801,7 +780,6 @@ static void try_connect_one_addr(struct connecting *connect) /* Out of addresses? */ if (connect->addrnum == tal_count(connect->addrs)) { connect_failed(connect->daemon, &connect->id, - connect->seconds_waited, connect->addrhint, CONNECT_ALL_ADDRESSES_FAILED, "All addresses failed: %s", connect->errors); @@ -1748,7 +1726,6 @@ static void add_gossip_addrs(struct wireaddr_internal **addrs, * caller so it's marginal. */ static void try_connect_peer(struct daemon *daemon, const struct node_id *id, - u32 seconds_waited, struct wireaddr *gossip_addrs, struct wireaddr_internal *addrhint STEALS) { @@ -1805,7 +1782,7 @@ static void try_connect_peer(struct daemon *daemon, /* Still no address? Fail immediately. Lightningd can still choose * to retry; an address may get gossiped or appear on the DNS seed. */ if (tal_count(addrs) == 0) { - connect_failed(daemon, id, seconds_waited, addrhint, + connect_failed(daemon, id, addrhint, CONNECT_NO_KNOWN_ADDRESS, "Unable to connect, no address known for peer"); return; @@ -1822,7 +1799,6 @@ static void try_connect_peer(struct daemon *daemon, * errors which occur. We miss it in a few places; would be nice to * fix! */ connect->connstate = "Connection establishment"; - connect->seconds_waited = seconds_waited; connect->addrhint = tal_steal(connect, addrhint); connect->errors = tal_strdup(connect, ""); connect->conn = NULL; @@ -1837,16 +1813,14 @@ static void try_connect_peer(struct daemon *daemon, static void connect_to_peer(struct daemon *daemon, const u8 *msg) { struct node_id id; - u32 seconds_waited; struct wireaddr_internal *addrhint; struct wireaddr *addrs; if (!fromwire_connectd_connect_to_peer(tmpctx, msg, - &id, &seconds_waited, - &addrs, &addrhint)) + &id, &addrs, &addrhint)) master_badmsg(WIRE_CONNECTD_CONNECT_TO_PEER, msg); - try_connect_peer(daemon, &id, seconds_waited, addrs, addrhint); + try_connect_peer(daemon, &id, addrs, addrhint); } /* lightningd tells us a peer should be disconnected. */ diff --git a/connectd/connectd_wire.csv b/connectd/connectd_wire.csv index 6536be9ea..1dc10bb92 100644 --- a/connectd/connectd_wire.csv +++ b/connectd/connectd_wire.csv @@ -47,7 +47,6 @@ msgdata,connectd_activate_reply,failmsg,?wirestring, # Master -> connectd: connect to a peer. msgtype,connectd_connect_to_peer,2001 msgdata,connectd_connect_to_peer,id,node_id, -msgdata,connectd_connect_to_peer,seconds_waited,u32, msgdata,connectd_connect_to_peer,len,u32, msgdata,connectd_connect_to_peer,addrs,wireaddr,len msgdata,connectd_connect_to_peer,addrhint,?wireaddr_internal, @@ -57,7 +56,6 @@ msgtype,connectd_connect_failed,2020 msgdata,connectd_connect_failed,id,node_id, msgdata,connectd_connect_failed,failcode,errcode_t, msgdata,connectd_connect_failed,failreason,wirestring, -msgdata,connectd_connect_failed,seconds_to_delay,u32, msgdata,connectd_connect_failed,addrhint,?wireaddr_internal, # Connectd -> master: we got a peer. diff --git a/lightningd/channel.c b/lightningd/channel.c index 2f7955191..caede53b9 100644 --- a/lightningd/channel.c +++ b/lightningd/channel.c @@ -929,9 +929,7 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str) } } -static void channel_err(struct channel *channel, - const char *why, - bool delay_reconnect) +static void channel_err(struct channel *channel, const char *why) { log_info(channel->log, "Peer transient failure in %s: %s", channel_state_name(channel), why); @@ -944,8 +942,6 @@ static void channel_err(struct channel *channel, return; } #endif - channel->peer->delay_reconnect = delay_reconnect; - channel_set_owner(channel, NULL); } @@ -954,7 +950,7 @@ void channel_fail_transient_delayreconnect(struct channel *channel, const char * va_list ap; va_start(ap, fmt); - channel_err(channel, tal_vfmt(tmpctx, fmt, ap), true); + channel_err(channel, tal_vfmt(tmpctx, fmt, ap)); va_end(ap); } @@ -963,7 +959,7 @@ void channel_fail_transient(struct channel *channel, const char *fmt, ...) va_list ap; va_start(ap, fmt); - channel_err(channel, tal_vfmt(tmpctx, fmt, ap), false); + channel_err(channel, tal_vfmt(tmpctx, fmt, ap)); va_end(ap); } diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c index 36ec83bdc..a52117e90 100644 --- a/lightningd/connect_control.c +++ b/lightningd/connect_control.c @@ -247,7 +247,6 @@ AUTODATA(json_command, &connect_command); struct delayed_reconnect { struct lightningd *ld; struct node_id id; - u32 seconds_delayed; struct wireaddr_internal *addrhint; }; @@ -265,7 +264,6 @@ static void gossipd_got_addrs(struct subd *subd, connectmsg = towire_connectd_connect_to_peer(NULL, &d->id, - d->seconds_delayed, addrs, d->addrhint); subd_send_msg(d->ld->connectd, take(connectmsg)); @@ -280,7 +278,6 @@ static void do_connect(struct delayed_reconnect *d) subd_req(d, d->ld->gossip, take(msg), -1, 0, gossipd_got_addrs, d); } -/* peer may be NULL here */ static void try_connect(const tal_t *ctx, struct lightningd *ld, const struct node_id *id, @@ -293,7 +290,6 @@ static void try_connect(const tal_t *ctx, d = tal(ctx, struct delayed_reconnect); d->ld = ld; d->id = *id; - d->seconds_delayed = seconds_delay; d->addrhint = tal_dup_or_null(d, struct wireaddr_internal, addrhint); if (!seconds_delay) { @@ -316,6 +312,7 @@ static void try_connect(const tal_t *ctx, "in %u seconds", seconds_delay)); } + peer->last_connect_attempt = time_now(); } /* We fuzz the timer by up to 1 second, to avoid getting into @@ -326,18 +323,34 @@ static void try_connect(const tal_t *ctx, do_connect, d)); } +/*~ In C convention, constants are UPPERCASE macros. Not everything needs to + * be a constant, but it soothes the programmer's conscience to encapsulate + * arbitrary decisions like these in one place. */ +#define INITIAL_WAIT_SECONDS 1 +#define MAX_WAIT_SECONDS 300 + void try_reconnect(const tal_t *ctx, struct peer *peer, - u32 seconds_delay, const struct wireaddr_internal *addrhint) { if (!peer->ld->reconnect) return; + /* Did we last attempt to connect recently? Enter backoff mode. */ + if (time_less(time_between(time_now(), peer->last_connect_attempt), + time_from_sec(MAX_WAIT_SECONDS * 2))) { + u32 max = DEV_FAST_RECONNECT(peer->ld->dev_fast_reconnect, + 3, MAX_WAIT_SECONDS); + peer->reconnect_delay *= 2; + if (peer->reconnect_delay > max) + peer->reconnect_delay = max; + } else + peer->reconnect_delay = INITIAL_WAIT_SECONDS; + try_connect(ctx, peer->ld, &peer->id, - seconds_delay, + peer->reconnect_delay, addrhint); } @@ -346,7 +359,6 @@ static void connect_failed(struct lightningd *ld, const struct node_id *id, errcode_t errcode, const char *errmsg, - const u32 *seconds_to_delay, const struct wireaddr_internal *addrhint) { struct peer *peer; @@ -361,17 +373,10 @@ static void connect_failed(struct lightningd *ld, /* If we have an active channel, then reconnect. */ peer = peer_by_id(ld, id); if (peer && peer_any_active_channel(peer, NULL)) { - u32 delay; - if (seconds_to_delay) - delay = *seconds_to_delay; - else if (peer->delay_reconnect) - delay = DEV_FAST_RECONNECT(ld->dev_fast_reconnect, 3, 60); - else - delay = 1; - log_peer_debug(ld->log, id, "Reconnecting in %u seconds", delay); - try_reconnect(peer, peer, delay, addrhint); + try_reconnect(peer, peer, addrhint); } else - log_peer_debug(ld->log, id, "Not reconnecting: %s", peer ? "no active channel" : "no channels"); + log_peer_debug(ld->log, id, "Not reconnecting: %s", + peer ? "no active channel" : "no channels"); } void connect_failed_disconnect(struct lightningd *ld, @@ -379,7 +384,7 @@ void connect_failed_disconnect(struct lightningd *ld, const struct wireaddr_internal *addrhint) { connect_failed(ld, id, CONNECT_DISCONNECTED_DURING, - "disconnected during connection", NULL, addrhint); + "disconnected during connection", addrhint); } static void handle_connect_failed(struct lightningd *ld, const u8 *msg) @@ -387,15 +392,14 @@ static void handle_connect_failed(struct lightningd *ld, const u8 *msg) struct node_id id; errcode_t errcode; char *errmsg; - u32 seconds_to_delay; struct wireaddr_internal *addrhint; if (!fromwire_connectd_connect_failed(tmpctx, msg, &id, &errcode, &errmsg, - &seconds_to_delay, &addrhint)) + &addrhint)) fatal("Connect gave bad CONNECTD_CONNECT_FAILED message %s", tal_hex(msg, msg)); - connect_failed(ld, &id, errcode, errmsg, &seconds_to_delay, addrhint); + connect_failed(ld, &id, errcode, errmsg, addrhint); } void connect_succeeded(struct lightningd *ld, const struct peer *peer, diff --git a/lightningd/connect_control.h b/lightningd/connect_control.h index 5eaa3c98d..6b64b72d0 100644 --- a/lightningd/connect_control.h +++ b/lightningd/connect_control.h @@ -20,7 +20,6 @@ void connectd_activate(struct lightningd *ld); void try_reconnect(const tal_t *ctx, struct peer *peer, - u32 seconds_delay, const struct wireaddr_internal *addrhint); void connect_succeeded(struct lightningd *ld, const struct peer *peer, bool incoming, diff --git a/lightningd/dual_open_control.c b/lightningd/dual_open_control.c index 71133b8fb..05faaa19d 100644 --- a/lightningd/dual_open_control.c +++ b/lightningd/dual_open_control.c @@ -1296,7 +1296,7 @@ wallet_commit_channel(struct lightningd *ld, * reconnect because no channels are active. But the subd * just made it active! */ if (!any_active && channel->peer->connected == PEER_DISCONNECTED) { - try_reconnect(channel->peer, channel->peer, 1, + try_reconnect(channel->peer, channel->peer, &channel->peer->addr); } diff --git a/lightningd/opening_control.c b/lightningd/opening_control.c index effdd07d9..3ba43fac1 100644 --- a/lightningd/opening_control.c +++ b/lightningd/opening_control.c @@ -240,7 +240,7 @@ wallet_commit_channel(struct lightningd *ld, * reconnect because no channels are active. But the subd * just made it active! */ if (!any_active && channel->peer->connected == PEER_DISCONNECTED) { - try_reconnect(channel->peer, channel->peer, 1, + try_reconnect(channel->peer, channel->peer, &channel->peer->addr); } diff --git a/lightningd/options.c b/lightningd/options.c index be6c159c4..54f5213b1 100644 --- a/lightningd/options.c +++ b/lightningd/options.c @@ -695,7 +695,7 @@ static void dev_register_opts(struct lightningd *ld) "Disable automatic reconnect-attempts by this node, but accept incoming"); opt_register_noarg("--dev-fast-reconnect", opt_set_bool, &ld->dev_fast_reconnect, - "Make default reconnect delay 3 (not 60) seconds"); + "Make max default reconnect delay 3 (not 300) seconds"); opt_register_noarg("--dev-fail-on-subdaemon-fail", opt_set_bool, &ld->dev_subdaemon_fail, opt_hidden); diff --git a/lightningd/peer_control.c b/lightningd/peer_control.c index bdfb45628..edf124035 100644 --- a/lightningd/peer_control.c +++ b/lightningd/peer_control.c @@ -100,7 +100,8 @@ struct peer *new_peer(struct lightningd *ld, u64 dbid, list_head_init(&peer->channels); peer->direction = node_id_idx(&peer->ld->id, &peer->id); peer->connected = PEER_DISCONNECTED; - peer->delay_reconnect = false; + peer->last_connect_attempt.ts.tv_sec + = peer->last_connect_attempt.ts.tv_nsec = 0; #if DEVELOPER peer->ignore_htlcs = false; #endif @@ -1330,7 +1331,6 @@ void peer_connected(struct lightningd *ld, const u8 *msg) /* We mark peer in "connecting" state until hooks have passed. */ assert(peer->connected == PEER_DISCONNECTED); peer->connected = PEER_CONNECTING; - peer->delay_reconnect = false; /* Update peer address and direction */ peer->addr = hook_payload->addr; @@ -2025,9 +2025,14 @@ static void setup_peer(struct peer *peer, u32 delay) } /* Make sure connectd knows to try reconnecting. */ - if (connect) - try_reconnect(peer, peer, delay, &peer->addr); - + if (connect) { + /* To delay, make it seem like we just connected. */ + if (delay > 0) { + peer->reconnect_delay = delay; + peer->last_connect_attempt = time_now(); + } + try_reconnect(peer, peer, &peer->addr); + } } void setup_peers(struct lightningd *ld) diff --git a/lightningd/peer_control.h b/lightningd/peer_control.h index 4609ca54b..5bdcef569 100644 --- a/lightningd/peer_control.h +++ b/lightningd/peer_control.h @@ -30,8 +30,10 @@ struct peer { /* Connection counter from connectd. */ u64 connectd_counter; - /* Did we fail badly last time? Don't reconnect too fast. */ - bool delay_reconnect; + /* Last reconnect: if it's recent, we delay by reconnect_delay, + * doubling each time. */ + struct timeabs last_connect_attempt; + u32 reconnect_delay; /* Our channels */ struct list_head channels; diff --git a/lightningd/test/run-invoice-select-inchan.c b/lightningd/test/run-invoice-select-inchan.c index 699564d66..915a2c55c 100644 --- a/lightningd/test/run-invoice-select-inchan.c +++ b/lightningd/test/run-invoice-select-inchan.c @@ -764,7 +764,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED, /* Generated stub for try_reconnect */ void try_reconnect(const tal_t *ctx UNNEEDED, struct peer *peer UNNEEDED, - u32 seconds_delay UNNEEDED, const struct wireaddr_internal *addrhint UNNEEDED) { fprintf(stderr, "try_reconnect called!\n"); abort(); } /* Generated stub for version */ diff --git a/tests/test_connection.py b/tests/test_connection.py index 7c4548aea..a570a0497 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -3242,11 +3242,12 @@ def test_feerate_spam(node_factory, chainparams): l1.daemon.wait_for_log('peer_out WIRE_UPDATE_FEE', timeout=5) -@pytest.mark.developer("need dev-feerate") +@pytest.mark.developer("need dev-feerate, dev-fast-reconnect") def test_feerate_stress(node_factory, executor): # Third node makes HTLC traffic less predictable. l1, l2, l3 = node_factory.line_graph(3, opts={'commit-time': 100, - 'may_reconnect': True}) + 'may_reconnect': True, + 'dev-fast-reconnect': None}) l1.pay(l2, 10**9 // 2) scid12 = l1.get_channel_scid(l2) diff --git a/wallet/test/run-wallet.c b/wallet/test/run-wallet.c index 8dd612b80..0715963d6 100644 --- a/wallet/test/run-wallet.c +++ b/wallet/test/run-wallet.c @@ -839,7 +839,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED, /* Generated stub for try_reconnect */ void try_reconnect(const tal_t *ctx UNNEEDED, struct peer *peer UNNEEDED, - u32 seconds_delay UNNEEDED, const struct wireaddr_internal *addrhint UNNEEDED) { fprintf(stderr, "try_reconnect called!\n"); abort(); } /* Generated stub for watch_txid */