diff --git a/connectd/connectd.c b/connectd/connectd.c index dbf03d039..ae514cc28 100644 --- a/connectd/connectd.c +++ b/connectd/connectd.c @@ -56,13 +56,6 @@ #define HSM_FD 3 #define GOSSIPCTL_FD 4 -/*~ In C convention, constants are UPPERCASE macros. Not everything needs to - * be a constant, but it soothes the programmer's conscience to encapsulate - * arbitrary decisions like these in one place. */ -#define MAX_CONNECT_ATTEMPTS 10 -#define INITIAL_WAIT_SECONDS 1 -#define MAX_WAIT_SECONDS 300 - /* Peers we're trying to reach: we iterate through addrs until we succeed * or fail. */ struct connecting { @@ -88,9 +81,6 @@ struct connecting { /* Accumulated errors */ char *errors; - - /* How many seconds did we wait this time? */ - u32 seconds_waited; }; /*~ C programs should generally be written bottom-to-top, with the root @@ -623,15 +613,13 @@ struct io_plan *connection_out(struct io_conn *conn, struct connecting *connect) */ static void connect_failed(struct daemon *daemon, const struct node_id *id, - u32 seconds_waited, const struct wireaddr_internal *addrhint, errcode_t errcode, const char *errfmt, ...) - PRINTF_FMT(6,7); + PRINTF_FMT(5,6); static void connect_failed(struct daemon *daemon, const struct node_id *id, - u32 seconds_waited, const struct wireaddr_internal *addrhint, errcode_t errcode, const char *errfmt, ...) @@ -639,27 +627,18 @@ static void connect_failed(struct daemon *daemon, u8 *msg; va_list ap; char *errmsg; - u32 wait_seconds; va_start(ap, errfmt); errmsg = tal_vfmt(tmpctx, errfmt, ap); va_end(ap); - /* Wait twice as long to reconnect, between min and max. */ - wait_seconds = seconds_waited * 2; - if (wait_seconds > MAX_WAIT_SECONDS) - wait_seconds = MAX_WAIT_SECONDS; - if (wait_seconds < INITIAL_WAIT_SECONDS) - wait_seconds = INITIAL_WAIT_SECONDS; - status_peer_debug(id, "Failed connected out: %s", errmsg); /* lightningd may have a connect command waiting to know what * happened. We leave it to lightningd to decide if it wants to try - * again, with the wait_seconds as a hint of how long before - * asking. */ + * again. */ msg = towire_connectd_connect_failed(NULL, id, errcode, errmsg, - wait_seconds, addrhint); + addrhint); daemon_conn_send(daemon->master, take(msg)); } @@ -801,7 +780,6 @@ static void try_connect_one_addr(struct connecting *connect) /* Out of addresses? */ if (connect->addrnum == tal_count(connect->addrs)) { connect_failed(connect->daemon, &connect->id, - connect->seconds_waited, connect->addrhint, CONNECT_ALL_ADDRESSES_FAILED, "All addresses failed: %s", connect->errors); @@ -1748,7 +1726,6 @@ static void add_gossip_addrs(struct wireaddr_internal **addrs, * caller so it's marginal. */ static void try_connect_peer(struct daemon *daemon, const struct node_id *id, - u32 seconds_waited, struct wireaddr *gossip_addrs, struct wireaddr_internal *addrhint STEALS) { @@ -1805,7 +1782,7 @@ static void try_connect_peer(struct daemon *daemon, /* Still no address? Fail immediately. Lightningd can still choose * to retry; an address may get gossiped or appear on the DNS seed. */ if (tal_count(addrs) == 0) { - connect_failed(daemon, id, seconds_waited, addrhint, + connect_failed(daemon, id, addrhint, CONNECT_NO_KNOWN_ADDRESS, "Unable to connect, no address known for peer"); return; @@ -1822,7 +1799,6 @@ static void try_connect_peer(struct daemon *daemon, * errors which occur. We miss it in a few places; would be nice to * fix! */ connect->connstate = "Connection establishment"; - connect->seconds_waited = seconds_waited; connect->addrhint = tal_steal(connect, addrhint); connect->errors = tal_strdup(connect, ""); connect->conn = NULL; @@ -1837,16 +1813,14 @@ static void try_connect_peer(struct daemon *daemon, static void connect_to_peer(struct daemon *daemon, const u8 *msg) { struct node_id id; - u32 seconds_waited; struct wireaddr_internal *addrhint; struct wireaddr *addrs; if (!fromwire_connectd_connect_to_peer(tmpctx, msg, - &id, &seconds_waited, - &addrs, &addrhint)) + &id, &addrs, &addrhint)) master_badmsg(WIRE_CONNECTD_CONNECT_TO_PEER, msg); - try_connect_peer(daemon, &id, seconds_waited, addrs, addrhint); + try_connect_peer(daemon, &id, addrs, addrhint); } /* lightningd tells us a peer should be disconnected. */ diff --git a/connectd/connectd_wire.csv b/connectd/connectd_wire.csv index 6536be9ea..1dc10bb92 100644 --- a/connectd/connectd_wire.csv +++ b/connectd/connectd_wire.csv @@ -47,7 +47,6 @@ msgdata,connectd_activate_reply,failmsg,?wirestring, # Master -> connectd: connect to a peer. msgtype,connectd_connect_to_peer,2001 msgdata,connectd_connect_to_peer,id,node_id, -msgdata,connectd_connect_to_peer,seconds_waited,u32, msgdata,connectd_connect_to_peer,len,u32, msgdata,connectd_connect_to_peer,addrs,wireaddr,len msgdata,connectd_connect_to_peer,addrhint,?wireaddr_internal, @@ -57,7 +56,6 @@ msgtype,connectd_connect_failed,2020 msgdata,connectd_connect_failed,id,node_id, msgdata,connectd_connect_failed,failcode,errcode_t, msgdata,connectd_connect_failed,failreason,wirestring, -msgdata,connectd_connect_failed,seconds_to_delay,u32, msgdata,connectd_connect_failed,addrhint,?wireaddr_internal, # Connectd -> master: we got a peer. diff --git a/lightningd/channel.c b/lightningd/channel.c index 2f7955191..caede53b9 100644 --- a/lightningd/channel.c +++ b/lightningd/channel.c @@ -929,9 +929,7 @@ void channel_set_billboard(struct channel *channel, bool perm, const char *str) } } -static void channel_err(struct channel *channel, - const char *why, - bool delay_reconnect) +static void channel_err(struct channel *channel, const char *why) { log_info(channel->log, "Peer transient failure in %s: %s", channel_state_name(channel), why); @@ -944,8 +942,6 @@ static void channel_err(struct channel *channel, return; } #endif - channel->peer->delay_reconnect = delay_reconnect; - channel_set_owner(channel, NULL); } @@ -954,7 +950,7 @@ void channel_fail_transient_delayreconnect(struct channel *channel, const char * va_list ap; va_start(ap, fmt); - channel_err(channel, tal_vfmt(tmpctx, fmt, ap), true); + channel_err(channel, tal_vfmt(tmpctx, fmt, ap)); va_end(ap); } @@ -963,7 +959,7 @@ void channel_fail_transient(struct channel *channel, const char *fmt, ...) va_list ap; va_start(ap, fmt); - channel_err(channel, tal_vfmt(tmpctx, fmt, ap), false); + channel_err(channel, tal_vfmt(tmpctx, fmt, ap)); va_end(ap); } diff --git a/lightningd/connect_control.c b/lightningd/connect_control.c index 36ec83bdc..a52117e90 100644 --- a/lightningd/connect_control.c +++ b/lightningd/connect_control.c @@ -247,7 +247,6 @@ AUTODATA(json_command, &connect_command); struct delayed_reconnect { struct lightningd *ld; struct node_id id; - u32 seconds_delayed; struct wireaddr_internal *addrhint; }; @@ -265,7 +264,6 @@ static void gossipd_got_addrs(struct subd *subd, connectmsg = towire_connectd_connect_to_peer(NULL, &d->id, - d->seconds_delayed, addrs, d->addrhint); subd_send_msg(d->ld->connectd, take(connectmsg)); @@ -280,7 +278,6 @@ static void do_connect(struct delayed_reconnect *d) subd_req(d, d->ld->gossip, take(msg), -1, 0, gossipd_got_addrs, d); } -/* peer may be NULL here */ static void try_connect(const tal_t *ctx, struct lightningd *ld, const struct node_id *id, @@ -293,7 +290,6 @@ static void try_connect(const tal_t *ctx, d = tal(ctx, struct delayed_reconnect); d->ld = ld; d->id = *id; - d->seconds_delayed = seconds_delay; d->addrhint = tal_dup_or_null(d, struct wireaddr_internal, addrhint); if (!seconds_delay) { @@ -316,6 +312,7 @@ static void try_connect(const tal_t *ctx, "in %u seconds", seconds_delay)); } + peer->last_connect_attempt = time_now(); } /* We fuzz the timer by up to 1 second, to avoid getting into @@ -326,18 +323,34 @@ static void try_connect(const tal_t *ctx, do_connect, d)); } +/*~ In C convention, constants are UPPERCASE macros. Not everything needs to + * be a constant, but it soothes the programmer's conscience to encapsulate + * arbitrary decisions like these in one place. */ +#define INITIAL_WAIT_SECONDS 1 +#define MAX_WAIT_SECONDS 300 + void try_reconnect(const tal_t *ctx, struct peer *peer, - u32 seconds_delay, const struct wireaddr_internal *addrhint) { if (!peer->ld->reconnect) return; + /* Did we last attempt to connect recently? Enter backoff mode. */ + if (time_less(time_between(time_now(), peer->last_connect_attempt), + time_from_sec(MAX_WAIT_SECONDS * 2))) { + u32 max = DEV_FAST_RECONNECT(peer->ld->dev_fast_reconnect, + 3, MAX_WAIT_SECONDS); + peer->reconnect_delay *= 2; + if (peer->reconnect_delay > max) + peer->reconnect_delay = max; + } else + peer->reconnect_delay = INITIAL_WAIT_SECONDS; + try_connect(ctx, peer->ld, &peer->id, - seconds_delay, + peer->reconnect_delay, addrhint); } @@ -346,7 +359,6 @@ static void connect_failed(struct lightningd *ld, const struct node_id *id, errcode_t errcode, const char *errmsg, - const u32 *seconds_to_delay, const struct wireaddr_internal *addrhint) { struct peer *peer; @@ -361,17 +373,10 @@ static void connect_failed(struct lightningd *ld, /* If we have an active channel, then reconnect. */ peer = peer_by_id(ld, id); if (peer && peer_any_active_channel(peer, NULL)) { - u32 delay; - if (seconds_to_delay) - delay = *seconds_to_delay; - else if (peer->delay_reconnect) - delay = DEV_FAST_RECONNECT(ld->dev_fast_reconnect, 3, 60); - else - delay = 1; - log_peer_debug(ld->log, id, "Reconnecting in %u seconds", delay); - try_reconnect(peer, peer, delay, addrhint); + try_reconnect(peer, peer, addrhint); } else - log_peer_debug(ld->log, id, "Not reconnecting: %s", peer ? "no active channel" : "no channels"); + log_peer_debug(ld->log, id, "Not reconnecting: %s", + peer ? "no active channel" : "no channels"); } void connect_failed_disconnect(struct lightningd *ld, @@ -379,7 +384,7 @@ void connect_failed_disconnect(struct lightningd *ld, const struct wireaddr_internal *addrhint) { connect_failed(ld, id, CONNECT_DISCONNECTED_DURING, - "disconnected during connection", NULL, addrhint); + "disconnected during connection", addrhint); } static void handle_connect_failed(struct lightningd *ld, const u8 *msg) @@ -387,15 +392,14 @@ static void handle_connect_failed(struct lightningd *ld, const u8 *msg) struct node_id id; errcode_t errcode; char *errmsg; - u32 seconds_to_delay; struct wireaddr_internal *addrhint; if (!fromwire_connectd_connect_failed(tmpctx, msg, &id, &errcode, &errmsg, - &seconds_to_delay, &addrhint)) + &addrhint)) fatal("Connect gave bad CONNECTD_CONNECT_FAILED message %s", tal_hex(msg, msg)); - connect_failed(ld, &id, errcode, errmsg, &seconds_to_delay, addrhint); + connect_failed(ld, &id, errcode, errmsg, addrhint); } void connect_succeeded(struct lightningd *ld, const struct peer *peer, diff --git a/lightningd/connect_control.h b/lightningd/connect_control.h index 5eaa3c98d..6b64b72d0 100644 --- a/lightningd/connect_control.h +++ b/lightningd/connect_control.h @@ -20,7 +20,6 @@ void connectd_activate(struct lightningd *ld); void try_reconnect(const tal_t *ctx, struct peer *peer, - u32 seconds_delay, const struct wireaddr_internal *addrhint); void connect_succeeded(struct lightningd *ld, const struct peer *peer, bool incoming, diff --git a/lightningd/dual_open_control.c b/lightningd/dual_open_control.c index 71133b8fb..05faaa19d 100644 --- a/lightningd/dual_open_control.c +++ b/lightningd/dual_open_control.c @@ -1296,7 +1296,7 @@ wallet_commit_channel(struct lightningd *ld, * reconnect because no channels are active. But the subd * just made it active! */ if (!any_active && channel->peer->connected == PEER_DISCONNECTED) { - try_reconnect(channel->peer, channel->peer, 1, + try_reconnect(channel->peer, channel->peer, &channel->peer->addr); } diff --git a/lightningd/opening_control.c b/lightningd/opening_control.c index effdd07d9..3ba43fac1 100644 --- a/lightningd/opening_control.c +++ b/lightningd/opening_control.c @@ -240,7 +240,7 @@ wallet_commit_channel(struct lightningd *ld, * reconnect because no channels are active. But the subd * just made it active! */ if (!any_active && channel->peer->connected == PEER_DISCONNECTED) { - try_reconnect(channel->peer, channel->peer, 1, + try_reconnect(channel->peer, channel->peer, &channel->peer->addr); } diff --git a/lightningd/options.c b/lightningd/options.c index be6c159c4..54f5213b1 100644 --- a/lightningd/options.c +++ b/lightningd/options.c @@ -695,7 +695,7 @@ static void dev_register_opts(struct lightningd *ld) "Disable automatic reconnect-attempts by this node, but accept incoming"); opt_register_noarg("--dev-fast-reconnect", opt_set_bool, &ld->dev_fast_reconnect, - "Make default reconnect delay 3 (not 60) seconds"); + "Make max default reconnect delay 3 (not 300) seconds"); opt_register_noarg("--dev-fail-on-subdaemon-fail", opt_set_bool, &ld->dev_subdaemon_fail, opt_hidden); diff --git a/lightningd/peer_control.c b/lightningd/peer_control.c index bdfb45628..edf124035 100644 --- a/lightningd/peer_control.c +++ b/lightningd/peer_control.c @@ -100,7 +100,8 @@ struct peer *new_peer(struct lightningd *ld, u64 dbid, list_head_init(&peer->channels); peer->direction = node_id_idx(&peer->ld->id, &peer->id); peer->connected = PEER_DISCONNECTED; - peer->delay_reconnect = false; + peer->last_connect_attempt.ts.tv_sec + = peer->last_connect_attempt.ts.tv_nsec = 0; #if DEVELOPER peer->ignore_htlcs = false; #endif @@ -1330,7 +1331,6 @@ void peer_connected(struct lightningd *ld, const u8 *msg) /* We mark peer in "connecting" state until hooks have passed. */ assert(peer->connected == PEER_DISCONNECTED); peer->connected = PEER_CONNECTING; - peer->delay_reconnect = false; /* Update peer address and direction */ peer->addr = hook_payload->addr; @@ -2025,9 +2025,14 @@ static void setup_peer(struct peer *peer, u32 delay) } /* Make sure connectd knows to try reconnecting. */ - if (connect) - try_reconnect(peer, peer, delay, &peer->addr); - + if (connect) { + /* To delay, make it seem like we just connected. */ + if (delay > 0) { + peer->reconnect_delay = delay; + peer->last_connect_attempt = time_now(); + } + try_reconnect(peer, peer, &peer->addr); + } } void setup_peers(struct lightningd *ld) diff --git a/lightningd/peer_control.h b/lightningd/peer_control.h index 4609ca54b..5bdcef569 100644 --- a/lightningd/peer_control.h +++ b/lightningd/peer_control.h @@ -30,8 +30,10 @@ struct peer { /* Connection counter from connectd. */ u64 connectd_counter; - /* Did we fail badly last time? Don't reconnect too fast. */ - bool delay_reconnect; + /* Last reconnect: if it's recent, we delay by reconnect_delay, + * doubling each time. */ + struct timeabs last_connect_attempt; + u32 reconnect_delay; /* Our channels */ struct list_head channels; diff --git a/lightningd/test/run-invoice-select-inchan.c b/lightningd/test/run-invoice-select-inchan.c index 699564d66..915a2c55c 100644 --- a/lightningd/test/run-invoice-select-inchan.c +++ b/lightningd/test/run-invoice-select-inchan.c @@ -764,7 +764,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED, /* Generated stub for try_reconnect */ void try_reconnect(const tal_t *ctx UNNEEDED, struct peer *peer UNNEEDED, - u32 seconds_delay UNNEEDED, const struct wireaddr_internal *addrhint UNNEEDED) { fprintf(stderr, "try_reconnect called!\n"); abort(); } /* Generated stub for version */ diff --git a/tests/test_connection.py b/tests/test_connection.py index 7c4548aea..a570a0497 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -3242,11 +3242,12 @@ def test_feerate_spam(node_factory, chainparams): l1.daemon.wait_for_log('peer_out WIRE_UPDATE_FEE', timeout=5) -@pytest.mark.developer("need dev-feerate") +@pytest.mark.developer("need dev-feerate, dev-fast-reconnect") def test_feerate_stress(node_factory, executor): # Third node makes HTLC traffic less predictable. l1, l2, l3 = node_factory.line_graph(3, opts={'commit-time': 100, - 'may_reconnect': True}) + 'may_reconnect': True, + 'dev-fast-reconnect': None}) l1.pay(l2, 10**9 // 2) scid12 = l1.get_channel_scid(l2) diff --git a/wallet/test/run-wallet.c b/wallet/test/run-wallet.c index 8dd612b80..0715963d6 100644 --- a/wallet/test/run-wallet.c +++ b/wallet/test/run-wallet.c @@ -839,7 +839,6 @@ u8 *towire_warningfmt(const tal_t *ctx UNNEEDED, /* Generated stub for try_reconnect */ void try_reconnect(const tal_t *ctx UNNEEDED, struct peer *peer UNNEEDED, - u32 seconds_delay UNNEEDED, const struct wireaddr_internal *addrhint UNNEEDED) { fprintf(stderr, "try_reconnect called!\n"); abort(); } /* Generated stub for watch_txid */