gossipd: keep reaching struct only when we're actively connecting, and don't retry

1. Lifetime of 'struct reaching' now only while we're actively doing connect.
2. Always free after a single attempt: if it's an important peer, retry
   on a timer.
3. Have a single response message to master, rather than relying on
   peer_connected on success and other msgs on failure.
4. If we are actively connecting and we get another command for the same
   id, just increment the counter.

The result is much simpler in the master daemon, and much nicer for
reconnection: a connect command gets an immediate response rather than
waiting for 10 retries.  Even for an important peer, a new connect
command fires off another reconnect attempt immediately, unless we're
already actively connecting.

This removes exponential backoff: that's restored in the next patch.
It also doesn't yet handle multiple addresses for a single peer.
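
To make the new lifetime rules concrete, here is a minimal, self-contained
C sketch.  It is not the real gossipd code: the struct contents and the
helpers connect_to_peer() and attempt_done() are invented purely to
illustrate the rules above (one attempt per struct, duplicate commands only
bump a counter, exactly one result message, timer-driven retry for
important peers).

/* Hypothetical sketch only, NOT the real gossipd code: all names here
 * are invented for illustration of the lifetime rules described above. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_REACHING 8

struct reaching {
    char id[67];        /* peer id we are trying to reach (hex pubkey) */
    size_t attempts;    /* connect commands received while in flight */
};

/* One entry per peer we are *actively* connecting to right now. */
static struct reaching *active[MAX_REACHING];

static struct reaching *find_reaching(const char *id)
{
    for (size_t i = 0; i < MAX_REACHING; i++)
        if (active[i] && strcmp(active[i]->id, id) == 0)
            return active[i];
    return NULL;
}

/* Master asked us to connect to 'id'. */
static void connect_to_peer(const char *id)
{
    struct reaching *r = find_reaching(id);

    /* Already actively connecting: just bump the counter and return. */
    if (r) {
        r->attempts++;
        return;
    }

    r = calloc(1, sizeof(*r));
    snprintf(r->id, sizeof(r->id), "%s", id);
    r->attempts = 1;
    for (size_t i = 0; i < MAX_REACHING; i++) {
        if (!active[i]) {
            active[i] = r;
            break;
        }
    }
    /* ...kick off the single connection attempt here... */
}

/* One attempt finished, success or failure: report the single result
 * (stands in for the one response message to the master), free the
 * struct either way, and only for an important peer arrange a
 * timer-driven retry; no retry loop lives inside the struct any more. */
static void attempt_done(const char *id, bool ok, bool important)
{
    struct reaching *r = find_reaching(id);
    if (!r)
        return;

    printf("connect result for %s (%zu request(s)): %s\n",
           id, r->attempts, ok ? "connected" : "failed");

    for (size_t i = 0; i < MAX_REACHING; i++)
        if (active[i] == r)
            active[i] = NULL;
    free(r);

    if (!ok && important) {
        /* Real code would arm a timer here that calls connect_to_peer()
         * again when it fires. */
    }
}

int main(void)
{
    connect_to_peer("02aaaa");           /* starts an attempt */
    connect_to_peer("02aaaa");           /* already in flight: counter only */
    attempt_done("02aaaa", false, true); /* freed; important => retry on timer */
    return 0;
}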

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Author: Rusty Russell
Date:   2018-04-26 14:20:58 +09:30
Parent: 20e3a18af5
Commit: 72c459dd6c
8 changed files with 191 additions and 233 deletions

@@ -78,7 +78,9 @@ def wait_forget_channels(node):
     """This node is closing all of its channels, check we are forgetting them
     """
     node.daemon.wait_for_log(r'onchaind complete, forgetting peer')
-    assert node.rpc.listpeers()['peers'] == []
+    # May have reconnected, but should merely be gossiping.
+    for peer in node.rpc.listpeers()['peers']:
+        assert peer['state'] == 'GOSSIPING'
     assert node.db_query("SELECT * FROM channels") == []
@@ -616,6 +618,11 @@ class LightningDTests(BaseLightningDTests):
         assert len(l1.rpc.listpeers()) == 1
         assert len(l2.rpc.listpeers()) == 1
 
+        # Should get reasonable error if unknown addr for peer.
+        self.assertRaisesRegex(ValueError,
+                               "No address known",
+                               l1.rpc.connect, '032cf15d1ad9c4a08d26eab1918f732d8ef8fdc6abb9640bf3db174372c491304e')
+
     def test_connect_standard_addr(self):
         """Test standard node@host:port address
         """
@@ -635,6 +642,34 @@ class LightningDTests(BaseLightningDTests):
         # ret = l1.rpc.connect("{}@[::1]:{}".format(l3.info['id'], l3.info['port']))
         # assert ret['id'] == l3.info['id']
 
+    def test_reconnect_channel_peers(self):
+        l1 = self.node_factory.get_node(may_reconnect=True)
+        l2 = self.node_factory.get_node(may_reconnect=True)
+        l1.rpc.connect(l2.info['id'], 'localhost', l2.info['port'])
+        self.fund_channel(l1, l2, 10**6)
+        l2.stop()
+        l2.daemon.start()
+
+        # Should reconnect.
+        wait_for(lambda: l1.rpc.listpeers(l2.info['id'])['peers'][0]['connected'])
+        wait_for(lambda: l2.rpc.listpeers(l1.info['id'])['peers'][0]['connected'])
+        # Connect command should succeed.
+        l1.rpc.connect(l2.info['id'], 'localhost', l2.info['port'])
+
+        # Stop l2 and wait for l1 to notice.
+        l2.stop()
+        wait_for(lambda: not l1.rpc.listpeers(l2.info['id'])['peers'][0]['connected'])
+
+        # Now should fail.
+        self.assertRaisesRegex(ValueError,
+                               "Connection refused",
+                               l1.rpc.connect, l2.info['id'], 'localhost', l2.info['port'])
+
+        # It should now succeed when it restarts.
+        l2.daemon.start()
+        l1.rpc.connect(l2.info['id'], 'localhost', l2.info['port'])
+
     def test_balance(self):
         l1, l2 = self.connect()
@@ -2961,18 +2996,24 @@ class LightningDTests(BaseLightningDTests):
     @unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
     def test_disconnect(self):
-        # These should all make us fail, and retry.
-        # FIXME: Configure short timeout for reconnect!
+        # These should all make us fail
         disconnects = ['-WIRE_INIT',
                        '@WIRE_INIT',
                        '+WIRE_INIT']
         l1 = self.node_factory.get_node(disconnect=disconnects)
         l2 = self.node_factory.get_node()
+        self.assertRaises(ValueError, l1.rpc.connect,
+                          l2.info['id'], 'localhost', l2.info['port'])
+        self.assertRaises(ValueError, l1.rpc.connect,
+                          l2.info['id'], 'localhost', l2.info['port'])
+        self.assertRaises(ValueError, l1.rpc.connect,
+                          l2.info['id'], 'localhost', l2.info['port'])
         l1.rpc.connect(l2.info['id'], 'localhost', l2.info['port'])
 
         # Should have 3 connect fails.
         for d in disconnects:
-            l1.daemon.wait_for_log('Failed connected out for {}, will try again'
+            l1.daemon.wait_for_log('Failed connected out for {}'
                                    .format(l2.info['id']))
 
         # Should still only have one peer!