From b5fcd54ef02c50854be315ec95d19848d7b11f68 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 24 Jul 2018 15:48:59 +0930
Subject: [PATCH] channeld: don't read from gossipfd while we're reconnecting.

That was the cause of the bad gossip order failures: gossipd thought our
channel was live, but the other end didn't receive message last time.

Now gossipd doesn't use fd to kill us (connectd tells master to do so), we
can implement read_peer_msg_nogossip().

Fixes: #1706
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 channeld/channel.c       |  9 +++++++--
 common/read_peer_msg.c   | 15 +++++++--------
 common/read_peer_msg.h   |  9 +++++++++
 tests/test_lightningd.py |  2 --
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/channeld/channel.c b/channeld/channel.c
index 8cffb5c76..5d14358b7 100644
--- a/channeld/channel.c
+++ b/channeld/channel.c
@@ -1884,8 +1884,13 @@ static void peer_reconnect(struct peer *peer)
 
 	peer_billboard(false, "Sent reestablish, waiting for theirs");
 
-	/* Read until they say something interesting */
-	while ((msg = channeld_read_peer_msg(peer)) == NULL)
+	/* Read until they say something interesting (don't forward
+	 * gossip to them yet: we might try sending channel_update
+	 * before we've reestablished channel). */
+	while ((msg = read_peer_msg_nogossip(peer, &peer->cs,
+					     &peer->channel_id,
+					     channeld_send_reply,
+					     peer)) == NULL)
 		clean_tmpctx();
 
 	if (!fromwire_channel_reestablish(msg, &channel_id,
diff --git a/common/read_peer_msg.c b/common/read_peer_msg.c
index 9a3896895..95ab912b9 100644
--- a/common/read_peer_msg.c
+++ b/common/read_peer_msg.c
@@ -56,19 +56,18 @@ u8 *read_peer_msg_(const tal_t *ctx,
 
 	FD_ZERO(&readfds);
 	FD_SET(peer_fd, &readfds);
-	FD_SET(gossip_fd, &readfds);
+	if (gossip_fd >= 0)
+		FD_SET(gossip_fd, &readfds);
 
 	select(peer_fd > gossip_fd ? peer_fd + 1 : gossip_fd + 1,
 	       &readfds, NULL, NULL, NULL);
 
-	if (FD_ISSET(gossip_fd, &readfds)) {
-		/* gossipd uses this to kill us, so not a surprise if it
-		   happens. */
+	if (gossip_fd >= 0 && FD_ISSET(gossip_fd, &readfds)) {
 		msg = wire_sync_read(NULL, gossip_fd);
-		if (!msg) {
-			status_debug("Error reading gossip msg");
-			peer_failed_connection_lost();
-		}
+		if (!msg)
+			status_failed(STATUS_FAIL_GOSSIP_IO,
+				      "Error reading gossip msg: %s",
+				      strerror(errno));
 
 		handle_gossip_msg_(msg, peer_fd, cs, send_reply, arg);
 		return NULL;
diff --git a/common/read_peer_msg.h b/common/read_peer_msg.h
index c39ae8be6..b0f433af5 100644
--- a/common/read_peer_msg.h
+++ b/common/read_peer_msg.h
@@ -26,6 +26,15 @@ struct channel_id;
 					   const u8 *),			\
 		       arg)
 
+/* Like the above, but don't read from GOSSIP_FD */
+#define read_peer_msg_nogossip(ctx, cs, chanid, send_reply, arg)	\
+	read_peer_msg_((ctx), PEER_FD, -1, (cs),			\
+		       (chanid),					\
+		       typesafe_cb_preargs(bool, void *, (send_reply), (arg), \
+					   struct crypto_state *, int,	\
+					   const u8 *),			\
+		       arg)
+
 /* Helper: sync_crypto_write, with extra args it ignores */
 bool sync_crypto_write_arg(struct crypto_state *cs, int fd, const u8 *TAKES,
 			   void *unused);
diff --git a/tests/test_lightningd.py b/tests/test_lightningd.py
index ecc5e9595..c04d7faa3 100644
--- a/tests/test_lightningd.py
+++ b/tests/test_lightningd.py
@@ -3565,8 +3565,6 @@ class LightningDTests(BaseLightningDTests):
         # Just to be sure, second openingd hand over to channeld.
         l2.daemon.wait_for_log('lightning_openingd.*REPLY WIRE_OPENING_FUNDEE_REPLY with 2 fds')
 
-    # FIXME: bad gossip order fix is wrapped up in gossipd/welcomed: see #1706
-    @flaky
     @unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
     def test_reconnect_normal(self):
         # Should reconnect fine even if locked message gets lost.