mirror of
https://github.com/aljazceru/lightning.git
synced 2025-12-19 23:24:27 +01:00
gossipd: remove gossip entirely if we hit a problem on load.
The crashes in #2750 are mostly caused by us trying to partially truncate the store. The simplest fix for release is to discard the whole thing if we detect a problem. This is a workaround: it'd be far nicer to try to recover. Fixes: #2750 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
@@ -525,44 +525,6 @@ int gossip_store_readonly_fd(struct gossip_store *gs)
|
|||||||
return fd;
|
return fd;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If we ever truncated, we might have dangling entries. */
|
|
||||||
static void cleanup_truncated_store(struct routing_state *rstate,
|
|
||||||
struct gossip_store *gs,
|
|
||||||
u32 chan_ann_off)
|
|
||||||
{
|
|
||||||
size_t num;
|
|
||||||
u32 index;
|
|
||||||
|
|
||||||
/* channel_announce with no channel_amount. */
|
|
||||||
if (chan_ann_off) {
|
|
||||||
status_unusual("Deleting un-amounted channel_announcement @%u",
|
|
||||||
chan_ann_off);
|
|
||||||
delete_by_index(gs, chan_ann_off, WIRE_CHANNEL_ANNOUNCEMENT);
|
|
||||||
}
|
|
||||||
|
|
||||||
num = 0;
|
|
||||||
while ((index = remove_unfinalized_node_announce(rstate)) != 0) {
|
|
||||||
delete_by_index(gs, index, WIRE_NODE_ANNOUNCEMENT);
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
if (num)
|
|
||||||
status_unusual("Deleted %zu unfinalized node_announcements",
|
|
||||||
num);
|
|
||||||
|
|
||||||
num = 0;
|
|
||||||
while ((index = remove_unupdated_channel_announce(rstate)) != 0) {
|
|
||||||
u32 next;
|
|
||||||
|
|
||||||
/* Delete announcement and channel amount, too */
|
|
||||||
next = delete_by_index(gs, index, WIRE_CHANNEL_ANNOUNCEMENT);
|
|
||||||
delete_by_index(gs, next, WIRE_GOSSIP_STORE_CHANNEL_AMOUNT);
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
if (num)
|
|
||||||
status_unusual("Deleted %zu unupdated channel_announcements",
|
|
||||||
num);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
|
bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
|
||||||
{
|
{
|
||||||
struct gossip_hdr hdr;
|
struct gossip_hdr hdr;
|
||||||
@@ -672,25 +634,36 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
|
|||||||
clean_tmpctx();
|
clean_tmpctx();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (chan_ann) {
|
||||||
|
status_unusual("gossip_store: dangling channel_announcement");
|
||||||
|
goto truncate_nomsg;
|
||||||
|
}
|
||||||
|
|
||||||
|
bad = unfinalized_entries(tmpctx, rstate);
|
||||||
|
if (bad) {
|
||||||
|
status_unusual("gossip_store: %s", bad);
|
||||||
|
goto truncate_nomsg;
|
||||||
|
}
|
||||||
|
|
||||||
/* If last timestamp is within 24 hours, say we're OK. */
|
/* If last timestamp is within 24 hours, say we're OK. */
|
||||||
contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600);
|
contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600);
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
truncate:
|
truncate:
|
||||||
status_unusual("gossip_store: %s (%s) truncating to %"PRIu64,
|
status_unusual("gossip_store: %s (%s) truncating",
|
||||||
bad, tal_hex(msg, msg), gs->len);
|
bad, tal_hex(msg, msg));
|
||||||
|
|
||||||
truncate_nomsg:
|
truncate_nomsg:
|
||||||
/* FIXME: We would like to truncate to known_good, except we would
|
/* FIXME: Debug partial truncate case. */
|
||||||
* miss channel_delete msgs. If we put block numbers into the store
|
if (ftruncate(gs->fd, 1) != 0)
|
||||||
* as we process them, we can know how far we need to roll back if we
|
|
||||||
* truncate the store */
|
|
||||||
if (ftruncate(gs->fd, gs->len) != 0)
|
|
||||||
status_failed(STATUS_FAIL_INTERNAL_ERROR,
|
status_failed(STATUS_FAIL_INTERNAL_ERROR,
|
||||||
"Truncating store: %s", strerror(errno));
|
"Truncating store: %s", strerror(errno));
|
||||||
|
remove_all_gossip(rstate);
|
||||||
|
gs->count = gs->deleted = 0;
|
||||||
|
gs->len = 1;
|
||||||
contents_ok = false;
|
contents_ok = false;
|
||||||
out:
|
out:
|
||||||
gs->writable = true;
|
gs->writable = true;
|
||||||
cleanup_truncated_store(rstate, gs, chan_ann ? chan_ann_off : 0);
|
|
||||||
status_trace("total store load time: %"PRIu64" msec",
|
status_trace("total store load time: %"PRIu64" msec",
|
||||||
time_to_msec(time_between(time_now(), start)));
|
time_to_msec(time_between(time_now(), start)));
|
||||||
status_trace("gossip_store: Read %zu/%zu/%zu/%zu cannounce/cupdate/nannounce/cdelete from store (%zu deleted) in %"PRIu64" bytes",
|
status_trace("gossip_store: Read %zu/%zu/%zu/%zu cannounce/cupdate/nannounce/cdelete from store (%zu deleted) in %"PRIu64" bytes",
|
||||||
|
|||||||
@@ -2583,42 +2583,63 @@ struct timeabs gossip_time_now(const struct routing_state *rstate)
|
|||||||
return time_now();
|
return time_now();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* gossip_store wants to delete any dangling node_announcement msgs */
|
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate)
|
||||||
u32 remove_unfinalized_node_announce(struct routing_state *rstate)
|
|
||||||
{
|
|
||||||
/* We're only interested in node_announcement we caught. */
|
|
||||||
for (;;) {
|
|
||||||
struct pending_node_announce *pna;
|
|
||||||
struct pending_node_map_iter it;
|
|
||||||
|
|
||||||
pna = pending_node_map_first(rstate->pending_node_map, &it);
|
|
||||||
if (!pna)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
/* This will be deleted by the associated unupdated_channel; just
|
|
||||||
* remove from map for now. */
|
|
||||||
pending_node_map_del(rstate->pending_node_map, pna);
|
|
||||||
if (!pna->node_announcement)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
assert(pna->index);
|
|
||||||
return pna->index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* gossip_store wants to delete any dangling channel_announcement msgs */
|
|
||||||
u32 remove_unupdated_channel_announce(struct routing_state *rstate)
|
|
||||||
{
|
{
|
||||||
struct unupdated_channel *uc;
|
struct unupdated_channel *uc;
|
||||||
u64 index;
|
u64 index;
|
||||||
|
struct pending_node_announce *pna;
|
||||||
|
struct pending_node_map_iter it;
|
||||||
|
|
||||||
uc = uintmap_first(&rstate->unupdated_chanmap, &index);
|
uc = uintmap_first(&rstate->unupdated_chanmap, &index);
|
||||||
if (!uc)
|
if (uc)
|
||||||
return 0;
|
return tal_fmt(ctx, "Unupdated channel_announcement at %u",
|
||||||
|
uc->index);
|
||||||
|
|
||||||
assert(uc->index);
|
pna = pending_node_map_first(rstate->pending_node_map, &it);
|
||||||
index = uc->index;
|
if (pna)
|
||||||
|
return tal_fmt(ctx, "Waiting node_announcement at %u",
|
||||||
|
pna->index);
|
||||||
|
|
||||||
tal_free(uc);
|
return NULL;
|
||||||
return index;
|
}
|
||||||
|
|
||||||
|
/* Gossip store was corrupt, forget anything we loaded. */
|
||||||
|
void remove_all_gossip(struct routing_state *rstate)
|
||||||
|
{
|
||||||
|
struct node *n;
|
||||||
|
struct node_map_iter nit;
|
||||||
|
struct chan *c;
|
||||||
|
struct unupdated_channel *uc;
|
||||||
|
u64 index;
|
||||||
|
struct pending_cannouncement *pca;
|
||||||
|
struct pending_cannouncement_map_iter pit;
|
||||||
|
struct pending_node_map_iter pnait;
|
||||||
|
|
||||||
|
/* We don't want them to try to delete from store, so do this
|
||||||
|
* manually. */
|
||||||
|
while ((n = node_map_first(rstate->nodes, &nit)) != NULL) {
|
||||||
|
tal_del_destructor2(n, destroy_node, rstate);
|
||||||
|
if (node_uses_chan_map(n))
|
||||||
|
chan_map_clear(&n->chans.map);
|
||||||
|
node_map_del(rstate->nodes, n);
|
||||||
|
tal_free(n);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Now free all the channels. */
|
||||||
|
while ((c = uintmap_first(&rstate->chanmap, &index)) != NULL) {
|
||||||
|
uintmap_del(&rstate->chanmap, index);
|
||||||
|
|
||||||
|
/* Remove from local_disabled_map if it's there. */
|
||||||
|
chan_map_del(&rstate->local_disabled_map, c);
|
||||||
|
tal_free(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
while ((uc = uintmap_first(&rstate->unupdated_chanmap, &index)) != NULL)
|
||||||
|
tal_free(uc);
|
||||||
|
|
||||||
|
while ((pca = pending_cannouncement_map_first(&rstate->pending_cannouncements, &pit)) != NULL)
|
||||||
|
tal_free(pca);
|
||||||
|
|
||||||
|
/* Freeing unupdated chanmaps should empty this */
|
||||||
|
assert(pending_node_map_first(rstate->pending_node_map, &pnait) == NULL);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -437,12 +437,8 @@ struct wireaddr *read_addresses(const tal_t *ctx, const u8 *ser);
|
|||||||
void remove_channel_from_store(struct routing_state *rstate,
|
void remove_channel_from_store(struct routing_state *rstate,
|
||||||
struct chan *chan);
|
struct chan *chan);
|
||||||
|
|
||||||
/* gossip_store wants to delete any dangling entries immediately after
|
/* Returns an error string if there are unfinalized entries after load */
|
||||||
* load; return 0 if no more, otherwise index into store.
|
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate);
|
||||||
*
|
|
||||||
* Must call remove_unfinalized_node_announce first, because removing
|
|
||||||
* unupdated channels may delete associated node_announcements. */
|
|
||||||
u32 remove_unfinalized_node_announce(struct routing_state *rstate);
|
|
||||||
u32 remove_unupdated_channel_announce(struct routing_state *rstate);
|
|
||||||
|
|
||||||
|
void remove_all_gossip(struct routing_state *rstate);
|
||||||
#endif /* LIGHTNING_GOSSIPD_ROUTING_H */
|
#endif /* LIGHTNING_GOSSIPD_ROUTING_H */
|
||||||
|
|||||||
@@ -945,8 +945,8 @@ def test_gossip_store_load_amount_truncated(node_factory):
|
|||||||
|
|
||||||
l1.start()
|
l1.start()
|
||||||
# May precede the Started msg waited for in 'start'.
|
# May precede the Started msg waited for in 'start'.
|
||||||
wait_for(lambda: l1.daemon.is_in_log(r'Deleting un-amounted channel_announcement @1'))
|
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement'))
|
||||||
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(1 deleted\) in 445 bytes'))
|
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(0 deleted\) in 1 bytes'))
|
||||||
assert not l1.daemon.is_in_log('gossip_store.*truncating')
|
assert not l1.daemon.is_in_log('gossip_store.*truncating')
|
||||||
|
|
||||||
# Extra sanity check if we can.
|
# Extra sanity check if we can.
|
||||||
|
|||||||
Reference in New Issue
Block a user