From 7139a232d26beef441ffbc13bc087baab3505ea8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Feb 2016 10:51:53 -0500 Subject: [PATCH] osd: handle boot racing with NOUP set + clear Resolve this race: - osd sends boot message - mon sets NOUP - mon receives and drops boot message - mon later clears NOUP -> osd still waiting by restarting the boot process if is_booting() is true and we see the NOUP flag cleared. This implies we sent the boot before NOUP was set (we don't send boot while it is) and the mon probably dropped our request. If it didn't, we'll go through another boot cycle, but that is rare, mostly harmless, and unavoidable (there's no easy way to tell whether our message was dropped or not). Signed-off-by: Sage Weil --- src/osd/OSD.cc | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 78e581e9395ce..26b5d8d2b902a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6617,6 +6617,10 @@ void OSD::handle_osd_map(MOSDMap *m) map_lock.get_write(); + bool do_shutdown = false; + bool do_restart = false; + bool network_error = false; + // advance through the new maps for (epoch_t cur = start; cur <= superblock.newest_map; cur++) { dout(10) << " advance to epoch " << cur << " (<= newest " << superblock.newest_map << ")" << dendl; @@ -6643,6 +6647,21 @@ void OSD::handle_osd_map(MOSDMap *m) } } + if (osdmap->test_flag(CEPH_OSDMAP_NOUP) && + !newmap->test_flag(CEPH_OSDMAP_NOUP)) { + dout(10) << __func__ << " NOUP flag cleared in " << newmap->get_epoch() + << dendl; + if (is_booting()) { + // this captures the case where we sent the boot message while + // NOUP was being set on the mon and our boot request was + // dropped, and then later it is cleared. it imperfectly + // handles the case where our original boot message was not + // dropped and we restart even though we might have booted, but + // that is harmless (boot will just take slightly longer). 
+ do_restart = true; + } + } + osdmap = newmap; superblock.current_epoch = cur; @@ -6679,9 +6698,6 @@ void OSD::handle_osd_map(MOSDMap *m) } } - bool do_shutdown = false; - bool do_restart = false; - bool network_error = false; if (osdmap->get_epoch() > 0 && is_active()) { if (!osdmap->exists(whoami)) { -- 2.39.5