From: xie xingguo Date: Fri, 16 Nov 2018 06:54:39 +0000 (+0800) Subject: osd: cancel pending failure reports on re-activating osd X-Git-Tag: v14.1.0~743^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8d8e8a359c66b5767be6a4a2327c5f7097885464;p=ceph.git osd: cancel pending failure reports on re-activating osd To reproduce, construct a cluster with 3 hosts, each containing a single osd only: - cut off osd.1's cluster network, and wait for osd.1 to be marked as down - cut off both osd.2 & osd.3's cluster network ``` Note that there are two possible outputs for the above step: 1. osd.1's failure reports get ignored by the monitor as osd.1 has already been marked as down. Osd.2 & osd.3 stay __up__ as a result. 2. osd.1's failure reports are considered valid. Either osd.2 or osd.3 is marked as __down__. We consider case __2__ only here. ``` - restore osd.1 & osd.2's cluster network Now you get __3__ up osds. The root cause is that the monitor will simply discard any failure reports from dead osds, whereas osds never re-send pending failure reports unless they are re-connecting to monitors. Fix by cancelling any pending failure reports each time an osd is transitioning from dead to active *again*. Signed-off-by: xie xingguo --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 8f3bd8e6afc8..e239e4961896 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -5876,6 +5876,18 @@ void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs monc->send_mon_message(m); } +void OSD::cancel_pending_failures() +{ + std::lock_guard l(heartbeat_lock); + auto it = failure_pending.begin(); + while (it != failure_pending.end()) { + dout(10) << __func__ << " canceling in-flight failure report for osd." 
+ << it->first << dendl; + send_still_alive(osdmap->get_epoch(), it->first, it->second.second); + failure_pending.erase(it++); + } +} + void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now) { const auto& monmap = monc->monmap; @@ -7834,6 +7846,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) // set incarnation so that osd_reqid_t's we generate for our // objecter requests are unique across restarts. service.objecter->set_client_incarnation(osdmap->get_epoch()); + cancel_pending_failures(); } } @@ -7975,14 +7988,7 @@ void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m) if (do_shutdown) { if (network_error) { - std::lock_guard l(heartbeat_lock); - auto it = failure_pending.begin(); - while (it != failure_pending.end()) { - dout(10) << "handle_osd_ping canceling in-flight failure report for osd." - << it->first << dendl; - send_still_alive(osdmap->get_epoch(), it->first, it->second.second); - failure_pending.erase(it++); - } + cancel_pending_failures(); } // trigger shutdown in a different thread dout(0) << __func__ << " shutdown OSD via async signal" << dendl; diff --git a/src/osd/OSD.h b/src/osd/OSD.h index c8e0cea6a81d..3f22ffe8dd0a 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -2008,6 +2008,7 @@ protected: void requeue_failures(); void send_failures(); void send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs); + void cancel_pending_failures(); ceph::coarse_mono_clock::time_point last_sent_beacon; Mutex min_last_epoch_clean_lock{"OSD::min_last_epoch_clean_lock"};