]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
osd/OSD: prevent down osds from immediately rejoining the cluster
author: xie xingguo <xie.xingguo@zte.com.cn>
Mon, 3 Feb 2020 13:04:05 +0000 (21:04 +0800)
committer: xie xingguo <xie.xingguo@zte.com.cn>
Mon, 3 Feb 2020 13:35:05 +0000 (21:35 +0800)
In 114c65fc I posted a work-around to fix a heartbeat split-brain case,
but it now looks to me like I missed some other cases where an
immediate attempt to rejoin is bad, such as when the network actually
isn't working properly rather than being predictably manipulated by an
admin.
This patch instead slows down the unconditional rejoin attempt; in
particular, it makes sure that we don't try to immediately rejoin the
cluster when an osd has just been marked down by the mon.

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
src/osd/OSD.cc

index 7975f6d28d3e0ef02ab497e34a8d1a0bb298cee7..a7176fde6613fa32a2fd1bc452e0ea46a70fc44f 100644 (file)
@@ -6417,9 +6417,10 @@ bool OSD::_is_healthy()
 
   if (is_waiting_for_healthy()) {
      utime_t now = ceph_clock_now();
-     if (osd_markdown_log.size() <= 1) {
-       dout(5) << __func__ << " first time marked as down,"
-               << " try reboot unconditionally" << dendl;
+     if (osd_markdown_log.empty()) {
+       dout(5) << __func__ << " force returning true since last markdown"
+               << " was " << cct->_conf->osd_max_markdown_period
+               << "s ago" << dendl;
        return true;
     }
     std::lock_guard l(heartbeat_lock);