From d08c1ff199f21e842dad6a55dc96fc1b00ca51cc Mon Sep 17 00:00:00 2001 From: xie xingguo Date: Mon, 3 Feb 2020 21:04:05 +0800 Subject: [PATCH] osd/OSD: prevent down osds from immediately rejoining the cluster In 114c65fc I posted a work-around to fix a heartbeat split-brain case but it really looks to me now like I am missing some other cases where an immediate attempt to rejoin is bad, like when the network actually isn't working properly rather than being predictably manipulated by an admin. This patch instead slows down the unconditional rejoin attempt and, in particular, makes sure that we don't try to immediately rejoin the cluster when an osd has just been marked down by mon. Signed-off-by: xie xingguo --- src/osd/OSD.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7975f6d28d3..a7176fde661 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -6417,9 +6417,10 @@ bool OSD::_is_healthy() if (is_waiting_for_healthy()) { utime_t now = ceph_clock_now(); - if (osd_markdown_log.size() <= 1) { - dout(5) << __func__ << " first time marked as down," - << " try reboot unconditionally" << dendl; + if (osd_markdown_log.empty()) { + dout(5) << __func__ << " force returning true since last markdown" + << " was " << cct->_conf->osd_max_markdown_period + << "s ago" << dendl; return true; } std::lock_guard l(heartbeat_lock); -- 2.39.5