From: Kamoltat Date: Wed, 3 Aug 2022 14:22:49 +0000 (+0000) Subject: mon/OSDMonitor: Added extra check before mon.go_recovery_stretch_mode() X-Git-Tag: v16.2.11~63^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=94dc97005bf6406d13c3fd5a3664a1fe2f7efec9;p=ceph.git mon/OSDMonitor: Added extra check before mon.go_recovery_stretch_mode() Problem: There are certain scenarios in a degraded stretched cluster where the monitor will try to go into the function ``Monitor::go_recovery_stretch_mode()`` that will lead to a `ceph_assert`. Solution: Make sure ``dead_mon_buckets.size() == 0`` in ``OSDMonitor::update_from_paxos()`` before going into ``Monitor::go_recovery_stretch_mode()``. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2104207 Signed-off-by: Kamoltat (cherry picked from commit d95c41aa0c5c9bf9c3ac9bc4012f57e556ae4a81) --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 97034994546a..b88813e8af55 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -960,10 +960,12 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val("mon_stretch_cluster_recovery_ratio") << dendl; if (prev_num_up_osd < osdmap.num_up_osd && (osdmap.num_up_osd / (double)osdmap.num_osd) > - cct->_conf.get_val("mon_stretch_cluster_recovery_ratio")) { + cct->_conf.get_val("mon_stretch_cluster_recovery_ratio") && + mon.dead_mon_buckets.size() == 0) { // TODO: This works for 2-site clusters when the OSD maps are appropriately // trimmed and everything is "normal" but not if you have a lot of out OSDs // you're ignoring or in some really degenerate failure cases + dout(10) << "Enabling recovery stretch mode in this map" << dendl; mon.go_recovery_stretch_mode(); }