From: Kamoltat Date: Mon, 5 Dec 2022 18:46:11 +0000 (+0000) Subject: mon/Elector.cc: Compress peer >= rank_size sanity check into send_peer_ping X-Git-Tag: v16.2.11~30^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F49444%2Fhead;p=ceph.git mon/Elector.cc: Compress peer >= rank_size sanity check into send_peer_ping Problem: Currently, https://github.com/ceph/ceph/pull/44993 failed to completely fix: https://tracker.ceph.com/issues/50089 There are certain code paths such as Elector::handle_ping → Elector::begin_peer_ping → Elector::send_peer_ping. that when a monitor is removed before shutdown in Cephadm it can hit the assert failure. Solution: Therefore, we have to enforce sanity checks on all code paths. We do this by compressing the `peer >= rank_size` sanity check into `send_peer_ping`. We also make `send_peer_ping` return true/false caller of `send_peer_ping` would drop itself if recieves a `false`. Fixes: https://tracker.ceph.com/issues/58155 Signed-off-by: Kamoltat (cherry picked from commit 27d499f8854fb5e73b103635ef227bbfdff3afb4) Conflicts: src/mon/Elector.cc - change ssize() to size() --- diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 6f7fba553c9a..0329fcb0301b 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -466,17 +466,26 @@ void Elector::begin_peer_ping(int peer) live_pinging.insert(peer); dead_pinging.erase(peer); peer_acked_ping[peer] = ceph_clock_now(); - send_peer_ping(peer); + if (!send_peer_ping(peer)) return; mon->timer.add_event_after(ping_timeout / PING_DIVISOR, new C_MonContext{mon, [this, peer](int) { ping_check(peer); }}); } -void Elector::send_peer_ping(int peer, const utime_t *n) +bool Elector::send_peer_ping(int peer, const utime_t *n) { dout(10) << __func__ << " to peer " << peer << dendl; - + if (peer >= mon->monmap->ranks.size()) { + // Monitor no longer exists in the monmap, + // therefore, we shouldn't ping this monitor + // since we cannot lookup the address! + dout(5) << "peer: " << peer << " >= ranks_size: " + << mon->monmap->ranks.size() << " ... dropping to prevent " + << "https://tracker.ceph.com/issues/50089" << dendl; + live_pinging.erase(peer); + return false; + } utime_t now; if (n != NULL) { now = *n; @@ -486,20 +495,13 @@ void Elector::send_peer_ping(int peer, const utime_t *n) MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl()); mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer)); peer_sent_ping[peer] = now; + return true; } void Elector::ping_check(int peer) { dout(20) << __func__ << " to peer " << peer << dendl; - if (peer >= mon->monmap->ranks.size()) { - // Monitor no longer exists in the monmap, - // therefore, we shouldn't ping this monitor - // since we cannot lookup the address! - dout(20) << __func__ << "peer >= ranks_size" << dendl; - live_pinging.erase(peer); - return; - } if (!live_pinging.count(peer) && !dead_pinging.count(peer)) { dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl; @@ -516,7 +518,7 @@ void Elector::ping_check(int peer) } if (acked_ping == newest_ping) { - send_peer_ping(peer, &now); + if (!send_peer_ping(peer, &now)) return; } mon->timer.add_event_after(ping_timeout / PING_DIVISOR, @@ -590,7 +592,7 @@ void Elector::handle_ping(MonOpRequestRef op) } utime_t now = ceph_clock_now(); if (now - m->stamp > ping_timeout / PING_DIVISOR) { - send_peer_ping(prank, &now); + if (!send_peer_ping(prank, &now)) return; } break; } diff --git a/src/mon/Elector.h b/src/mon/Elector.h index d14dbb5c4ce9..241f7e0ebf8f 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -188,7 +188,7 @@ class Elector : public ElectionOwner, RankProvider { * Send a ping to the specified peer. * @n optional time that we will use instead of calling ceph_clock_now() */ - void send_peer_ping(int peer, const utime_t *n=NULL); + bool send_peer_ping(int peer, const utime_t *n=NULL); /** * Check the state of pinging the specified peer. This is our * "tick" for heartbeating; scheduled by itself and begin_peer_ping().