From f4bda36a60223d36e2ca6ef29d81d98ff72ec3f6 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Mon, 5 Dec 2022 18:46:11 +0000 Subject: [PATCH] mon/Elector.cc: Compress peer >= rank_size sanity check into send_peer_ping MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Problem: Currently, https://github.com/ceph/ceph/pull/44993 failed to completely fix: https://tracker.ceph.com/issues/50089 There are certain code paths such as Elector::handle_ping → Elector::begin_peer_ping → Elector::send_peer_ping. that when a monitor is removed before shutdown in Cephadm it can hit the assert failure. Solution: Therefore, we have to enforce sanity checks on all code paths. We do this by compressing the `peer >= rank_size` sanity check into `send_peer_ping`. We also make `send_peer_ping` return true/false caller of `send_peer_ping` would drop itself if recieves a `false`. Fixes: https://tracker.ceph.com/issues/58155 Signed-off-by: Kamoltat --- src/mon/Elector.cc | 29 +++++++++++++++-------------- src/mon/Elector.h | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index 9ef74d70a4e..a7221ebfb19 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -468,17 +468,26 @@ void Elector::begin_peer_ping(int peer) live_pinging.insert(peer); dead_pinging.erase(peer); peer_acked_ping[peer] = ceph_clock_now(); - send_peer_ping(peer); + if (!send_peer_ping(peer)) return; mon->timer.add_event_after(ping_timeout / PING_DIVISOR, new C_MonContext{mon, [this, peer](int) { ping_check(peer); }}); } -void Elector::send_peer_ping(int peer, const utime_t *n) +bool Elector::send_peer_ping(int peer, const utime_t *n) { dout(10) << __func__ << " to peer " << peer << dendl; - + if (peer >= ssize(mon->monmap->ranks)) { + // Monitor no longer exists in the monmap, + // therefore, we shouldn't ping this monitor + // since we cannot lookup the address! + dout(5) << "peer: " << peer << " >= ranks_size: " + << ssize(mon->monmap->ranks) << " ... dropping to prevent " + << "https://tracker.ceph.com/issues/50089" << dendl; + live_pinging.erase(peer); + return false; + } utime_t now; if (n != NULL) { now = *n; @@ -488,20 +497,13 @@ void Elector::send_peer_ping(int peer, const utime_t *n) MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl()); mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer)); peer_sent_ping[peer] = now; + return true; } void Elector::ping_check(int peer) { dout(20) << __func__ << " to peer " << peer << dendl; - if (peer >= ssize(mon->monmap->ranks)) { - // Monitor no longer exists in the monmap, - // therefore, we shouldn't ping this monitor - // since we cannot lookup the address! - dout(20) << __func__ << "peer >= ranks_size" << dendl; - live_pinging.erase(peer); - return; - } if (!live_pinging.count(peer) && !dead_pinging.count(peer)) { dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl; @@ -518,7 +520,7 @@ void Elector::ping_check(int peer) } if (acked_ping == newest_ping) { - send_peer_ping(peer, &now); + if (!send_peer_ping(peer, &now)) return; } mon->timer.add_event_after(ping_timeout / PING_DIVISOR, @@ -599,8 +601,7 @@ void Elector::handle_ping(MonOpRequestRef op) dout(30) << "now: " << now << " m->stamp: " << m->stamp << " ping_timeout: " << ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl; if (now - m->stamp > ping_timeout / PING_DIVISOR) { - dout(30) << "(now - m->stamp > ping_timeout / PING_DIVISOR)" << dendl; - send_peer_ping(prank, &now); + if (!send_peer_ping(prank, &now)) return; } break; } diff --git a/src/mon/Elector.h b/src/mon/Elector.h index be5eee5d4da..be2f91c0f93 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -188,7 +188,7 @@ class Elector : public ElectionOwner, RankProvider { * Send a ping to the specified peer. * @n optional time that we will use instead of calling ceph_clock_now() */ - void send_peer_ping(int peer, const utime_t *n=NULL); + bool send_peer_ping(int peer, const utime_t *n=NULL); /** * Check the state of pinging the specified peer. This is our * "tick" for heartbeating; scheduled by itself and begin_peer_ping(). -- 2.39.5