]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/Elector.cc: Compress peer >= rank_size sanity check into send_peer_ping 49259/head
authorKamoltat <ksirivad@redhat.com>
Mon, 5 Dec 2022 18:46:11 +0000 (18:46 +0000)
committerKamoltat <ksirivad@redhat.com>
Thu, 15 Dec 2022 22:11:15 +0000 (22:11 +0000)
Problem:

Currently, https://github.com/ceph/ceph/pull/44993
failed to completely fix:

https://tracker.ceph.com/issues/50089

There are certain code paths such as

Elector::handle_ping → Elector::begin_peer_ping →
Elector::send_peer_ping.

that when a monitor is removed before shutdown in
Cephadm it can hit the assert failure.

Solution:

Therefore, we have to enforce sanity checks on
all code paths.

We do this by compressing the `peer >= rank_size`
sanity check into `send_peer_ping`. We also make
`send_peer_ping` return true/false

caller of `send_peer_ping` would drop itself if
recieves a `false`.

Fixes: https://tracker.ceph.com/issues/58155
Signed-off-by: Kamoltat <ksirivad@redhat.com>
src/mon/Elector.cc
src/mon/Elector.h

index 9ef74d70a4e2392951829fa64811b8ca2972833d..a7221ebfb196410d04ba1ab223ac88bd43fd298d 100644 (file)
@@ -468,17 +468,26 @@ void Elector::begin_peer_ping(int peer)
   live_pinging.insert(peer);
   dead_pinging.erase(peer);
   peer_acked_ping[peer] = ceph_clock_now();
-  send_peer_ping(peer);
+  if (!send_peer_ping(peer)) return;
   mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
                             new C_MonContext{mon, [this, peer](int) {
                                 ping_check(peer);
                               }});
 }
 
-void Elector::send_peer_ping(int peer, const utime_t *n)
+bool Elector::send_peer_ping(int peer, const utime_t *n)
 {
   dout(10) << __func__ << " to peer " << peer << dendl;
-
+  if (peer >= ssize(mon->monmap->ranks)) {
+    // Monitor no longer exists in the monmap,
+    // therefore, we shouldn't ping this monitor
+    // since we cannot lookup the address!
+    dout(5) << "peer: " << peer << " >= ranks_size: "
+      << ssize(mon->monmap->ranks) << " ... dropping to prevent "
+      << "https://tracker.ceph.com/issues/50089" << dendl;
+    live_pinging.erase(peer);
+    return false;
+  }
   utime_t now;
   if (n != NULL) {
     now = *n;
@@ -488,20 +497,13 @@ void Elector::send_peer_ping(int peer, const utime_t *n)
   MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl());
   mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer));
   peer_sent_ping[peer] = now;
+  return true;
 }
 
 void Elector::ping_check(int peer)
 {
   dout(20) << __func__ << " to peer " << peer << dendl;
 
-  if (peer >= ssize(mon->monmap->ranks)) {
-    // Monitor no longer exists in the monmap,
-    // therefore, we shouldn't ping this monitor
-    // since we cannot lookup the address!
-    dout(20) << __func__ << "peer >= ranks_size" << dendl;
-    live_pinging.erase(peer);
-    return;
-  }
   if (!live_pinging.count(peer) &&
       !dead_pinging.count(peer)) {
     dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl;
@@ -518,7 +520,7 @@ void Elector::ping_check(int peer)
   }
 
   if (acked_ping == newest_ping) {
-    send_peer_ping(peer, &now);
+    if (!send_peer_ping(peer, &now)) return;
   }
 
   mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
@@ -599,8 +601,7 @@ void Elector::handle_ping(MonOpRequestRef op)
     dout(30) << "now: " << now << " m->stamp: " << m->stamp << " ping_timeout: "
       << ping_timeout << " PING_DIVISOR: " << PING_DIVISOR << dendl;
     if (now - m->stamp > ping_timeout / PING_DIVISOR) {
-      dout(30) << "(now - m->stamp > ping_timeout / PING_DIVISOR)" << dendl;
-      send_peer_ping(prank, &now);
+      if (!send_peer_ping(prank, &now)) return;
     }
     break;
   }
index be5eee5d4dadaedb48332f362765b1576b265a31..be2f91c0f93dd400954503fef8a40e9d4dbb538c 100644 (file)
@@ -188,7 +188,7 @@ class Elector : public ElectionOwner, RankProvider {
    * Send a ping to the specified peer.
    * @n optional time that we will use instead of calling ceph_clock_now()
    */
-  void send_peer_ping(int peer, const utime_t *n=NULL);
+  bool send_peer_ping(int peer, const utime_t *n=NULL);
   /**
    * Check the state of pinging the specified peer. This is our
    * "tick" for heartbeating; scheduled by itself and begin_peer_ping().