]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/Elector.cc: Compress peer >= rank_size sanity check into send_peer_ping 49444/head
authorKamoltat <ksirivad@redhat.com>
Mon, 5 Dec 2022 18:46:11 +0000 (18:46 +0000)
committerKamoltat <ksirivad@redhat.com>
Wed, 14 Dec 2022 21:26:29 +0000 (21:26 +0000)
Problem:

Currently, https://github.com/ceph/ceph/pull/44993
failed to completely fix:

https://tracker.ceph.com/issues/50089

There are certain code paths such as

Elector::handle_ping → Elector::begin_peer_ping →
Elector::send_peer_ping.

that when a monitor is removed before shutdown in
Cephadm it can hit the assert failure.

Solution:

Therefore, we have to enforce sanity checks on
all code paths.

We do this by compressing the `peer >= rank_size`
sanity check into `send_peer_ping`. We also make
`send_peer_ping` return true/false

caller of `send_peer_ping` would drop itself if
recieves a `false`.

Fixes: https://tracker.ceph.com/issues/58155
Signed-off-by: Kamoltat <ksirivad@redhat.com>
(cherry picked from commit 27d499f8854fb5e73b103635ef227bbfdff3afb4)

Conflicts:
src/mon/Elector.cc - change ssize() to size()

src/mon/Elector.cc
src/mon/Elector.h

index 6f7fba553c9a868e1468b103b81a37418b51b6e2..0329fcb0301b42bb0fa71f0e28387093e9eb5875 100644 (file)
@@ -466,17 +466,26 @@ void Elector::begin_peer_ping(int peer)
   live_pinging.insert(peer);
   dead_pinging.erase(peer);
   peer_acked_ping[peer] = ceph_clock_now();
-  send_peer_ping(peer);
+  if (!send_peer_ping(peer)) return;
   mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
                             new C_MonContext{mon, [this, peer](int) {
                                 ping_check(peer);
                               }});
 }
 
-void Elector::send_peer_ping(int peer, const utime_t *n)
+bool Elector::send_peer_ping(int peer, const utime_t *n)
 {
   dout(10) << __func__ << " to peer " << peer << dendl;
-
+  if (peer >= mon->monmap->ranks.size()) {
+    // Monitor no longer exists in the monmap,
+    // therefore, we shouldn't ping this monitor
+    // since we cannot lookup the address!
+    dout(5) << "peer: " << peer << " >= ranks_size: "
+      << mon->monmap->ranks.size() << " ... dropping to prevent "
+      << "https://tracker.ceph.com/issues/50089" << dendl;
+    live_pinging.erase(peer);
+    return false;
+  }
   utime_t now;
   if (n != NULL) {
     now = *n;
@@ -486,20 +495,13 @@ void Elector::send_peer_ping(int peer, const utime_t *n)
   MMonPing *ping = new MMonPing(MMonPing::PING, now, peer_tracker.get_encoded_bl());
   mon->messenger->send_to_mon(ping, mon->monmap->get_addrs(peer));
   peer_sent_ping[peer] = now;
+  return true;
 }
 
 void Elector::ping_check(int peer)
 {
   dout(20) << __func__ << " to peer " << peer << dendl;
 
-  if (peer >= mon->monmap->ranks.size()) {
-    // Monitor no longer exists in the monmap,
-    // therefore, we shouldn't ping this monitor
-    // since we cannot lookup the address!
-    dout(20) << __func__ << "peer >= ranks_size" << dendl;
-    live_pinging.erase(peer);
-    return;
-  }
   if (!live_pinging.count(peer) &&
       !dead_pinging.count(peer)) {
     dout(20) << __func__ << peer << " is no longer marked for pinging" << dendl;
@@ -516,7 +518,7 @@ void Elector::ping_check(int peer)
   }
 
   if (acked_ping == newest_ping) {
-    send_peer_ping(peer, &now);
+    if (!send_peer_ping(peer, &now)) return;
   }
 
   mon->timer.add_event_after(ping_timeout / PING_DIVISOR,
@@ -590,7 +592,7 @@ void Elector::handle_ping(MonOpRequestRef op)
     }
     utime_t now = ceph_clock_now();
     if (now - m->stamp > ping_timeout / PING_DIVISOR) {
-      send_peer_ping(prank, &now);
+      if (!send_peer_ping(prank, &now)) return;
     }
     break;
   }
index d14dbb5c4ce9f1d76ab53d1967dc48d159aab65a..241f7e0ebf8fb1bd2190c5615ccf2072d4908c9c 100644 (file)
@@ -188,7 +188,7 @@ class Elector : public ElectionOwner, RankProvider {
    * Send a ping to the specified peer.
    * @n optional time that we will use instead of calling ceph_clock_now()
    */
-  void send_peer_ping(int peer, const utime_t *n=NULL);
+  bool send_peer_ping(int peer, const utime_t *n=NULL);
   /**
    * Check the state of pinging the specified peer. This is our
    * "tick" for heartbeating; scheduled by itself and begin_peer_ping().