]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: refactor heartbeat health check
authorxie xingguo <xie.xingguo@zte.com.cn>
Mon, 16 May 2016 05:50:28 +0000 (13:50 +0800)
committerDavid Zafman <dzafman@redhat.com>
Fri, 18 Oct 2019 17:44:33 +0000 (10:44 -0700)
The original logic will reuse the timestamp which we send pings to
the specific heartbeat peer to update the last_rx_front[back] field
on receiving the corresponding replies, which later shall be honoured
as the exact time we succeed in getting the corresponding replies and
is used to calculate the heartbeat latency and determine whether the
relevant peer is dead.

However this is not accurate enough as there may be a delay between
we receive a reply and call heartbeat_check(). We can eliminate
the delay by introducing a map to track the ping-history here,
each entry of which consists of three elements:

1. "tx_time", worked as the map key, indicates the exact timestamp
   we send pings.
2. "deadline", indicates we shall receive all replies by then,
   otherwise we consider this peer as "dead".
3. "unacknowledged", indicates how many pings for the corresponding
   ping are still unacknowledged. The initial value is 2(as we send
   two pings from the front and back side for each peer).

We insert an item into the map on every time we sending out a ping, and
decrease the "unacknowledged" counter by 1 each time we get a reply from
the tracked ping. If "unacknowledged" drops to 0, we know all the replies
have been successfully collected and we can safely erase the relevant
item from the map as well as the earlier sent ones,  if there is any.

By comparing the current timestamp with the oldest deadline, we can now
make a much accurate decision about whether the corresponding peer is
healthy or not. And by setting last_rx_* to the timestamp we receiving
the reply, the lower bound when we can no longer hear a reply from the
corresponding connection is also much clear now.

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
(cherry picked from commit 477774ceee42641f6d6884536462f92567bfea11)

Conflicts:
src/osd/OSD.cc (send_still_alive() has 1 less argument)

src/osd/OSD.cc
src/osd/OSD.h

index f46aca8c032f65c9951cfc903c6200ced4baed8e..690252eb41c64a8a758d36ebd1f21bd67e908854 100644 (file)
@@ -4613,46 +4613,71 @@ void OSD::handle_osd_ping(MOSDPing *m)
     {
       map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
       if (i != heartbeat_peers.end()) {
-       if (m->get_connection() == i->second.con_back) {
-         dout(25) << "handle_osd_ping got reply from osd." << from
-                  << " first_tx " << i->second.first_tx
-                  << " last_tx " << i->second.last_tx
-                  << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
-                  << " last_rx_front " << i->second.last_rx_front
-                  << dendl;
-         i->second.last_rx_back = m->stamp;
-         // if there is no front con, set both stamps.
-         if (i->second.con_front == NULL)
-           i->second.last_rx_front = m->stamp;
-       } else if (m->get_connection() == i->second.con_front) {
-         dout(25) << "handle_osd_ping got reply from osd." << from
-                  << " first_tx " << i->second.first_tx
-                  << " last_tx " << i->second.last_tx
-                  << " last_rx_back " << i->second.last_rx_back
-                  << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
-                  << dendl;
-         i->second.last_rx_front = m->stamp;
-       }
+        auto acked = i->second.ping_history.find(m->stamp);
+        if (acked != i->second.ping_history.end()) {
+          utime_t now = ceph_clock_now();
+          int &unacknowledged = acked->second.second;
+          if (m->get_connection() == i->second.con_back) {
+            dout(25) << "handle_osd_ping got reply from osd." << from
+                     << " first_tx " << i->second.first_tx
+                     << " last_tx " << i->second.last_tx
+                     << " last_rx_back " << i->second.last_rx_back << " -> " << now
+                     << " last_rx_front " << i->second.last_rx_front
+                     << dendl;
+            i->second.last_rx_back = now;
+            assert(unacknowledged > 0);
+            --unacknowledged;
+            // if there is no front con, set both stamps.
+            if (i->second.con_front == NULL) {
+              i->second.last_rx_front = now;
+              assert(unacknowledged > 0);
+              --unacknowledged;
+            }
+          } else if (m->get_connection() == i->second.con_front) {
+            dout(25) << "handle_osd_ping got reply from osd." << from
+                     << " first_tx " << i->second.first_tx
+                     << " last_tx " << i->second.last_tx
+                     << " last_rx_back " << i->second.last_rx_back
+                     << " last_rx_front " << i->second.last_rx_front << " -> " << now
+                     << dendl;
+            i->second.last_rx_front = now;
+            assert(unacknowledged > 0);
+            --unacknowledged;
+          }
 
-        utime_t cutoff = ceph_clock_now();
-        cutoff -= cct->_conf->osd_heartbeat_grace;
-        if (i->second.is_healthy(cutoff)) {
-          // Cancel false reports
-         auto failure_queue_entry = failure_queue.find(from);
-         if (failure_queue_entry != failure_queue.end()) {
-            dout(10) << "handle_osd_ping canceling queued "
-                     << "failure report for osd." << from << dendl;
-            failure_queue.erase(failure_queue_entry);
+          if (unacknowledged == 0) {
+            // succeeded in getting all replies
+            dout(25) << "handle_osd_ping got all replies from osd." << from
+                     << " , erase pending ping(sent at " << m->stamp << ")"
+                     << " and older pending ping(s)"
+                     << dendl;
+            i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
           }
 
-         auto failure_pending_entry = failure_pending.find(from);
-         if (failure_pending_entry != failure_pending.end()) {
-            dout(10) << "handle_osd_ping canceling in-flight "
-                     << "failure report for osd." << from << dendl;
-            send_still_alive(curmap->get_epoch(),
-                            failure_pending_entry->second.second);
-            failure_pending.erase(failure_pending_entry);
+          if (i->second.is_healthy(now)) {
+            // Cancel false reports
+            auto failure_queue_entry = failure_queue.find(from);
+            if (failure_queue_entry != failure_queue.end()) {
+              dout(10) << "handle_osd_ping canceling queued "
+                       << "failure report for osd." << from << dendl;
+              failure_queue.erase(failure_queue_entry);
+            }
+
+            auto failure_pending_entry = failure_pending.find(from);
+            if (failure_pending_entry != failure_pending.end()) {
+              dout(10) << "handle_osd_ping canceling in-flight "
+                       << "failure report for osd." << from << dendl;
+              send_still_alive(curmap->get_epoch(),
+                               failure_pending_entry->second.second);
+              failure_pending.erase(failure_pending_entry);
+            }
           }
+        } else {
+          // old replies, deprecated by newly sent pings.
+          dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
+                   << ") is found, treat as covered by newly sent pings "
+                   << "and ignore"
+                   << dendl;
         }
       }
 
@@ -4704,9 +4729,7 @@ void OSD::heartbeat_check()
   assert(heartbeat_lock.is_locked());
   utime_t now = ceph_clock_now();
 
-  // check for heartbeat replies (move me elsewhere?)
-  utime_t cutoff = now;
-  cutoff -= cct->_conf->osd_heartbeat_grace;
+  // check for incoming heartbeats (move me elsewhere?)
   for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
        p != heartbeat_peers.end();
        ++p) {
@@ -4723,19 +4746,26 @@ void OSD::heartbeat_check()
             << " last_rx_back " << p->second.last_rx_back
             << " last_rx_front " << p->second.last_rx_front
             << dendl;
-    if (p->second.is_unhealthy(cutoff)) {
+    if (p->second.is_unhealthy(now)) {
+      utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
       if (p->second.last_rx_back == utime_t() ||
          p->second.last_rx_front == utime_t()) {
-       derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
-            << " osd." << p->first << " ever on either front or back, first ping sent "
-            << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
+        derr << "heartbeat_check: no reply from "
+             << p->second.con_front->get_peer_addr().get_sockaddr()
+             << " osd." << p->first
+             << " ever on either front or back, first ping sent "
+             << p->second.first_tx
+             << " (oldest deadline " << oldest_deadline << ")"
+             << dendl;
        // fail
        failure_queue[p->first] = p->second.last_tx;
       } else {
-       derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
+       derr << "heartbeat_check: no reply from "
+             << p->second.con_front->get_peer_addr().get_sockaddr()
             << " osd." << p->first << " since back " << p->second.last_rx_back
             << " front " << p->second.last_rx_front
-            << " (cutoff " << cutoff << ")" << dendl;
+            << " (oldest deadline " << oldest_deadline << ")"
+             << dendl;
        // fail
        failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
       }
@@ -4781,6 +4811,8 @@ void OSD::heartbeat()
   service.check_full_status(ratio);
 
   utime_t now = ceph_clock_now();
+  utime_t deadline = now;
+  deadline += cct->_conf->osd_heartbeat_grace;
 
   // send heartbeats
   for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
@@ -4790,6 +4822,8 @@ void OSD::heartbeat()
     i->second.last_tx = now;
     if (i->second.first_tx == utime_t())
       i->second.first_tx = now;
+    i->second.ping_history[now] = make_pair(deadline,
+      HeartbeatInfo::HEARTBEAT_MAX_CONN);
     dout(30) << "heartbeat sending ping to osd." << peer << dendl;
     i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
                                          service.get_osdmap_epoch(),
@@ -5451,13 +5485,12 @@ bool OSD::_is_healthy()
 
   if (is_waiting_for_healthy()) {
     Mutex::Locker l(heartbeat_lock);
-    utime_t cutoff = ceph_clock_now();
-    cutoff -= cct->_conf->osd_heartbeat_grace;
+    utime_t now = ceph_clock_now();
     int num = 0, up = 0;
     for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
         p != heartbeat_peers.end();
         ++p) {
-      if (p->second.is_healthy(cutoff))
+      if (p->second.is_healthy(now))
        ++up;
       ++num;
     }
index 45a6d4fef65e2fc18947ef9d79c24e4a94fe6bd5..115d1804992cc8613cb6c283d781fe1d445dd5eb 100644 (file)
@@ -1512,20 +1512,26 @@ private:
     utime_t last_rx_front;  ///< last time we got a ping reply on the front side
     utime_t last_rx_back;   ///< last time we got a ping reply on the back side
     epoch_t epoch;      ///< most recent epoch we wanted this peer
+    /// number of connections we send and receive heartbeat pings/replies
+    const static int HEARTBEAT_MAX_CONN = 2;
+    /// history of inflight pings, arranging by timestamp we sent
+    /// send time -> deadline -> remaining replies
+    map<utime_t, pair<utime_t, int>> ping_history;
+
+    bool is_unhealthy(utime_t now) {
+      if (ping_history.empty()) {
+        /// we haven't sent a ping yet or we have got all replies,
+        /// in either way we are safe and healthy for now
+        return false;
+      }
 
-    bool is_unhealthy(utime_t cutoff) const {
-      return
-       ! ((last_rx_front > cutoff ||
-           (last_rx_front == utime_t() && (last_tx == utime_t() ||
-                                           first_tx > cutoff))) &&
-          (last_rx_back > cutoff ||
-           (last_rx_back == utime_t() && (last_tx == utime_t() ||
-                                          first_tx > cutoff))));
-    }
-    bool is_healthy(utime_t cutoff) const {
-      return last_rx_front > cutoff && last_rx_back > cutoff;
+      utime_t oldest_deadline = ping_history.begin()->second.first;
+      return now > oldest_deadline;
     }
 
+    bool is_healthy(utime_t now) {
+      return !is_unhealthy(now);
+    }
   };
   /// state attached to outgoing heartbeat connections
   struct HeartbeatSession : public RefCountedObject {