crimson/osd: heartbeat: use first ping sent time as the unresponsive-since time

author Yingxin Cheng <yingxin.cheng@intel.com>

Thu, 26 Mar 2020 04:54:13 +0000 (12:54 +0800)

committer Kefu Chai <kchai@redhat.com>

Mon, 15 Jun 2020 12:49:09 +0000 (20:49 +0800)
author Yingxin Cheng <yingxin.cheng@intel.com>
Thu, 26 Mar 2020 04:54:13 +0000 (12:54 +0800)
committer Kefu Chai <kchai@redhat.com>
Mon, 15 Jun 2020 12:49:09 +0000 (20:49 +0800)
diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc

index f6d0d7b3b7a48ab789ce0da275c62da1a443b2b6..e91452c01db8a07aae5b612a7c4eaf8f32734bd5 100644 (file)
--- a/src/crimson/osd/heartbeat.cc
+++ b/src/crimson/osd/heartbeat.cc
@@ -315,12 +315,22 @@ void Heartbeat::heartbeat_check()
      }
  
      if (peer_info.is_unhealthy(now)) {
-      logger().error(" heartbeat_check: no reply from osd.{} "
-                    "since back {} front {} (oldest deadline {})",
-                    osd, peer_info.last_rx_back, peer_info.last_rx_front,
-                    peer_info.ping_history.begin()->second.deadline);
-      failure_queue[osd] = std::min(peer_info.last_rx_back,
-                                   peer_info.last_rx_front);
+      auto oldest_deadline = peer_info.ping_history.begin()->second.deadline;
+      auto failed_since = std::min(peer_info.last_rx_back,
+                                   peer_info.last_rx_front);
+      if (clock::is_zero(failed_since)) {
+        logger().error("heartbeat_check: no reply from osd.{} "
+                       "ever on either front or back, first ping sent {} "
+                       "(oldest deadline {})",
+                       osd, peer_info.first_tx, oldest_deadline);
+        failed_since = peer_info.first_tx;
+      } else {
+        logger().error("heartbeat_check: no reply from osd.{} "
+                       "since back {} front {} (oldest deadline {})",
+                       osd, peer_info.last_rx_back, peer_info.last_rx_front,
+                       oldest_deadline);
+      }
+      failure_queue.emplace(osd, failed_since);
      }
    }
    if (!failure_queue.empty()) {
diff --git a/src/crimson/osd/heartbeat.h b/src/crimson/osd/heartbeat.h

index f5e2ad596cf56ba07d8440f7eafe48d9145ea30b..2ab4970f59e7ad91529430d57b76005d1cfbf712 100644 (file)
--- a/src/crimson/osd/heartbeat.h
+++ b/src/crimson/osd/heartbeat.h
@@ -111,6 +111,7 @@ private:
    peers_map_t peers;
    // osds which are considered failed
    // osd_id => when was the last time that both front and back pings were acked
+  //           or sent.
    //           use for calculating how long the OSD has been unresponsive
    using failure_queue_t = std::map<osd_id_t, clock::time_point>;
    seastar::future<> send_failures(failure_queue_t&& failure_queue);
author	Yingxin Cheng <yingxin.cheng@intel.com>
	Thu, 26 Mar 2020 04:54:13 +0000 (12:54 +0800)
committer	Kefu Chai <kchai@redhat.com>
	Mon, 15 Jun 2020 12:49:09 +0000 (20:49 +0800)
src/crimson/osd/heartbeat.cc		patch | blob | history
src/crimson/osd/heartbeat.h		patch | blob | history