git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/osd: heartbeat cleanup, show gray health states
author Yingxin Cheng <yingxin.cheng@intel.com>
Thu, 26 Mar 2020 08:47:00 +0000 (16:47 +0800)
committer Kefu Chai <kchai@redhat.com>
Mon, 15 Jun 2020 14:06:41 +0000 (22:06 +0800)
Be very explicit about gray states between healthy and unhealthy.

Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
src/crimson/osd/heartbeat.cc
src/crimson/osd/heartbeat.h

index 4ea6185415eee416153a63ac8699265a6b0fce91..57cb3ed6510b21b34d13bab38da129a7b34769bf 100644 (file)
@@ -372,55 +372,57 @@ void Heartbeat::Peer::disconnect()
 Heartbeat::Peer::~Peer()
 { disconnect(); }
 
-bool Heartbeat::Peer::is_unhealthy(clock::time_point now) const
+bool Heartbeat::Peer::pinged() const
 {
-  if (ping_history.empty()) {
-    // we haven't sent a ping yet or we have got all replies,
-    // in either way we are safe and healthy for now
+  if (clock::is_zero(first_tx)) {
+    // we can never receive a pong without first sending a ping message.
+    assert(clock::is_zero(last_rx_front) &&
+          clock::is_zero(last_rx_back));
     return false;
   } else {
-    auto oldest_ping = ping_history.begin();
-    return now > oldest_ping->second.deadline;
+    return true;
   }
 }
 
-bool Heartbeat::Peer::is_healthy(clock::time_point now) const
+Heartbeat::Peer::health_state
+Heartbeat::Peer::do_health_screen(clock::time_point now) const
 {
-  if (clock::is_zero(last_rx_front)) {
-    return false;
-  }
-  if (clock::is_zero(last_rx_back)) {
-    return false;
+  if (!pinged()) {
+    // we are neither healthy nor unhealthy because we haven't sent anything yet
+    return health_state::UNKNOWN;
+  } else if (!ping_history.empty() && ping_history.begin()->second.deadline < now) {
+    return health_state::UNHEALTHY;
+  } else if (!clock::is_zero(last_rx_front) &&
+             !clock::is_zero(last_rx_back)) {
+    // only declare to be healthy once we have received the first
+    // replies from both front/back connections
+    return health_state::HEALTHY;
+  } else {
+    return health_state::UNKNOWN;
   }
-  // only declare to be healthy until we have received the first
-  // replies from both front/back connections
-  return !is_unhealthy(now);
 }
 
 Heartbeat::clock::time_point
 Heartbeat::Peer::failed_since(clock::time_point now) const
 {
-  if (clock::is_zero(first_tx)) {
-    return clock::zero();
-  }
-  if (!is_unhealthy(now)) {
-    return clock::zero();
-  }
-
-  auto oldest_deadline = ping_history.begin()->second.deadline;
-  auto failed_since = std::min(last_rx_back, last_rx_front);
-  if (clock::is_zero(failed_since)) {
-    logger().error("failed_since: no reply from osd.{} "
-                   "ever on either front or back, first ping sent {} "
-                   "(oldest deadline {})",
-                   peer, first_tx, oldest_deadline);
-    failed_since = first_tx;
+  if (do_health_screen(now) == health_state::UNHEALTHY) {
+    auto oldest_deadline = ping_history.begin()->second.deadline;
+    auto failed_since = std::min(last_rx_back, last_rx_front);
+    if (clock::is_zero(failed_since)) {
+      logger().error("failed_since: no reply from osd.{} "
+                     "ever on either front or back, first ping sent {} "
+                     "(oldest deadline {})",
+                     peer, first_tx, oldest_deadline);
+      failed_since = first_tx;
+    } else {
+      logger().error("failed_since: no reply from osd.{} "
+                     "since back {} front {} (oldest deadline {})",
+                     peer, last_rx_back, last_rx_front, oldest_deadline);
+    }
+    return failed_since;
   } else {
-    logger().error("failed_since: no reply from osd.{} "
-                   "since back {} front {} (oldest deadline {})",
-                   peer, last_rx_back, last_rx_front, oldest_deadline);
+    return clock::zero();
   }
-  return failed_since;
 }
 
 void Heartbeat::Peer::send_heartbeat(
@@ -428,7 +430,7 @@ void Heartbeat::Peer::send_heartbeat(
     ceph::signedspan mnow,
     std::vector<seastar::future<>>& futures)
 {
-  if (clock::is_zero(first_tx)) {
+  if (!pinged()) {
     first_tx = now;
   }
   last_tx = now;
@@ -475,7 +477,7 @@ seastar::future<> Heartbeat::Peer::handle_reply(
   if (unacked == 0) {
     ping_history.erase(ping_history.begin(), ++ping);
   }
-  if (is_healthy(now)) {
+  if (do_health_screen(now) == health_state::HEALTHY) {
     // cancel false reports
     if (auto pending = heartbeat.failure_pending.find(peer);
         pending != heartbeat.failure_pending.end()) {
index 63f77a2b9de081df8fbc0d000f8bd25ceb01610f..83676372c6213482cf2ff59d4b53dd3d61a7f6cc 100644 (file)
@@ -130,9 +130,13 @@ class Heartbeat::Peer {
   void handle_reset(crimson::net::ConnectionRef);
 
  private:
-  bool is_unhealthy(clock::time_point now) const;
-  bool is_healthy(clock::time_point now) const;
-
+  bool pinged() const;
+  enum class health_state {
+    UNKNOWN,
+    UNHEALTHY,
+    HEALTHY,
+  };
+  health_state do_health_screen(clock::time_point now) const;
   void connect();
   void disconnect();