crimson/osd/heartbeat: report peer failure to monitors 33836/head
author Xuehan Xu <xxhdx1985126@163.com>
Sun, 8 Mar 2020 05:59:32 +0000 (13:59 +0800)
committer Xuehan Xu <xxhdx1985126@163.com>
Tue, 17 Mar 2020 09:12:30 +0000 (17:12 +0800)
Signed-off-by: Xuehan Xu <xxhdx1985126@163.com>
src/crimson/osd/heartbeat.cc
src/crimson/osd/heartbeat.h

index 0fb5eff7dcc93f41b6e8a46a8788df9a0c78ad63..dd289e777171cf8ed8c80efcfbb5c30bd81ce58a 100644 (file)
@@ -9,6 +9,7 @@
 #include "messages/MOSDFailure.h"
 
 #include "crimson/common/config_proxy.h"
+#include "crimson/common/formatter.h"
 #include "crimson/net/Connection.h"
 #include "crimson/net/Messenger.h"
 #include "crimson/osd/shard_services.h"
@@ -33,7 +34,10 @@ Heartbeat::Heartbeat(const crimson::osd::ShardServices& service,
     front_msgr{front_msgr},
     back_msgr{back_msgr},
     // do this in background
-    timer{[this] { (void)send_heartbeats(); }}
+    timer{[this] {
+      heartbeat_check();
+      (void)send_heartbeats();
+    }}
 {}
 
 seastar::future<> Heartbeat::start(entity_addrvec_t front_addrs,
@@ -294,7 +298,6 @@ seastar::future<> Heartbeat::handle_reply(crimson::net::Connection* conn,
   }
   if (peer.is_healthy(now)) {
     // cancel false reports
-    failure_queue.erase(from);
     if (auto pending = failure_pending.find(from);
         pending != failure_pending.end()) {
       return send_still_alive(from, pending->second.addrs);
@@ -309,6 +312,34 @@ seastar::future<> Heartbeat::handle_you_died()
   return seastar::now();
 }
 
+void Heartbeat::heartbeat_check()
+{
+  failure_queue_t failure_queue;
+  const auto now = clock::now();
+  for (const auto& [osd, peer_info]: peers) {
+    if (clock::is_zero(peer_info.first_tx)) {
+      continue;
+    }
+
+    if (peer_info.is_unhealthy(now)) {
+      logger().error(" heartbeat_check: no reply from osd.{} "
+                    "since back {} front {} (oldest deadline {})",
+                    osd, peer_info.last_rx_back, peer_info.last_rx_front,
+                    peer_info.ping_history.begin()->second.deadline);
+      failure_queue[osd] = std::min(peer_info.last_rx_back,
+                                   peer_info.last_rx_front);
+    }
+  }
+  if (!failure_queue.empty()) {
+    // send_failures() can run in the background: messages are sent in
+    // order, so if a later check finds a previously "failed" peer to be
+    // healthy again, its "still alive" message is guaranteed to be sent
+    // after the earlier "osd failure" report, which is safe.
+    (void)send_failures(std::move(failure_queue));
+  }
+}
+
 seastar::future<> Heartbeat::send_heartbeats()
 {
   using peers_item_t = typename peers_map_t::value_type;
@@ -353,7 +384,7 @@ seastar::future<> Heartbeat::send_heartbeats()
     });
 }
 
-seastar::future<> Heartbeat::send_failures()
+seastar::future<> Heartbeat::send_failures(failure_queue_t&& failure_queue)
 {
   using failure_item_t = typename failure_queue_t::value_type;
   return seastar::parallel_for_each(failure_queue,
@@ -374,9 +405,6 @@ seastar::future<> Heartbeat::send_failures()
       failure_pending.emplace(osd, failure_info_t{failed_since,
                                                   osdmap->get_addrs(osd)});
       return monc.send_message(failure_report);
-    }).then([this] {
-      failure_queue.clear();
-      return seastar::now();
     });
 }
 
index 036299f38990204586ed9ed2934b32990352fa98..c51e81de67b073c6438ca8d7ff5d111e7475f5a2 100644 (file)
@@ -38,9 +38,6 @@ public:
   seastar::future<> update_peers(int whoami);
   seastar::future<> remove_peer(osd_id_t peer);
 
-  seastar::future<> send_heartbeats();
-  seastar::future<> send_failures();
-
   const entity_addrvec_t& get_front_addrs() const;
   const entity_addrvec_t& get_back_addrs() const;
 
@@ -109,12 +106,14 @@ private:
   };
   using peers_map_t = std::map<osd_id_t, PeerInfo>;
   peers_map_t peers;
-
   // osds which are considered failed
   // osd_id => when was the last time that both front and back pings were acked
   //           use for calculating how long the OSD has been unresponsive
   using failure_queue_t = std::map<osd_id_t, clock::time_point>;
-  failure_queue_t failure_queue;
+  seastar::future<> send_failures(failure_queue_t&& failure_queue);
+  seastar::future<> send_heartbeats();
+  void heartbeat_check();
+
   struct failure_info_t {
     clock::time_point failed_since;
     entity_addrvec_t addrs;
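
The header keeps failure_pending (a failure_info_t per osd) so that a later recovery can cancel an already-sent report, as handle_reply() does above via send_still_alive(). A minimal standalone sketch of that pending-failure bookkeeping, again with standard-library stand-ins for the Ceph types (the erase on cancellation is an assumption of this sketch, not shown in the diff):

#include <chrono>
#include <cstdio>
#include <map>
#include <string>
#include <utility>

using osd_id_t = int;
using time_point = std::chrono::steady_clock::time_point;

// Stand-in for entity_addrvec_t (illustrative only).
using addr_vec = std::string;

// Mirrors Heartbeat::failure_info_t: what was remembered when the
// failure report was sent, needed later to cancel it.
struct failure_info {
  time_point failed_since;
  addr_vec addrs;
};

struct failure_tracker {
  std::map<osd_id_t, failure_info> failure_pending;

  // Called after an "osd failure" report has been sent to the monitors.
  void record_report(osd_id_t osd, time_point failed_since, addr_vec addrs) {
    failure_pending.emplace(osd, failure_info{failed_since, std::move(addrs)});
  }

  // Called from the reply path once the peer looks healthy again:
  // send a "still alive" cancellation and drop the pending entry.
  void on_peer_healthy(osd_id_t osd) {
    if (auto pending = failure_pending.find(osd);
        pending != failure_pending.end()) {
      std::printf("cancel failure report for osd.%d at %s\n",
                  osd, pending->second.addrs.c_str());
      failure_pending.erase(pending);
    }
  }
};

int main() {
  failure_tracker tracker;
  tracker.record_report(3, std::chrono::steady_clock::now(), "v2:10.0.0.3:6800");
  tracker.on_peer_healthy(3);  // peer answered pings again
}

Because both the failure report and any later cancellation are sent on the same monitor connection, and heartbeat_check() only fires once per timer tick, the cancellation can never overtake the report it cancels, which is the ordering argument made in the comment above send_failures().
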