]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd mon: Track heartbeat ping times and report health warning
authorDavid Zafman <dzafman@redhat.com>
Wed, 26 Jun 2019 02:59:06 +0000 (02:59 +0000)
committerDavid Zafman <dzafman@redhat.com>
Fri, 18 Oct 2019 17:48:51 +0000 (10:48 -0700)
Fixes: http://tracker.ceph.com/issues/40640
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 66d44e7f911a57100d650ad7df9445f88ec70140)

Conflicts:
src/common/options.cc (trivial)
src/mon/PGMap.cc (trivial)
src/osd/OSD.cc (trivial)
src/osd/OSD.h (trivial)
src/osd/osd_types.cc (encode version difference)
src/osd/osd_types.h (osd_stat_t location in file changed)

src/mon/PGMap.cc manually get rid of extra argument to checks->add
src/osd/OSD.cc rename ping_stamp to stamp for backport

doc/rados/configuration/mon-osd-interaction.rst
src/common/options.cc
src/mon/PGMap.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/osd_types.cc
src/osd/osd_types.h

index e2c24771481489ca733149b25279952267d02a5f..42be922fec0b51e561feab6a144e25cc7c2c4c9a 100644 (file)
@@ -24,10 +24,8 @@ monitoring the Ceph Storage Cluster.
 OSDs Check Heartbeats
 =====================
 
-Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons every 6
-seconds. You can change the heartbeat interval by adding an ``osd heartbeat
-interval`` setting under the ``[osd]`` section of your Ceph configuration file,
-or by setting the value at runtime. If a neighboring Ceph OSD Daemon doesn't
+Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons at random
+intervals less than every 6 seconds.  If a neighboring Ceph OSD Daemon doesn't
 show a heartbeat within a 20 second grace period, the Ceph OSD Daemon may
 consider the neighboring Ceph OSD Daemon ``down`` and report it back to a Ceph
 Monitor, which will update the Ceph Cluster Map. You may change this grace
index 70af216ba13b9649a80ee227ce093d5f2949432a..8f32a7414990b7826b221b71e0e6c1168e3ab064 100644 (file)
@@ -1468,6 +1468,11 @@ std::vector<Option> get_global_options() {
     .add_service("mgr")
     .set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
 
+    Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
+    .set_default(1000000)
+    .add_service("mgr")
+    .set_description("Issue a health warning if heartbeat ping longer than specified microseconds"),
+
     Option("mon_max_snap_prune_per_epoch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(100)
     .set_description("Max number of pruned snaps we will process in a single OSDMap epoch"),
@@ -2919,9 +2924,9 @@ std::vector<Option> get_global_options() {
     .set_default(entity_addr_t())
     .set_description(""),
 
-    Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(6)
-    .set_min_max(1, 86400)
+    .set_min_max(1, 60)
     .set_description("Interval (in seconds) between peer pings"),
 
     Option("osd_heartbeat_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
index a0a4778f4236a62785f623f984b186b0b9e04751..c2213bda319372f9972877ffda9389e30fe11849 100644 (file)
@@ -13,6 +13,7 @@
 
 #include "osd/osd_types.h"
 #include "osd/OSDMap.h"
+#include <boost/range/adaptor/reversed.hpp>
 
 #define dout_context g_ceph_context
 
@@ -2579,6 +2580,102 @@ void PGMap::get_health_checks(
     checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str());
   }
 
+  // SLOW_PING_TIME
+  auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
+  if (warn_slow_ping_time > 0) {
+
+    struct mon_ping_item_t {
+      uint32_t pingtime;
+      int from;
+      int to;
+      bool improving;
+
+      bool operator<(const mon_ping_item_t& rhs) const {
+        if (pingtime < rhs.pingtime)
+          return true;
+        if (pingtime > rhs.pingtime)
+          return false;
+        if (from < rhs.from)
+          return true;
+        if (from > rhs.from)
+          return false;
+        return to < rhs.to;
+      }
+    };
+
+    list<string> detail_back;
+    list<string> detail_front;
+    set<mon_ping_item_t> back_sorted, front_sorted;
+    for (auto i : osd_stat) {
+      for (auto j : i.second.hb_pingtime) {
+
+       mon_ping_item_t back;
+       back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+       back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
+       back.from = i.first;
+       back.to = j.first;
+       if (back.pingtime > warn_slow_ping_time) {
+         back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
+                           && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
+         back_sorted.emplace(back);
+       }
+
+       mon_ping_item_t front;
+       front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+       front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
+       front.from = i.first;
+       front.to = j.first;
+       if (front.pingtime > warn_slow_ping_time) {
+         front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
+                            && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
+         front_sorted.emplace(front);
+       }
+      }
+    }
+    int max_detail = 10;
+    for (auto &sback : boost::adaptors::reverse(back_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+       ss << "Truncated long network list.";
+        detail_back.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      ss << "Slow heartbeat ping on back interface from osd." << sback.from
+         << " to osd." << sback.to << " " << sback.pingtime << " usec"
+        << (sback.improving ? " possibly improving" : "");
+      detail_back.push_back(ss.str());
+    }
+    max_detail = 10;
+    for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+       ss << "Truncated long network list.";
+        detail_front.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      ss << "Slow heartbeat ping on front interface from osd." << sfront.from
+         << " to osd." << sfront.to << " " << sfront.pingtime << " usec"
+        << (sfront.improving ? " possibly improving" : "");
+      detail_front.push_back(ss.str());
+    }
+    if (detail_back.size() != 0) {
+      ostringstream ss;
+      ss << "Long heartbeat ping times on back interface seen, longest is "
+        << back_sorted.rbegin()->pingtime  << " usec";
+      auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str());
+      d.detail.swap(detail_back);
+    }
+    if (detail_front.size() != 0) {
+      ostringstream ss;
+      ss << "Long heartbeat ping times on front interface seen, longest is "
+        << front_sorted.rbegin()->pingtime  << " usec";
+      auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str());
+      d.detail.swap(detail_front);
+    }
+  }
+
   // SMALLER_PGP_NUM
   // MANY_OBJECTS_PER_PG
   if (!pg_stat.empty()) {
index 7dd8981d2a5d85e7679bd07af8a7abc00ba22d63..db8f487f18ec50351ee663267457832cdaed138d 100644 (file)
@@ -4366,6 +4366,7 @@ void OSD::_add_heartbeat_peer(int p)
     hi = &heartbeat_peers[p];
     hi->peer = p;
     RefCountedPtr s{new HeartbeatSession{p}, false};
+    hi->hb_interval_start = ceph_clock_now();
     hi->con_back = cons.first.get();
     hi->con_back->set_priv(s);
     if (cons.second) {
@@ -4651,6 +4652,71 @@ void OSD::handle_osd_ping(MOSDPing *m)
                      << " , erase pending ping(sent at " << m->stamp << ")"
                      << " and older pending ping(s)"
                      << dendl;
+
+#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
+           ++i->second.hb_average_count;
+           uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
+           i->second.hb_total_back += back_pingtime;
+           uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
+           i->second.hb_total_front += front_pingtime;
+
+           ceph_assert(i->second.hb_interval_start != utime_t());
+           if (i->second.hb_interval_start == utime_t())
+             i->second.hb_interval_start = now;
+           if (now - i->second.hb_interval_start >=  utime_t(hb_avg, 0)) {
+              uint32_t back_pingtime = i->second.hb_total_back / i->second.hb_average_count;
+              uint32_t front_pingtime = i->second.hb_total_front / i->second.hb_average_count;
+
+             // Reset for new interval
+             i->second.hb_average_count = 0;
+             i->second.hb_interval_start = now;
+             i->second.hb_total_back = 0;
+             i->second.hb_total_front = 0;
+
+             // Record per osd interace ping times
+             // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
+             if (i->second.hb_back_pingtime.size() < hb_vector_size) {
+               ceph_assert(i->second.hb_front_pingtime.size() == i->second.hb_back_pingtime.size());
+               i->second.hb_back_pingtime.push_back(back_pingtime);
+               i->second.hb_front_pingtime.push_back(front_pingtime);
+             } else {
+               i->second.hb_back_pingtime[i->second.hb_index & (hb_vector_size - 1)] = back_pingtime;
+               i->second.hb_front_pingtime[i->second.hb_index & (hb_vector_size - 1)] = front_pingtime;
+             }
+             ++i->second.hb_index;
+
+             {
+               std::lock_guard l(service.stat_lock);
+               uint32_t total = 0;
+               uint32_t count = 0;
+               uint32_t which = 0;
+               uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
+               for (int32_t k = size - 1 ; k >= 0; --k) {
+                 ++count;
+                 total += i->second.hb_back_pingtime[(i->second.hb_index + k) % size];
+                 if (count == 1 || count == 5 || count == 15) {
+                   service.osd_stat.hb_pingtime[from].back_pingtime[which++] = total / count;
+                   if (count == 15)
+                     break;
+                 }
+               }
+
+                if (i->second.con_front != NULL) {
+                 total = 0;
+                 count = 0;
+                 which = 0;
+                 for (int32_t k = size - 1 ; k >= 0; --k) {
+                   ++count;
+                   total += i->second.hb_front_pingtime[(i->second.hb_index + k) % size];
+                   if (count == 1 || count == 5 || count == 15) {
+                     service.osd_stat.hb_pingtime[from].front_pingtime[which++] = total / count;
+                     if (count == 15)
+                       break;
+                   }
+                 }
+               }
+             }
+           }
             i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
           }
 
@@ -4824,6 +4890,8 @@ void OSD::heartbeat()
       i->second.first_tx = now;
     i->second.ping_history[now] = make_pair(deadline,
       HeartbeatInfo::HEARTBEAT_MAX_CONN);
+    if (i->second.hb_interval_start == utime_t())
+      i->second.hb_interval_start = now;
     dout(30) << "heartbeat sending ping to osd." << peer << dendl;
     i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
                                          service.get_osdmap_epoch(),
index f1b605f4b11176feba645cdfecfd5e403a2aa1cc..b59fb7c0232e97dd6ccdb351c1e3a86a3b58aee1 100644 (file)
@@ -1518,6 +1518,16 @@ private:
     /// send time -> deadline -> remaining replies
     map<utime_t, pair<utime_t, int>> ping_history;
 
+    utime_t hb_interval_start;
+    uint32_t hb_average_count = 0;
+    uint32_t hb_index = 0;
+
+    uint32_t hb_total_back = 0;
+    vector<uint32_t> hb_back_pingtime;
+
+    uint32_t hb_total_front = 0;
+    vector<uint32_t> hb_front_pingtime;
+
     bool is_unhealthy(utime_t now) {
       if (ping_history.empty()) {
         /// we haven't sent a ping yet or we have got all replies,
@@ -1556,7 +1566,12 @@ private:
   Messenger *hb_back_server_messenger;
   utime_t last_heartbeat_resample;   ///< last time we chose random peers in waiting-for-healthy state
   double daily_loadavg;
-  
+
+  const uint32_t hb_avg = 60;
+  // Track ping repsonse times using vector as a circular buffer
+  // MUST BE A POWER OF 2
+  const uint32_t hb_vector_size = 16;
+
   void _add_heartbeat_peer(int p);
   void _remove_heartbeat_peer(int p);
   bool heartbeat_reset(Connection *con);
index 5669b1ff97bc92737897b2b331c3a8ce98512c22..92248a20afa8ef180543a0ff82f9907d0d5df7c7 100644 (file)
@@ -359,11 +359,37 @@ void osd_stat_t::dump(Formatter *f) const
   f->open_object_section("perf_stat");
   os_perf_stat.dump(f);
   f->close_section();
+  // Call to update ping times?
+  f->open_array_section("network_ping_times");
+  for (auto &i : hb_pingtime) {
+    f->open_object_section("osd");
+    f->dump_int("osd", i.first);
+
+    f->open_array_section("interfaces");
+    f->open_object_section("interface");
+    f->dump_string("interface", "back");
+    f->dump_int("1min", i.second.back_pingtime[0]);
+    f->dump_int("5min", i.second.back_pingtime[1]);
+    f->dump_int("15min", i.second.back_pingtime[2]);
+    f->close_section();
+
+    if (i.second.front_pingtime[0] != 0) {
+      f->open_object_section("interface");
+      f->dump_string("interface", "front");
+      f->dump_int("1min", i.second.front_pingtime[0]);
+      f->dump_int("5min", i.second.front_pingtime[1]);
+      f->dump_int("15min", i.second.front_pingtime[2]);
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
 }
 
 void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
 {
-  ENCODE_START(8, 2, bl);
+  ENCODE_START(9, 2, bl);
   encode(kb, bl);
   encode(kb_used, bl);
   encode(kb_avail, bl);
@@ -379,12 +405,23 @@ void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
   encode(kb_used_data, bl);
   encode(kb_used_omap, bl);
   encode(kb_used_meta, bl);
+  // hb_pingtime map
+  encode((int)hb_pingtime.size(), bl);
+  for (auto i : hb_pingtime) {
+    encode(i.first, bl); // osd
+    encode(i.second.back_pingtime[0], bl);
+    encode(i.second.back_pingtime[1], bl);
+    encode(i.second.back_pingtime[2], bl);
+    encode(i.second.front_pingtime[0], bl);
+    encode(i.second.front_pingtime[1], bl);
+    encode(i.second.front_pingtime[2], bl);
+  }
   ENCODE_FINISH(bl);
 }
 
 void osd_stat_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(8, 2, 2, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(9, 2, 2, bl);
   decode(kb, bl);
   decode(kb_used, bl);
   decode(kb_avail, bl);
@@ -413,6 +450,23 @@ void osd_stat_t::decode(bufferlist::iterator &bl)
     kb_used_omap = 0;
     kb_used_meta = 0;
   }
+  hb_pingtime.clear();
+  if (struct_v >= 9) {
+    int count;
+    decode(count, bl);
+    for (int i = 0 ; i < count ; i++) {
+      int osd;
+      decode(osd, bl);
+      struct Interfaces ifs;
+      decode(ifs.back_pingtime[0],bl);
+      decode(ifs.back_pingtime[1], bl);
+      decode(ifs.back_pingtime[2], bl);
+      decode(ifs.front_pingtime[0], bl);
+      decode(ifs.front_pingtime[1], bl);
+      decode(ifs.front_pingtime[2], bl);
+      hb_pingtime[osd] = ifs;
+    }
+  }
   DECODE_FINISH(bl);
 }
 
@@ -430,6 +484,10 @@ void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
   o.back()->hb_peers.push_back(7);
   o.back()->snap_trim_queue_len = 8;
   o.back()->num_snap_trimming = 99;
+  struct Interfaces gen_interfaces = { { 1000, 995, 990 }, {980, 985, 990} };
+  o.back()->hb_pingtime[20] = gen_interfaces;
+  gen_interfaces = { { 700, 800, 900 } };
+  o.back()->hb_pingtime[30] = gen_interfaces;
 }
 
 // -- pg_t --
index 82758aea580aa6cc35ada5f9f4b34e4b40b81d16..d5b287c00e58e3cb428918d9c1e05fdc489fd91b 100644 (file)
@@ -933,6 +933,12 @@ struct osd_stat_t {
 
   uint32_t num_pgs = 0;
 
+  struct Interfaces {
+    uint32_t back_pingtime[3];
+    uint32_t front_pingtime[3];
+  };
+  map<int, Interfaces> hb_pingtime;  ///< map of osd id to Interfaces
+
   void add(const osd_stat_t& o) {
     kb += o.kb;
     kb_used += o.kb_used;