]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd mon: Track heartbeat ping times and report health warning
authorDavid Zafman <dzafman@redhat.com>
Wed, 26 Jun 2019 02:59:06 +0000 (02:59 +0000)
committerDavid Zafman <dzafman@redhat.com>
Mon, 4 Nov 2019 22:21:21 +0000 (14:21 -0800)
Fixes: http://tracker.ceph.com/issues/40640
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 66d44e7f911a57100d650ad7df9445f88ec70140)

Conflicts:
src/common/options.cc (trivial)
src/mon/PGMap.cc (trivial)
src/osd/OSD.cc (trivial)
src/osd/OSD.h (trivial)
src/osd/osd_types.cc (trivial)

src/mon/PGMap.cc manually get rid of extra argument to checks->add
src/osd/OSD.cc rename ping_stamp to stamp for backport

doc/rados/configuration/mon-osd-interaction.rst
src/common/options.cc
src/mon/PGMap.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/osd_types.cc
src/osd/osd_types.h

index e2c24771481489ca733149b25279952267d02a5f..42be922fec0b51e561feab6a144e25cc7c2c4c9a 100644 (file)
@@ -24,10 +24,8 @@ monitoring the Ceph Storage Cluster.
 OSDs Check Heartbeats
 =====================
 
-Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons every 6
-seconds. You can change the heartbeat interval by adding an ``osd heartbeat
-interval`` setting under the ``[osd]`` section of your Ceph configuration file,
-or by setting the value at runtime. If a neighboring Ceph OSD Daemon doesn't
+Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons at random
+intervals less than every 6 seconds.  If a neighboring Ceph OSD Daemon doesn't
 show a heartbeat within a 20 second grace period, the Ceph OSD Daemon may
 consider the neighboring Ceph OSD Daemon ``down`` and report it back to a Ceph
 Monitor, which will update the Ceph Cluster Map. You may change this grace
index f8e7a6b8e0c2bdde4353717ef34f7ed569edfb7b..ec52e4942d6bbebe63e832b00ec26495e7da4a48 100644 (file)
@@ -1732,6 +1732,11 @@ std::vector<Option> get_global_options() {
     .add_service("mgr")
     .set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
 
+    Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
+    .set_default(1000000)
+    .add_service("mgr")
+    .set_description("Issue a health warning if heartbeat ping longer than specified microseconds"),
+
     Option("mon_max_snap_prune_per_epoch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(100)
     .add_service("mon")
@@ -3336,9 +3341,9 @@ std::vector<Option> get_global_options() {
     .set_default(15_min)
     .set_description(""),
 
-    Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_DEV)
     .set_default(6)
-    .set_min_max(1, 86400)
+    .set_min_max(1, 60)
     .set_description("Interval (in seconds) between peer pings"),
 
     Option("osd_heartbeat_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
index 0e50c5c097c33e3b7071ca813e778e4a80e6a31b..b214f3f8bef0ce4296d5efc710ec17e47c05fe2c 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "osd/osd_types.h"
 #include "osd/OSDMap.h"
+#include <boost/range/adaptor/reversed.hpp>
 
 #define dout_context g_ceph_context
 
@@ -2710,6 +2711,102 @@ void PGMap::get_health_checks(
     checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str());
   }
 
+  // SLOW_PING_TIME
+  auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
+  if (warn_slow_ping_time > 0) {
+
+    struct mon_ping_item_t {
+      uint32_t pingtime;
+      int from;
+      int to;
+      bool improving;
+
+      bool operator<(const mon_ping_item_t& rhs) const {
+        if (pingtime < rhs.pingtime)
+          return true;
+        if (pingtime > rhs.pingtime)
+          return false;
+        if (from < rhs.from)
+          return true;
+        if (from > rhs.from)
+          return false;
+        return to < rhs.to;
+      }
+    };
+
+    list<string> detail_back;
+    list<string> detail_front;
+    set<mon_ping_item_t> back_sorted, front_sorted;
+    for (auto i : osd_stat) {
+      for (auto j : i.second.hb_pingtime) {
+
+       mon_ping_item_t back;
+       back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+       back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]);
+       back.from = i.first;
+       back.to = j.first;
+       if (back.pingtime > warn_slow_ping_time) {
+         back.improving = (j.second.back_pingtime[0] < j.second.back_pingtime[1]
+                           && j.second.back_pingtime[1] < j.second.back_pingtime[2]);
+         back_sorted.emplace(back);
+       }
+
+       mon_ping_item_t front;
+       front.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+       front.pingtime = std::max(front.pingtime, j.second.front_pingtime[2]);
+       front.from = i.first;
+       front.to = j.first;
+       if (front.pingtime > warn_slow_ping_time) {
+         front.improving = (j.second.front_pingtime[0] < j.second.front_pingtime[1]
+                            && j.second.front_pingtime[1] < j.second.back_pingtime[2]);
+         front_sorted.emplace(front);
+       }
+      }
+    }
+    int max_detail = 10;
+    for (auto &sback : boost::adaptors::reverse(back_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+       ss << "Truncated long network list.";
+        detail_back.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      ss << "Slow heartbeat ping on back interface from osd." << sback.from
+         << " to osd." << sback.to << " " << sback.pingtime << " usec"
+        << (sback.improving ? " possibly improving" : "");
+      detail_back.push_back(ss.str());
+    }
+    max_detail = 10;
+    for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
+      ostringstream ss;
+      if (max_detail == 0) {
+       ss << "Truncated long network list.";
+        detail_front.push_back(ss.str());
+        break;
+      }
+      max_detail--;
+      ss << "Slow heartbeat ping on front interface from osd." << sfront.from
+         << " to osd." << sfront.to << " " << sfront.pingtime << " usec"
+        << (sfront.improving ? " possibly improving" : "");
+      detail_front.push_back(ss.str());
+    }
+    if (detail_back.size() != 0) {
+      ostringstream ss;
+      ss << "Long heartbeat ping times on back interface seen, longest is "
+        << back_sorted.rbegin()->pingtime  << " usec";
+      auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str());
+      d.detail.swap(detail_back);
+    }
+    if (detail_front.size() != 0) {
+      ostringstream ss;
+      ss << "Long heartbeat ping times on front interface seen, longest is "
+        << front_sorted.rbegin()->pingtime  << " usec";
+      auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str());
+      d.detail.swap(detail_front);
+    }
+  }
+
   // SMALLER_PGP_NUM
   // MANY_OBJECTS_PER_PG
   if (!pg_stat.empty()) {
index 5255cfc843f68b6fe02e1b792b1bdef84e3e8e54..6f4b5729d5622cf2bb85d88aaaee16cb3c5216ed 100644 (file)
@@ -4810,6 +4810,7 @@ void OSD::_add_heartbeat_peer(int p)
     hi = &heartbeat_peers[p];
     hi->peer = p;
     RefCountedPtr s{new HeartbeatSession{p}, false};
+    hi->hb_interval_start = ceph_clock_now();
     hi->con_back = cons.first.get();
     hi->con_back->set_priv(s);
     if (cons.second) {
@@ -5111,6 +5112,71 @@ void OSD::handle_osd_ping(MOSDPing *m)
                      << " , erase pending ping(sent at " << m->stamp << ")"
                      << " and older pending ping(s)"
                      << dendl;
+
+#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
+           ++i->second.hb_average_count;
+           uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
+           i->second.hb_total_back += back_pingtime;
+           uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
+           i->second.hb_total_front += front_pingtime;
+
+           ceph_assert(i->second.hb_interval_start != utime_t());
+           if (i->second.hb_interval_start == utime_t())
+             i->second.hb_interval_start = now;
+           if (now - i->second.hb_interval_start >=  utime_t(hb_avg, 0)) {
+              uint32_t back_pingtime = i->second.hb_total_back / i->second.hb_average_count;
+              uint32_t front_pingtime = i->second.hb_total_front / i->second.hb_average_count;
+
+             // Reset for new interval
+             i->second.hb_average_count = 0;
+             i->second.hb_interval_start = now;
+             i->second.hb_total_back = 0;
+             i->second.hb_total_front = 0;
+
+             // Record per osd interace ping times
+             // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
+             if (i->second.hb_back_pingtime.size() < hb_vector_size) {
+               ceph_assert(i->second.hb_front_pingtime.size() == i->second.hb_back_pingtime.size());
+               i->second.hb_back_pingtime.push_back(back_pingtime);
+               i->second.hb_front_pingtime.push_back(front_pingtime);
+             } else {
+               i->second.hb_back_pingtime[i->second.hb_index & (hb_vector_size - 1)] = back_pingtime;
+               i->second.hb_front_pingtime[i->second.hb_index & (hb_vector_size - 1)] = front_pingtime;
+             }
+             ++i->second.hb_index;
+
+             {
+               std::lock_guard l(service.stat_lock);
+               uint32_t total = 0;
+               uint32_t count = 0;
+               uint32_t which = 0;
+               uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
+               for (int32_t k = size - 1 ; k >= 0; --k) {
+                 ++count;
+                 total += i->second.hb_back_pingtime[(i->second.hb_index + k) % size];
+                 if (count == 1 || count == 5 || count == 15) {
+                   service.osd_stat.hb_pingtime[from].back_pingtime[which++] = total / count;
+                   if (count == 15)
+                     break;
+                 }
+               }
+
+                if (i->second.con_front != NULL) {
+                 total = 0;
+                 count = 0;
+                 which = 0;
+                 for (int32_t k = size - 1 ; k >= 0; --k) {
+                   ++count;
+                   total += i->second.hb_front_pingtime[(i->second.hb_index + k) % size];
+                   if (count == 1 || count == 5 || count == 15) {
+                     service.osd_stat.hb_pingtime[from].front_pingtime[which++] = total / count;
+                     if (count == 15)
+                       break;
+                   }
+                 }
+               }
+             }
+           }
             i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
           }
 
@@ -5287,6 +5353,8 @@ void OSD::heartbeat()
       i->second.first_tx = now;
     i->second.ping_history[now] = make_pair(deadline,
       HeartbeatInfo::HEARTBEAT_MAX_CONN);
+    if (i->second.hb_interval_start == utime_t())
+      i->second.hb_interval_start = now;
     dout(30) << "heartbeat sending ping to osd." << peer << dendl;
     i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
                                          service.get_osdmap_epoch(),
index dab173c033d9fb5769aa63b3441ddc285b73318b..768b3a3e5f000f0769c8efb843b65c8a0d46e403 100644 (file)
@@ -1563,6 +1563,16 @@ private:
     /// send time -> deadline -> remaining replies
     map<utime_t, pair<utime_t, int>> ping_history;
 
+    utime_t hb_interval_start;
+    uint32_t hb_average_count = 0;
+    uint32_t hb_index = 0;
+
+    uint32_t hb_total_back = 0;
+    vector<uint32_t> hb_back_pingtime;
+
+    uint32_t hb_total_front = 0;
+    vector<uint32_t> hb_front_pingtime;
+
     bool is_stale(utime_t stale) {
       if (ping_history.empty()) {
         return false;
@@ -1609,7 +1619,12 @@ private:
   Messenger *hb_back_server_messenger;
   utime_t last_heartbeat_resample;   ///< last time we chose random peers in waiting-for-healthy state
   double daily_loadavg;
-  
+
+  const uint32_t hb_avg = 60;
+  // Track ping repsonse times using vector as a circular buffer
+  // MUST BE A POWER OF 2
+  const uint32_t hb_vector_size = 16;
+
   void _add_heartbeat_peer(int p);
   void _remove_heartbeat_peer(int p);
   bool heartbeat_reset(Connection *con);
index 913299df55a759cce2fd9ce152f9f2795d745a1a..3a75c9c0cfb0fd48dc678ec19ccbae0e010554c2 100644 (file)
@@ -393,11 +393,37 @@ void osd_stat_t::dump(Formatter *f) const
   f->open_array_section("alerts");
   ::dump(f, os_alerts);
   f->close_section();
+  // Call to update ping times?
+  f->open_array_section("network_ping_times");
+  for (auto &i : hb_pingtime) {
+    f->open_object_section("osd");
+    f->dump_int("osd", i.first);
+
+    f->open_array_section("interfaces");
+    f->open_object_section("interface");
+    f->dump_string("interface", "back");
+    f->dump_int("1min", i.second.back_pingtime[0]);
+    f->dump_int("5min", i.second.back_pingtime[1]);
+    f->dump_int("15min", i.second.back_pingtime[2]);
+    f->close_section();
+
+    if (i.second.front_pingtime[0] != 0) {
+      f->open_object_section("interface");
+      f->dump_string("interface", "front");
+      f->dump_int("1min", i.second.front_pingtime[0]);
+      f->dump_int("5min", i.second.front_pingtime[1]);
+      f->dump_int("15min", i.second.front_pingtime[2]);
+      f->close_section();
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
 }
 
 void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
 {
-  ENCODE_START(12, 2, bl);
+  ENCODE_START(13, 2, bl);
 
   //////// for compatibility ////////
   int64_t kb = statfs.kb();
@@ -431,6 +457,18 @@ void osd_stat_t::encode(bufferlist &bl, uint64_t features) const
   encode(num_shards_repaired, bl);
   encode(num_osds, bl);
   encode(num_per_pool_osds, bl);
+
+  // hb_pingtime map
+  encode((int)hb_pingtime.size(), bl);
+  for (auto i : hb_pingtime) {
+    encode(i.first, bl); // osd
+    encode(i.second.back_pingtime[0], bl);
+    encode(i.second.back_pingtime[1], bl);
+    encode(i.second.back_pingtime[2], bl);
+    encode(i.second.front_pingtime[0], bl);
+    encode(i.second.front_pingtime[1], bl);
+    encode(i.second.front_pingtime[2], bl);
+  }
   ENCODE_FINISH(bl);
 }
 
@@ -438,7 +476,7 @@ void osd_stat_t::decode(bufferlist::const_iterator &bl)
 {
   int64_t kb, kb_used,kb_avail;
   int64_t kb_used_data, kb_used_omap, kb_used_meta;
-  DECODE_START_LEGACY_COMPAT_LEN(12, 2, 2, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(13, 2, 2, bl);
   decode(kb, bl);
   decode(kb_used, bl);
   decode(kb_avail, bl);
@@ -505,6 +543,23 @@ void osd_stat_t::decode(bufferlist::const_iterator &bl)
     num_osds = 0;
     num_per_pool_osds = 0;
   }
+  hb_pingtime.clear();
+  if (struct_v >= 13) {
+    int count;
+    decode(count, bl);
+    for (int i = 0 ; i < count ; i++) {
+      int osd;
+      decode(osd, bl);
+      struct Interfaces ifs;
+      decode(ifs.back_pingtime[0],bl);
+      decode(ifs.back_pingtime[1], bl);
+      decode(ifs.back_pingtime[2], bl);
+      decode(ifs.front_pingtime[0], bl);
+      decode(ifs.front_pingtime[1], bl);
+      decode(ifs.front_pingtime[2], bl);
+      hb_pingtime[osd] = ifs;
+    }
+  }
   DECODE_FINISH(bl);
 }
 
@@ -524,6 +579,10 @@ void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
     "some alert", "some alert details");
   o.back()->os_alerts[1].emplace(
     "some alert2", "some alert2 details");
+  struct Interfaces gen_interfaces = { { 1000, 995, 990 }, {980, 985, 990} };
+  o.back()->hb_pingtime[20] = gen_interfaces;
+  gen_interfaces = { { 700, 800, 900 } };
+  o.back()->hb_pingtime[30] = gen_interfaces;
 }
 
 // -- pg_t --
index 7e591d956b51e0db7877a271efc3f622465a66c8..9a598c7a9d82c134e7680d59e7db06eb1e2d09d1 100644 (file)
@@ -2356,6 +2356,12 @@ struct osd_stat_t {
   uint32_t num_osds = 0;
   uint32_t num_per_pool_osds = 0;
 
+  struct Interfaces {
+    uint32_t back_pingtime[3];
+    uint32_t front_pingtime[3];
+  };
+  map<int, Interfaces> hb_pingtime;  ///< map of osd id to Interfaces
+
   osd_stat_t() : snap_trim_queue_len(0), num_snap_trimming(0),
        num_shards_repaired(0)  {}