git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Add "dump_osd_network" osd admin request to get a sorted report
author David Zafman <dzafman@redhat.com>
Wed, 10 Jul 2019 18:15:44 +0000 (18:15 +0000)
committer David Zafman <dzafman@redhat.com>
Mon, 4 Nov 2019 22:21:21 +0000 (14:21 -0800)
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 025b10a5329127734367a6899543f51cd8580d43)

src/osd/OSD.cc
src/osd/OSD.h

index 6f4b5729d5622cf2bb85d88aaaee16cb3c5216ed..32b92faac18086bab7e4f104bc9edc8ece248a08 100644 (file)
@@ -24,6 +24,7 @@
 #include <sys/stat.h>
 #include <signal.h>
 #include <boost/scoped_ptr.hpp>
+#include <boost/range/adaptor/reversed.hpp>
 
 #ifdef HAVE_SYS_PARAM_H
 #include <sys/param.h>
@@ -2712,6 +2713,77 @@ will start to track new ops received afterwards.";
     if (is_active()) {
       send_beacon(ceph::coarse_mono_clock::now());
     }
+  } else if (admin_command == "dump_osd_network") {
+    int64_t value = 0;
+    if (!(cmd_getval(cct, cmdmap, "value", value))) {
+      value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+    }
+    if (value < 0) value = 0;
+
+    struct osd_ping_time_t {
+      uint32_t pingtime;
+      int to;
+      bool back;
+      std::array<uint32_t,3> times;
+
+      bool operator<(const osd_ping_time_t& rhs) const {
+       if (pingtime < rhs.pingtime)
+          return true;
+       if (pingtime > rhs.pingtime)
+         return false;
+        if (to < rhs.to)
+         return true;
+        if (to > rhs.to)
+         return false;
+       return back;
+      }
+    };
+
+    set<osd_ping_time_t> sorted;
+    // Get pingtimes under lock and not on the stack
+    map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
+    service.get_hb_pingtime(pingtimes);
+    for (auto j : *pingtimes) {
+      osd_ping_time_t item;
+      item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+      item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
+      if (item.pingtime >= value) {
+       item.to = j.first;
+       item.times[0] = j.second.back_pingtime[0];
+       item.times[1] = j.second.back_pingtime[1];
+       item.times[2] = j.second.back_pingtime[2];
+       item.back = true;
+       sorted.emplace(item);
+      }
+      if (j.second.front_pingtime[0] == 0)
+       continue;
+      item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+      item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
+      if (item.pingtime >= value) {
+       item.to = j.first;
+       item.times[0] = j.second.front_pingtime[0];
+       item.times[1] = j.second.front_pingtime[1];
+       item.times[2] = j.second.front_pingtime[2];
+       item.back = false;
+       sorted.emplace(item);
+      }
+    }
+    delete pingtimes;
+    //
+    // Network ping times (1min 5min 15min)
+    f->open_array_section("network_ping_times");
+    for (auto &sitem : boost::adaptors::reverse(sorted)) {
+      ceph_assert(sitem.pingtime >= value);
+      f->open_object_section("entry");
+      f->dump_int("from osd", whoami);
+      f->dump_int("to osd", sitem.to);
+      f->dump_string("interface", (sitem.back ? "back" : "front"));
+      f->dump_int("1min", sitem.times[0]);
+      f->dump_int("5min", sitem.times[1]);
+      f->dump_int("15min", sitem.times[2]);
+      f->close_section();  // entry
+    }
+    f->close_section(); // network_ping_times
   } else {
     ceph_abort_msg("broken asok registration");
   }
@@ -3390,6 +3462,10 @@ void OSD::final_init()
                                      asok_hook,
                                      "send OSD beacon to mon immediately");
 
+  r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
+                                        "Dump osd heartbeat network ping times");
+  ceph_assert(r == 0);
+
   test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
   // Note: pools are CephString instead of CephPoolname because
   // these commands traditionally support both pool names and numbers
index 768b3a3e5f000f0769c8efb843b65c8a0d46e403..00805d0d46f5b0fe01c321a2f47f7882b82110cb 100644 (file)
@@ -920,6 +920,12 @@ public:
     std::lock_guard l(stat_lock);
     return osd_stat.seq;
   }
+  // Copies the current heartbeat ping-time table into *pp while stat_lock is
+  // held, so the caller gets a snapshot it can read without the lock.
+  void get_hb_pingtime(map<int, osd_stat_t::Interfaces> *pp)
+  {
+    std::lock_guard locker(stat_lock);
+    *pp = osd_stat.hb_pingtime;
+  }
 
   // -- OSD Full Status --
 private: