git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr: Add "dump_osd_network" mgr admin request to get a sorted report
author David Zafman <dzafman@redhat.com>
Tue, 9 Jul 2019 17:22:12 +0000 (17:22 +0000)
committer David Zafman <dzafman@redhat.com>
Mon, 4 Nov 2019 22:21:21 +0000 (14:21 -0800)
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 5d3c1856415f8b66e31361a0a7b9c75edc46e49e)

Conflicts:
src/mgr/ClusterState.cc (trivial)

src/mgr/ClusterState.cc
src/mgr/ClusterState.h
src/mgr/DaemonServer.cc
src/mgr/Mgr.cc
src/mon/PGMap.cc

index 7e073a58bcc077b81f0781bc6c72fcfbe4e0e691..b0548e01dc3caa9780eaaf3eb4f14aa0b512db62 100644 (file)
@@ -16,6 +16,7 @@
 #include "messages/MPGStats.h"
 
 #include "mgr/ClusterState.h"
+#include <boost/range/adaptor/reversed.hpp>
 
 #define dout_context g_ceph_context
 #define dout_subsys ceph_subsys_mgr
@@ -29,7 +30,8 @@ ClusterState::ClusterState(
   : monc(monc_),
     objecter(objecter_),
     lock("ClusterState"),
-    mgr_map(mgrmap)
+    mgr_map(mgrmap),
+    asok_hook(NULL)
 {}
 
 void ClusterState::set_objecter(Objecter *objecter_)
@@ -174,3 +176,133 @@ void ClusterState::notify_osdmap(const OSDMap &osd_map)
   // that a cut-down set of functionality remains in PGMonitor
   // while the full-blown PGMap lives only here.
 }
+
+// Admin socket hook that hands every request it receives to
+// ClusterState::asok_command() and returns the generated text to the caller.
+class ClusterSocketHook : public AdminSocketHook {
+  ClusterState *owner;
+public:
+  explicit ClusterSocketHook(ClusterState *state) : owner(state) {}
+
+  // Runs the command against the owning ClusterState. Whatever was written
+  // to the reply stream (command output, or the bad_cmd_get message when
+  // argument extraction failed) is appended to `out`.
+  bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
+           std::string_view format, bufferlist& out) override {
+    stringstream reply;
+    // Stays true when asok_command() throws bad_cmd_get: the assignment in
+    // the try block never completes in that case.
+    bool handled = true;
+    try {
+      handled = owner->asok_command(admin_command, cmdmap, format, reply);
+    } catch (const bad_cmd_get& err) {
+      reply << err.what();
+    }
+    out.append(reply);
+    return handled;
+  }
+};
+
+// Creates the admin socket hook and registers the "dump_osd_network"
+// command with it. Registration failure trips ceph_assert.
+void ClusterState::final_init()
+{
+  asok_hook = new ClusterSocketHook(this);
+
+  AdminSocket *sock = g_ceph_context->get_admin_socket();
+  const int rc = sock->register_command(
+    "dump_osd_network",
+    "dump_osd_network name=value,type=CephInt,req=false",
+    asok_hook,
+    "Dump osd heartbeat network ping times");
+  ceph_assert(rc == 0);
+}
+
+// Unregisters the admin socket commands bound to asok_hook, then frees the
+// hook and clears the pointer.
+void ClusterState::shutdown()
+{
+  AdminSocket *sock = g_ceph_context->get_admin_socket();
+  // Detach from the admin socket first so the hook is not freed while it is
+  // still registered.
+  sock->unregister_commands(asok_hook);
+
+  delete asok_hook;
+  asok_hook = NULL;
+}
+
+/**
+ * Execute an admin socket command on behalf of ClusterSocketHook.
+ *
+ * The only supported command is "dump_osd_network". It walks
+ * pg_map.osd_stat and reports, for every (osd, peer) heartbeat entry, the
+ * back and front interface ping times whose largest of the three recorded
+ * averages (1min/5min/15min) is at least the threshold. Entries are emitted
+ * largest-first.
+ *
+ * The threshold is the optional "value" argument; when it is absent the
+ * "mon_warn_on_slow_ping_time" config value is used. A negative threshold is
+ * treated as 0, and a threshold of 0 disables filtering.
+ *
+ * Runs with `lock` held for the duration of the call.
+ *
+ * @param admin_command  command name as registered on the admin socket
+ * @param cmdmap         parsed command arguments
+ * @param format         requested output format (json-pretty when
+ *                       unspecified or unknown)
+ * @param ss             stream receiving the formatted report
+ * @return always true
+ * @throws bad_cmd_get   propagated from cmd_getval(); the hook catches it
+ *
+ * Any other command name aborts: it means the registration done in
+ * final_init() and this dispatcher are out of sync.
+ */
+bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
+                      std::string_view format, ostream& ss)
+{
+  std::lock_guard l(lock);
+
+  if (admin_command != "dump_osd_network") {
+    ceph_abort_msg("broken asok registration");
+  }
+
+  // Resolve the threshold before the Formatter is allocated: cmd_getval()
+  // may throw bad_cmd_get, and a Formatter created earlier would be leaked.
+  int64_t value = 0;
+  // Default to health warning level if nothing specified
+  if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) {
+    value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+  }
+  if (value < 0)
+    value = 0;
+
+  // One reported row: a single interface (back or front) of one osd->peer
+  // heartbeat pair, keyed for sorting by its worst average.
+  struct mgr_ping_time_t {
+    uint32_t pingtime = 0;
+    int from = 0;
+    int to = 0;
+    bool back = false;
+    std::array<uint32_t,3> times{};
+
+    bool operator<(const mgr_ping_time_t& rhs) const {
+      if (pingtime != rhs.pingtime)
+        return pingtime < rhs.pingtime;
+      if (from != rhs.from)
+        return from < rhs.from;
+      if (to != rhs.to)
+        return to < rhs.to;
+      // "back" sorts before "front". Must be false when both sides agree,
+      // otherwise an element compares less than itself and the ordering
+      // std::set relies on is not a strict weak ordering.
+      return back && !rhs.back;
+    }
+  };
+
+  set<mgr_ping_time_t> sorted;
+
+  // Adds one interface's samples to `sorted` if they pass the threshold.
+  auto collect = [&sorted, value](int from, int to, bool back,
+                                  const auto& samples) {
+    mgr_ping_time_t item;
+    item.pingtime = std::max(std::max(samples[0], samples[1]), samples[2]);
+    // A front interface whose averages are all zero is never reported,
+    // even when filtering is disabled.
+    if (!back && item.pingtime == 0)
+      return;
+    if (value && item.pingtime < value)
+      return;
+    item.from = from;
+    item.to = to;
+    item.back = back;
+    item.times[0] = samples[0];
+    item.times[1] = samples[1];
+    item.times[2] = samples[2];
+    sorted.emplace(item);
+  };
+
+  // Iterate by reference: copying each osd_stat entry would also copy its
+  // whole hb_pingtime map.
+  for (const auto& osd : pg_map.osd_stat) {
+    for (const auto& peer : osd.second.hb_pingtime) {
+      collect(osd.first, peer.first, true, peer.second.back_pingtime);
+      collect(osd.first, peer.first, false, peer.second.front_pingtime);
+    }
+  }
+
+  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
+
+  // Network ping times (1min 5min 15min)
+  f->open_array_section("network_ping_times");
+  for (auto &sitem : boost::adaptors::reverse(sorted)) {
+    ceph_assert(!value || sitem.pingtime >= value);
+
+    f->open_object_section("entry");
+    f->dump_int("from osd", sitem.from);
+    f->dump_int("to osd", sitem.to);
+    f->dump_string("interface", (sitem.back ? "back" : "front"));
+    f->dump_unsigned("1min", sitem.times[0]);
+    f->dump_unsigned("5min", sitem.times[1]);
+    f->dump_unsigned("15min", sitem.times[2]);
+    f->close_section(); // entry
+  }
+  f->close_section(); // network_ping_times
+
+  f->flush(ss);
+  delete f;
+  return true;
+}
index c5d46fa1abfd53e21ab8bf6b9bf84ad417416146..0298f62ec29acbf0239c2506c5d2cad5e5b31be0 100644 (file)
@@ -50,6 +50,8 @@ protected:
   bufferlist health_json;
   bufferlist mon_status_json;
 
+  class ClusterSocketHook *asok_hook;
+
 public:
 
   void load_digest(MMgrDigest *m);
@@ -137,7 +139,10 @@ public:
       pg_map,
       std::forward<Args>(args)...);
   }
-
+  void final_init();
+  void shutdown();
+  bool asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
+                      std::string_view format, ostream& ss);
 };
 
 #endif
index acf4443a65524b7bbc85c028411c051c1fbaf43c..6eb5306681f93c7dca6c19cdbf5d1b9ed0609ba3 100644 (file)
@@ -378,6 +378,7 @@ void DaemonServer::shutdown()
   dout(10) << "begin" << dendl;
   msgr->shutdown();
   msgr->wait();
+  cluster_state.shutdown();
   dout(10) << "done" << dendl;
 
   std::lock_guard l(lock);
index 7cd6c91eb655d6e94ef6893ae112bcd6c3644886..ab749b025771308e48bec435c36c772309770032 100644 (file)
@@ -302,6 +302,8 @@ void Mgr::init()
       kv_store, *monc, clog, audit_clog, *objecter, *client,
       finisher, server);
 
+  cluster_state.final_init();
+
   dout(4) << "Complete." << dendl;
   initializing = false;
   initialized = true;
index b214f3f8bef0ce4296d5efc710ec17e47c05fe2c..cb0b1b1dd2db31ceec94668860c2f872f359058d 100644 (file)
@@ -2767,7 +2767,7 @@ void PGMap::get_health_checks(
     for (auto &sback : boost::adaptors::reverse(back_sorted)) {
       ostringstream ss;
       if (max_detail == 0) {
-       ss << "Truncated long network list.";
+       ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
         detail_back.push_back(ss.str());
         break;
       }
@@ -2781,7 +2781,7 @@ void PGMap::get_health_checks(
     for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
       ostringstream ss;
       if (max_detail == 0) {
-       ss << "Truncated long network list.";
+       ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
         detail_front.push_back(ss.str());
         break;
       }