git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr: Add "dump_osd_network" mgr admin request to get a sorted report
author David Zafman <dzafman@redhat.com>
Tue, 9 Jul 2019 17:22:12 +0000 (17:22 +0000)
committer David Zafman <dzafman@redhat.com>
Fri, 18 Oct 2019 17:48:52 +0000 (10:48 -0700)
Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 5d3c1856415f8b66e31361a0a7b9c75edc46e49e)

Conflicts:
src/mgr/ClusterState.cc (trivial)
src/mgr/ClusterState.h (trivial)

src/mgr/ClusterState.cc
src/mgr/ClusterState.h
src/mgr/DaemonServer.cc
src/mgr/Mgr.cc
src/mon/PGMap.cc

index d9eed6d4dfbabae179e634690478259d428eb172..eb92b280be22fb74ac282dd7c728f25f8463db9a 100644 (file)
@@ -16,6 +16,7 @@
 #include "messages/MPGStats.h"
 
 #include "mgr/ClusterState.h"
+#include <boost/range/adaptor/reversed.hpp>
 
 #define dout_context g_ceph_context
 #define dout_subsys ceph_subsys_mgr
@@ -29,7 +30,8 @@ ClusterState::ClusterState(
   : monc(monc_),
     objecter(objecter_),
     lock("ClusterState"),
-    mgr_map(mgrmap)
+    mgr_map(mgrmap),
+    asok_hook(NULL)
 {}
 
 void ClusterState::set_objecter(Objecter *objecter_)
@@ -163,3 +165,133 @@ void ClusterState::notify_osdmap(const OSDMap &osd_map)
   // that a cut-down set of functionality remains in PGMonitor
   // while the full-blown PGMap lives only here.
 }
+
+class ClusterSocketHook : public AdminSocketHook {
+  ClusterState *cluster_state;
+public:
+  explicit ClusterSocketHook(ClusterState *o) : cluster_state(o) {}
+  bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
+           std::string_view format, bufferlist& out) override {
+    stringstream ss;
+    bool r = true;
+    try {
+      r = cluster_state->asok_command(admin_command, cmdmap, format, ss);
+    } catch (const bad_cmd_get& e) {
+      ss << e.what();
+      r = true;
+    }
+    out.append(ss);
+    return r;
+  }
+};
+
+// Creates the admin-socket hook and registers the "dump_osd_network"
+// command with it; a non-zero registration result trips ceph_assert.
+void ClusterState::final_init()
+{
+  asok_hook = new ClusterSocketHook(this);
+  AdminSocket *asok = g_ceph_context->get_admin_socket();
+  const int rc = asok->register_command(
+    "dump_osd_network",
+    "dump_osd_network name=value,type=CephInt,req=false",
+    asok_hook,
+    "Dump osd heartbeat network ping times");
+  ceph_assert(rc == 0);
+}
+
+// Unregisters the admin-socket commands tied to asok_hook, then frees the
+// hook and clears the pointer.
+void ClusterState::shutdown()
+{
+  AdminSocket *asok = g_ceph_context->get_admin_socket();
+  asok->unregister_commands(asok_hook);
+  delete asok_hook;
+  asok_hook = NULL;
+}
+
+bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
+                      std::string_view format, ostream& ss)
+{
+  std::lock_guard l(lock);
+  Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
+  if (admin_command == "dump_osd_network") {
+    int64_t value = 0;
+    // Default to health warning level if nothing specified
+    if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) {
+      value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+    }
+    if (value < 0)
+      value = 0;
+
+    struct mgr_ping_time_t {
+      uint32_t pingtime;
+      int from;
+      int to;
+      bool back;
+      std::array<uint32_t,3> times;
+
+      bool operator<(const mgr_ping_time_t& rhs) const {
+        if (pingtime < rhs.pingtime)
+          return true;
+        if (pingtime > rhs.pingtime)
+          return false;
+        if (from < rhs.from)
+          return true;
+        if (from > rhs.from)
+          return false;
+        if (to < rhs.to)
+          return true;
+        if (to > rhs.to)
+          return false;
+        return back;
+      }
+    };
+
+    set<mgr_ping_time_t> sorted;
+    for (auto i : pg_map.osd_stat) {
+      for (auto j : i.second.hb_pingtime) {
+       mgr_ping_time_t item;
+       item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
+       item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
+       if (!value || item.pingtime >= value) {
+         item.from = i.first;
+         item.to = j.first;
+         item.times[0] = j.second.back_pingtime[0];
+         item.times[1] = j.second.back_pingtime[1];
+         item.times[2] = j.second.back_pingtime[2];
+         item.back = true;
+         sorted.emplace(item);
+       }
+
+       item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
+       item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
+       if (item.pingtime == 0)
+         continue;
+       if (!value || item.pingtime >= value) {
+         item.from = i.first;
+         item.to = j.first;
+         item.times[0] = j.second.front_pingtime[0];
+         item.times[1] = j.second.front_pingtime[1];
+         item.times[2] = j.second.front_pingtime[2];
+         item.back = false;
+         sorted.emplace(item);
+       }
+      }
+    }
+
+    // Network ping times (1min 5min 15min)
+    f->open_array_section("network_ping_times");
+    for (auto &sitem : boost::adaptors::reverse(sorted)) {
+      ceph_assert(!value || sitem.pingtime >= value);
+
+      f->open_object_section("entry");
+      f->dump_int("from osd", sitem.from);
+      f->dump_int("to osd", sitem.to);
+      f->dump_string("interface", (sitem.back ? "back" : "front"));
+      f->dump_unsigned("1min", sitem.times[0]);
+      f->dump_unsigned("5min", sitem.times[1]);
+      f->dump_unsigned("15min", sitem.times[2]);
+      f->close_section(); // entry
+    }
+    f->close_section(); // network_ping_times
+  } else {
+    ceph_abort_msg("broken asok registration");
+  }
+  f->flush(ss);
+  delete f;
+  return true;
+}
index a270861e80b48713cc1b8643d1ee53cee529622e..3d179b3ad710d84576723c3d32e8cd535e5606e3 100644 (file)
@@ -50,6 +50,8 @@ protected:
   bufferlist health_json;
   bufferlist mon_status_json;
 
+  class ClusterSocketHook *asok_hook;
+
 public:
 
   void load_digest(MMgrDigest *m);
@@ -126,7 +128,10 @@ public:
     assert(objecter != nullptr);
     return objecter->with_osdmap(std::forward<Args>(args)...);
   }
-
+  void final_init();
+  void shutdown();
+  bool asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
+                      std::string_view format, ostream& ss);
 };
 
 #endif
index 1e6cd8249609c11f654e26b0e0adca209fac1390..3ed69f9dd8dc420ed76325e847d3baa780bb9ea3 100644 (file)
@@ -340,6 +340,7 @@ void DaemonServer::shutdown()
   dout(10) << "begin" << dendl;
   msgr->shutdown();
   msgr->wait();
+  cluster_state.shutdown();
   dout(10) << "done" << dendl;
 }
 
index 05d1b004c9f6f2c68ba19242c321746ef5899ff7..4258231f99db018b764006675229fadb65569a0f 100644 (file)
@@ -271,6 +271,8 @@ void Mgr::init()
   py_module_registry->active_start(daemon_state, cluster_state,
       kv_store, *monc, clog, *objecter, *client, finisher);
 
+  cluster_state.final_init();
+
   dout(4) << "Complete." << dendl;
   initializing = false;
   initialized = true;
index c2213bda319372f9972877ffda9389e30fe11849..2a34d6b941eec66bd6fce497d13138810a1a85f1 100644 (file)
@@ -2636,7 +2636,7 @@ void PGMap::get_health_checks(
     for (auto &sback : boost::adaptors::reverse(back_sorted)) {
       ostringstream ss;
       if (max_detail == 0) {
-       ss << "Truncated long network list.";
+       ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
         detail_back.push_back(ss.str());
         break;
       }
@@ -2650,7 +2650,7 @@ void PGMap::get_health_checks(
     for (auto &sfront : boost::adaptors::reverse(front_sorted)) {
       ostringstream ss;
       if (max_detail == 0) {
-       ss << "Truncated long network list.";
+       ss << "Truncated long network list.  Use ceph daemon mgr.# dump_osd_network for more information";
         detail_front.push_back(ss.str());
         break;
       }