git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon: add admin command to dump historic/slow ops
author lvshanchun <lvshanchun@gmail.com>
Tue, 13 Mar 2018 06:37:31 +0000 (14:37 +0800)
committer lvshanchun <lvshanchun@gmail.com>
Wed, 28 Mar 2018 03:08:34 +0000 (11:08 +0800)
Signed-off-by: lvshanchun <lvshanchun@gmail.com>
src/common/options.cc
src/mon/Monitor.cc

diff --git a/src/common/options.cc b/src/common/options.cc
index 25f70032848ddcc788284dd54b377effe4974089..f54c78212bc06f3e41414682086cb16da13396af 100644 (file)
@@ -1121,6 +1121,34 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description(""),
 
+    Option("mon_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("enable/disable MON op tracking"),
+
+    Option("mon_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("time in seconds to consider a MON OP blocked after no updates"),
+
+    Option("mon_op_log_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("max number of slow ops to display"),
+
+    Option("mon_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description("max number of completed ops to track"),
+
+    Option("mon_op_history_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description("expiration time in seconds of historical MON OPS"),
+
+    Option("mon_op_history_slow_op_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description("max number of slow historical MON OPS to keep"),
+
+    Option("mon_op_history_slow_op_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description("duration time in seconds of an op to be considered as a historical slow op"),
+
     Option("mon_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_flag(Option::FLAG_NO_MON_UPDATE)
     .set_default("/var/lib/ceph/mon/$cluster-$id")
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 74866fa4a2f99d6508cbbde0d95d4e50fe5e1274..0b3993773d3f5458c0e127ad5d31b9218bec1a08 100644 (file)
@@ -172,13 +172,20 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
   paxos_service(PAXOS_NUM),
   admin_hook(NULL),
   routed_request_tid(0),
-  op_tracker(cct, true, 1)
+  op_tracker(cct, g_conf->get_val<bool>("mon_enable_op_tracker"), 1)
 {
   clog = log_client.create_channel(CLOG_CHANNEL_CLUSTER);
   audit_clog = log_client.create_channel(CLOG_CHANNEL_AUDIT);
 
   update_log_clients();
 
+  op_tracker.set_complaint_and_threshold(g_conf->get_val<double>("mon_op_complaint_time"),                              
+                                         g_conf->get_val<int64_t>("mon_op_log_threshold"));
+  op_tracker.set_history_size_and_duration(g_conf->get_val<uint64_t>("mon_op_history_size"),
+                                           g_conf->get_val<uint64_t>("mon_op_history_duration"));
+  op_tracker.set_history_slow_op_size_and_threshold(g_conf->get_val<uint64_t>("mon_op_history_slow_op_size"),
+                                                    g_conf->get_val<double>("mon_op_history_slow_op_threshold"));
+
   paxos = new Paxos(this, "paxos");
 
   paxos_service[PAXOS_MDSMAP] = new MDSMonitor(this, paxos, "mdsmap");
@@ -267,6 +274,14 @@ void Monitor::do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
     << "from='admin socket' entity='admin socket' "
     << "cmd='" << command << "' args=" << args << ": dispatch";
 
+  set<string> filters;
+  vector<string> filter_str;
+  if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {                                                                                                                           
+      copy(filter_str.begin(), filter_str.end(),
+          inserter(filters, filters.end()));
+  }
+
+
   if (command == "mon_status") {
     get_mon_status(f.get(), ss);
     if (f)
@@ -310,6 +325,27 @@ void Monitor::do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
       f->flush(ss);
     }
 
+  } else if (command == "dump_historic_ops") {
+    if (op_tracker.dump_historic_ops(f.get())) {
+      f->flush(ss);
+    } else {
+      ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+        please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+    }
+  } else if (command == "dump_historic_ops_by_duration" ) {
+    if (op_tracker.dump_historic_ops(f.get(), true)) {
+      f->flush(ss);
+    } else {
+      ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+        please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+    }
+  } else if (command == "dump_historic_slow_ops") {
+    if (op_tracker.dump_historic_slow_ops(f.get(), filters)) {
+      f->flush(ss);
+    } else {
+      ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
+        please enable \"mon_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+    }
   } else {
     assert(0 == "bad AdminSocket command binding");
   }
@@ -755,6 +791,18 @@ int Monitor::preinit()
                                      admin_hook,
                                      "list existing sessions");
   assert(r == 0);
+  r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
+                                     admin_hook,
+                                    "show recent ops");
+  assert(r == 0);
+  r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
+                                     admin_hook,
+                                    "show recent ops, sorted by duration");
+  assert(r == 0);
+  r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
+                                     admin_hook,
+                                    "show recent slow ops");
+  assert(r == 0);
 
   lock.Lock();
 
@@ -881,6 +929,9 @@ void Monitor::shutdown()
     admin_socket->unregister_command("quorum exit");
     admin_socket->unregister_command("ops");
     admin_socket->unregister_command("sessions");
+    admin_socket->unregister_command("dump_historic_ops");
+    admin_socket->unregister_command("dump_historic_ops_by_duration");
+    admin_socket->unregister_command("dump_historic_slow_ops");
     delete admin_hook;
     admin_hook = NULL;
   }