git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd mgr mon: Add mon_warn_on_slow_ping_ratio config as 5% of osd_heartbeat_grace
author David Zafman <dzafman@redhat.com>
Thu, 11 Jul 2019 21:24:12 +0000 (21:24 +0000)
committer David Zafman <dzafman@redhat.com>
Mon, 26 Aug 2019 15:25:34 +0000 (15:25 +0000)
Compute network ping threshold based on ratio (5% of 20 seconds is 1 second)
Include the threshold value that was used in the dump_osd_network output for osd and mgr
Keep mon_warn_on_slow_ping_time (default 0) to optionally override the ratio

Signed-off-by: David Zafman <dzafman@redhat.com>
src/common/options.cc
src/mgr/ClusterState.cc
src/mon/PGMap.cc
src/osd/OSD.cc

index b655117f676abe61774fbcb8be407b73aeec38cb..15d88ced4a4bbadcb3ebea66031e0bd34c68d892 100644 (file)
@@ -1713,9 +1713,17 @@ std::vector<Option> get_global_options() {
     .set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
 
     Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
-    .set_default(1000000)
+    .set_default(0)
+    .add_service("mgr")
+    .set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in microseconds")
+    .add_see_also("mon_warn_on_slow_ping_ratio"),
+
+    Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(.05)
     .add_service("mgr")
-    .set_description("Issue a health warning if heartbeat ping longer than specified microseconds"),
+    .set_description("Issue a health warning if heartbeat ping longer than percentage of osd_heartbeat_grace")
+    .add_see_also("osd_heartbeat_grace")
+    .add_see_also("mon_warn_on_slow_ping_time"),
 
     Option("mon_max_snap_prune_per_epoch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(100)
index 5f7b224d1ed89cebe153c499f86337c370053d95..dc8ed0904a3dbfb55fcf29affd9868c8cc49cdf1 100644 (file)
@@ -223,6 +223,11 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
     // Default to health warning level if nothing specified
     if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) {
       value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+      if (value == 0) {
+        double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+       value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+       value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+      }
     }
     if (value < 0)
       value = 0;
@@ -284,7 +289,9 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
     }
 
     // Network ping times (1min 5min 15min)
-    f->open_array_section("network_ping_times");
+    f->open_object_section("network_ping_times");
+    f->dump_int("threshold", value);
+    f->open_array_section("entries");
     for (auto &sitem : boost::adaptors::reverse(sorted)) {
       ceph_assert(!value || sitem.pingtime >= value);
 
@@ -297,6 +304,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
       f->dump_unsigned("15min", sitem.times[2]);
       f->close_section(); // entry
     }
+    f->close_section(); // entries
     f->close_section(); // network_ping_times
   } else {
     ceph_abort_msg("broken asok registration");
index 8f35a9ebbcbf5e5b989383c0f5f58da2c8fb9838..634fa8258e61bde86a13b7801d020590717dcffd 100644 (file)
@@ -2763,6 +2763,11 @@ void PGMap::get_health_checks(
 
   // SLOW_PING_TIME
   auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
+  if (warn_slow_ping_time == 0) {
+    double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
+    warn_slow_ping_time = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+    warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+  }
   if (warn_slow_ping_time > 0) {
 
     struct mon_ping_item_t {
index 6363493013235421c92a2cd9d76dcb5322ad6a51..051e2a08402f512bfd6ae8ef083bb1ce5d8c247d 100644 (file)
@@ -2553,6 +2553,11 @@ will start to track new ops received afterwards.";
     int64_t value = 0;
     if (!(cmd_getval(cct, cmdmap, "value", value))) {
       value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+      if (value == 0) {
+        double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+        value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+        value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+      }
     }
     if (value < 0) value = 0;
 
@@ -2607,7 +2612,9 @@ will start to track new ops received afterwards.";
     delete pingtimes;
     //
     // Network ping times (1min 5min 15min)
-    f->open_array_section("network_ping_times");
+    f->open_object_section("network_ping_times");
+    f->dump_int("threshold", value);
+    f->open_array_section("entries");
     for (auto &sitem : boost::adaptors::reverse(sorted)) {
       ceph_assert(sitem.pingtime >= value);
       f->open_object_section("entry");
@@ -2619,6 +2626,7 @@ will start to track new ops received afterwards.";
       f->dump_int("15min", sitem.times[2]);
       f->close_section();  // entry
     }
+    f->close_section(); // entries
     f->close_section(); // network_ping_times
   } else {
     ceph_abort_msg("broken asok registration");