git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd mgr mon: Add mon_warn_on_slow_ping_ratio config as 5% of osd_heartbeat_grace
author: David Zafman <dzafman@redhat.com>
Thu, 11 Jul 2019 21:24:12 +0000 (21:24 +0000)
committer: David Zafman <dzafman@redhat.com>
Fri, 18 Oct 2019 17:48:52 +0000 (10:48 -0700)
Compute network ping threshold based on ratio (5% of 20 seconds is 1 second)
Make the threshold value used part of dump_osd_network for osd and mgr
Keep mon_warn_on_slow_ping_time (default 0) to optionally override the ratio

Signed-off-by: David Zafman <dzafman@redhat.com>
(cherry picked from commit 0d1bbd34e96e2da2027861229b376805d5ea8aa6)

src/common/options.cc
src/mgr/ClusterState.cc
src/mon/PGMap.cc
src/osd/OSD.cc

index 8f32a7414990b7826b221b71e0e6c1168e3ab064..ab9d44e56c0204d10503e87c58728235d6e7d301 100644 (file)
@@ -1469,9 +1469,17 @@ std::vector<Option> get_global_options() {
     .set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
 
     Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
-    .set_default(1000000)
+    .set_default(0)
+    .add_service("mgr")
+    .set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in microseconds")
+    .add_see_also("mon_warn_on_slow_ping_ratio"),
+
+    Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(.05)
     .add_service("mgr")
-    .set_description("Issue a health warning if heartbeat ping longer than specified microseconds"),
+    .set_description("Issue a health warning if heartbeat ping longer than percentage of osd_heartbeat_grace")
+    .add_see_also("osd_heartbeat_grace")
+    .add_see_also("mon_warn_on_slow_ping_time"),
 
     Option("mon_max_snap_prune_per_epoch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(100)
index eb92b280be22fb74ac282dd7c728f25f8463db9a..6d5fbe83ff80fdfa79cfa5b90550922cacd8467c 100644 (file)
@@ -213,6 +213,11 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
     // Default to health warning level if nothing specified
     if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) {
       value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+      if (value == 0) {
+        double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+       value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+       value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+      }
     }
     if (value < 0)
       value = 0;
@@ -274,7 +279,9 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
     }
 
     // Network ping times (1min 5min 15min)
-    f->open_array_section("network_ping_times");
+    f->open_object_section("network_ping_times");
+    f->dump_int("threshold", value);
+    f->open_array_section("entries");
     for (auto &sitem : boost::adaptors::reverse(sorted)) {
       ceph_assert(!value || sitem.pingtime >= value);
 
@@ -287,6 +294,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t&
       f->dump_unsigned("15min", sitem.times[2]);
       f->close_section(); // entry
     }
+    f->close_section(); // entries
     f->close_section(); // network_ping_times
   } else {
     ceph_abort_msg("broken asok registration");
index 2a34d6b941eec66bd6fce497d13138810a1a85f1..1d1cfce09be6f630fb28b6b2d07bd8bf81e219e6 100644 (file)
@@ -2582,6 +2582,11 @@ void PGMap::get_health_checks(
 
   // SLOW_PING_TIME
   auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
+  if (warn_slow_ping_time == 0) {
+    double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
+    warn_slow_ping_time = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+    warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+  }
   if (warn_slow_ping_time > 0) {
 
     struct mon_ping_item_t {
index 6b70b739bdf721af75164934faec3b17537b1532..fbb730b558b6d760e89c0b4ab2f08957b09437ed 100644 (file)
@@ -2286,6 +2286,11 @@ will start to track new ops received afterwards.";
     int64_t value = 0;
     if (!(cmd_getval(cct, cmdmap, "value", value))) {
       value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+      if (value == 0) {
+        double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
+        value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
+        value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
+      }
     }
     if (value < 0) value = 0;
 
@@ -2340,7 +2345,9 @@ will start to track new ops received afterwards.";
     delete pingtimes;
     //
     // Network ping times (1min 5min 15min)
-    f->open_array_section("network_ping_times");
+    f->open_object_section("network_ping_times");
+    f->dump_int("threshold", value);
+    f->open_array_section("entries");
     for (auto &sitem : boost::adaptors::reverse(sorted)) {
       ceph_assert(sitem.pingtime >= value);
       f->open_object_section("entry");
@@ -2352,6 +2359,7 @@ will start to track new ops received afterwards.";
       f->dump_int("15min", sitem.times[2]);
       f->close_section();  // entry
     }
+    f->close_section(); // entries
     f->close_section(); // network_ping_times
   } else {
     assert(0 == "broken asok registration");