option ``mon_warn_on_slow_ping_ratio`` specifies a percentage of
``osd_heartbeat_grace`` to determine the threshold. A value of zero
disables the warning. New configuration option
- ``mon_warn_on_slow_ping_time`` specified in microseconds over-rides the
+ ``mon_warn_on_slow_ping_time`` specified in milliseconds over-rides the
computed value and causes a warning
when OSD heartbeat pings take longer than the specified amount.
New admin command ``ceph daemon mgr.# dump_osd_network [threshold]`` will
:Description: Override ``mon warn on slow ping ratio`` with a specific value.
Issue a ``HEALTH_WARN`` in cluster log if any heartbeat
between OSDs exceeds ``mon warn on slow ping time``
- microseconds. The default is 0 (disabled).
+ milliseconds. The default is 0 (disabled).
:Type: Integer
:Default: ``0``
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
- test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
+ test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
- test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
+ test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 0 | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "4" || return 1
flush_pg_stats
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
- test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
+ test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
- test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
+ test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 0 | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "4" || return 1
flush_pg_stats
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
- test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
+ test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "0" || return 1
- test "$(cat $dir/json | jq '.threshold')" = "1000000" || return 1
+ test "$(cat $dir/json | jq '.threshold')" = "1000" || return 1
CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 0 | tee $dir/json
test "$(cat $dir/json | jq '.entries | length')" = "4" || return 1
test "$(cat $dir/json | jq '.entries | length')" = "12" || return 1
test "$(cat $dir/json | jq '.threshold')" = "0" || return 1
+ # Just check the threshold output matches the input
+ CEPH_ARGS='' ceph daemon $(get_asok_path mgr.x) dump_osd_network 99 | tee $dir/json
+ test "$(cat $dir/json | jq '.threshold')" = "99" || return 1
+ CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_osd_network 98 | tee $dir/json
+ test "$(cat $dir/json | jq '.threshold')" = "98" || return 1
+
rm -f $dir/json
}
.add_service("mgr")
.set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
- Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_BASIC)
+ Option("mon_warn_on_slow_ping_time", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.add_service("mgr")
- .set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in microseconds")
+ .set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in milliseconds")
.add_see_also("mon_warn_on_slow_ping_ratio"),
- Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+ Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(.05)
.add_service("mgr")
.set_description("Issue a health warning if heartbeat ping longer than percentage of osd_heartbeat_grace")
int64_t value = 0;
// Default to health warning level if nothing specified
if (!(cmd_getval(g_ceph_context, cmdmap, "value", value))) {
- value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+ // Convert milliseconds to microseconds
+ value = static_cast<int64_t>(g_ceph_context->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time")) * 1000;
if (value == 0) {
double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
}
+ } else {
+ // Convert user input to microseconds
+ value *= 1000;
}
if (value < 0)
value = 0;
// Network ping times (1min 5min 15min)
f->open_object_section("network_ping_times");
- f->dump_int("threshold", value);
+ f->dump_int("threshold", value / 1000);
f->open_array_section("entries");
for (auto &sitem : boost::adaptors::reverse(sorted)) {
ceph_assert(!value || sitem.pingtime >= value);
}
// SLOW_PING_TIME
- auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time");
+ // Convert milliseconds to microseconds
+ auto warn_slow_ping_time = cct->_conf.get_val<uint64_t>("mon_warn_on_slow_ping_time") * 1000;
auto grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
if (warn_slow_ping_time == 0) {
double ratio = cct->_conf.get_val<double>("mon_warn_on_slow_ping_ratio");
} else if (admin_command == "dump_osd_network") {
int64_t value = 0;
if (!(cmd_getval(cct, cmdmap, "value", value))) {
- value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time"));
+ // Convert milliseconds to microseconds
+ value = static_cast<int64_t>(g_conf().get_val<uint64_t>("mon_warn_on_slow_ping_time")) * 1000;
if (value == 0) {
double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
}
+ } else {
+ // Convert user input to microseconds
+ value *= 1000;
}
if (value < 0) value = 0;
//
// Network ping times (1min 5min 15min)
f->open_object_section("network_ping_times");
- f->dump_int("threshold", value);
+ f->dump_int("threshold", value / 1000);
f->open_array_section("entries");
for (auto &sitem : boost::adaptors::reverse(sorted)) {
ceph_assert(sitem.pingtime >= value);