From ea20d3522aaf644cef989c565e11dd781e420e18 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 18 Jul 2019 21:28:16 -0700 Subject: [PATCH] osd mon: Add last_update to osd_stat_t heartbeat info Ignore old heartbeat info which hasn't updated Signed-off-by: David Zafman --- src/mgr/ClusterState.cc | 12 ++++++++++++ src/mon/PGMap.cc | 7 ++++++- src/osd/OSD.cc | 13 +++++++++++++ src/osd/osd_types.cc | 11 +++++++++-- src/osd/osd_types.h | 1 + 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc index af87c982084..a4881f1cb4c 100644 --- a/src/mgr/ClusterState.cc +++ b/src/mgr/ClusterState.cc @@ -16,6 +16,7 @@ #include "messages/MPGStats.h" #include "mgr/ClusterState.h" +#include #include #define dout_context g_ceph_context @@ -241,6 +242,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& std::array min; std::array max; uint32_t last; + uint32_t last_update; bool operator<(const mgr_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) @@ -279,6 +281,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; + item.last_update = j.second.last_update; sorted.emplace(item); } @@ -300,6 +303,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; item.back = false; + item.last_update = j.second.last_update; sorted.emplace(item); } } @@ -313,6 +317,14 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& ceph_assert(!value || sitem.pingtime >= value); f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = g_ceph_context->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", sitem.from); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 634fa8258e6..7b0befc064c 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2763,9 +2763,10 @@ void PGMap::get_health_checks( // SLOW_PING_TIME auto warn_slow_ping_time = cct->_conf.get_val("mon_warn_on_slow_ping_time"); + auto grace = cct->_conf.get_val("osd_heartbeat_grace"); if (warn_slow_ping_time == 0) { double ratio = cct->_conf.get_val("mon_warn_on_slow_ping_ratio"); - warn_slow_ping_time = cct->_conf.get_val("osd_heartbeat_grace"); + warn_slow_ping_time = grace; warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio } if (warn_slow_ping_time > 0) { @@ -2795,6 +2796,10 @@ void PGMap::get_health_checks( for (auto i : osd_stat) { for (auto j : i.second.hb_pingtime) { + // Maybe source info is old + if (now.sec() - j.second.last_update > grace * 60) + continue; + mon_ping_item_t back; back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 193e1c94c5a..496824f3308 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -2569,6 +2570,7 @@ will start to track new ops received afterwards."; std::array min; std::array max; uint32_t last; + uint32_t last_update; bool operator<(const osd_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) @@ -2604,6 +2606,7 @@ will start to track new ops received afterwards."; item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; + item.last_update = j.second.last_update; sorted.emplace(item); } if (j.second.front_last == 0) @@ -2622,6 +2625,7 @@ will start to track new ops received afterwards."; item.max[1] = j.second.front_max[1]; item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; + item.last_update = j.second.last_update; item.back = false; sorted.emplace(item); } @@ -2635,6 +2639,14 @@ will start to track new ops received afterwards."; for (auto &sitem : boost::adaptors::reverse(sorted)) { ceph_assert(sitem.pingtime >= value); f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = cct->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", whoami); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); @@ -4874,6 +4886,7 @@ void OSD::handle_osd_ping(MOSDPing *m) { std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].last_update = now.sec(); service.osd_stat.hb_pingtime[from].back_last = back_pingtime; uint32_t total = 0; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 21645fd854e..284f150edc0 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -432,6 +432,11 @@ void osd_stat_t::dump(Formatter *f) const for (auto &i : hb_pingtime) { f->open_object_section("entry"); f->dump_int("osd", i.first); + const time_t lu(i.second.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + f->dump_string("last update", lustr); f->open_array_section("interfaces"); f->open_object_section("interface"); f->dump_string("interface", "back"); @@ -522,6 +527,7 @@ void osd_stat_t::encode(ceph::buffer::list &bl, uint64_t features) const encode((int)hb_pingtime.size(), bl); for (auto i : hb_pingtime) { encode(i.first, bl); // osd + encode(i.second.last_update, bl); encode(i.second.back_pingtime[0], bl); encode(i.second.back_pingtime[1], bl); encode(i.second.back_pingtime[2], bl); @@ -630,6 +636,7 @@ void osd_stat_t::decode(ceph::buffer::list::const_iterator &bl) int osd; decode(osd, bl); struct Interfaces ifs; + decode(ifs.last_update, bl); decode(ifs.back_pingtime[0],bl); decode(ifs.back_pingtime[1], bl); decode(ifs.back_pingtime[2], bl); @@ -673,11 +680,11 @@ void osd_stat_t::generate_test_instances(std::list& o) o.back()->os_alerts[1].emplace( "some alert2", "some alert2 details"); struct Interfaces gen_interfaces = { - { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, + 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 }; o.back()->hb_pingtime[20] = gen_interfaces; gen_interfaces = { - { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; + 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; o.back()->hb_pingtime[30] = gen_interfaces; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 56d0dbead58..4ae099edd42 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2373,6 +2373,7 @@ struct osd_stat_t { uint32_t num_per_pool_omap_osds = 0; struct Interfaces { + uint32_t last_update; // in seconds uint32_t back_pingtime[3]; uint32_t back_min[3]; uint32_t back_max[3]; -- 2.39.5