From a62ee09fc24e5496fdd0b3b94043969c45bcef94 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 18 Jul 2019 21:28:16 -0700 Subject: [PATCH] osd mon: Add last_update to osd_stat_t heartbeat info Ignore old heartbeat info which hasn't updated Signed-off-by: David Zafman (cherry picked from commit ea20d3522aaf644cef989c565e11dd781e420e18) --- src/mgr/ClusterState.cc | 12 ++++++++++++ src/mon/PGMap.cc | 7 ++++++- src/osd/OSD.cc | 13 +++++++++++++ src/osd/osd_types.cc | 11 +++++++++-- src/osd/osd_types.h | 1 + 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc index a84614d0140c1..140c1744d6102 100644 --- a/src/mgr/ClusterState.cc +++ b/src/mgr/ClusterState.cc @@ -16,6 +16,7 @@ #include "messages/MPGStats.h" #include "mgr/ClusterState.h" +#include #include #define dout_context g_ceph_context @@ -242,6 +243,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& std::array min; std::array max; uint32_t last; + uint32_t last_update; bool operator<(const mgr_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) @@ -280,6 +282,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; + item.last_update = j.second.last_update; sorted.emplace(item); } @@ -301,6 +304,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; item.back = false; + item.last_update = j.second.last_update; sorted.emplace(item); } } @@ -314,6 +318,14 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& ceph_assert(!value || sitem.pingtime >= value); f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = g_ceph_context->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", sitem.from); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index a74fb71f75ca3..343977ae6c76f 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2713,9 +2713,10 @@ void PGMap::get_health_checks( // SLOW_PING_TIME auto warn_slow_ping_time = cct->_conf.get_val("mon_warn_on_slow_ping_time"); + auto grace = cct->_conf.get_val("osd_heartbeat_grace"); if (warn_slow_ping_time == 0) { double ratio = cct->_conf.get_val("mon_warn_on_slow_ping_ratio"); - warn_slow_ping_time = cct->_conf.get_val("osd_heartbeat_grace"); + warn_slow_ping_time = grace; warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio } if (warn_slow_ping_time > 0) { @@ -2745,6 +2746,10 @@ void PGMap::get_health_checks( for (auto i : osd_stat) { for (auto j : i.second.hb_pingtime) { + // Maybe source info is old + if (now.sec() - j.second.last_update > grace * 60) + continue; + mon_ping_item_t back; back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a868527ab17f8..fa2de7925f219 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -2733,6 +2734,7 @@ will start to track new ops received afterwards."; std::array min; std::array max; uint32_t last; + uint32_t last_update; bool operator<(const osd_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) @@ -2768,6 +2770,7 @@ will start to track new ops received afterwards."; item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; + item.last_update = j.second.last_update; sorted.emplace(item); } if (j.second.front_last == 0) @@ -2786,6 +2789,7 @@ will start to track new ops received afterwards."; item.max[1] = j.second.front_max[1]; item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; + item.last_update = j.second.last_update; item.back = false; sorted.emplace(item); } @@ -2799,6 +2803,14 @@ will start to track new ops received afterwards."; for (auto &sitem : boost::adaptors::reverse(sorted)) { ceph_assert(sitem.pingtime >= value); f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = cct->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", whoami); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); @@ -5287,6 +5299,7 @@ void OSD::handle_osd_ping(MOSDPing *m) { std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].last_update = now.sec(); service.osd_stat.hb_pingtime[from].back_last = back_pingtime; uint32_t total = 0; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2a05689547ff2..bd731401fbb0c 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -397,6 +397,11 @@ void osd_stat_t::dump(Formatter *f) const for (auto &i : hb_pingtime) { f->open_object_section("entry"); f->dump_int("osd", i.first); + const time_t lu(i.second.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + f->dump_string("last update", lustr); f->open_array_section("interfaces"); f->open_object_section("interface"); f->dump_string("interface", "back"); @@ -486,6 +491,7 @@ void osd_stat_t::encode(bufferlist &bl, uint64_t features) const encode((int)hb_pingtime.size(), bl); for (auto i : hb_pingtime) { encode(i.first, bl); // osd + encode(i.second.last_update, bl); encode(i.second.back_pingtime[0], bl); encode(i.second.back_pingtime[1], bl); encode(i.second.back_pingtime[2], bl); @@ -589,6 +595,7 @@ void osd_stat_t::decode(bufferlist::const_iterator &bl) int osd; decode(osd, bl); struct Interfaces ifs; + decode(ifs.last_update, bl); decode(ifs.back_pingtime[0],bl); decode(ifs.back_pingtime[1], bl); decode(ifs.back_pingtime[2], bl); @@ -632,11 +639,11 @@ void osd_stat_t::generate_test_instances(std::list& o) o.back()->os_alerts[1].emplace( "some alert2", "some alert2 details"); struct Interfaces gen_interfaces = { - { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, + 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 }; o.back()->hb_pingtime[20] = gen_interfaces; gen_interfaces = { - { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; + 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; o.back()->hb_pingtime[30] = gen_interfaces; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 554577922cb58..3e0e4d1f27693 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2357,6 +2357,7 @@ struct osd_stat_t { uint32_t num_per_pool_osds = 0; struct Interfaces { + uint32_t last_update; // in seconds uint32_t back_pingtime[3]; uint32_t back_min[3]; uint32_t back_max[3]; -- 2.39.5