From: David Zafman Date: Fri, 19 Jul 2019 04:28:16 +0000 (-0700) Subject: osd mon: Add last_update to osd_stat_t heartbeat info X-Git-Tag: v13.2.7~26^2~13 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=63fc174489c50cf97dd4bace85da7848546a4ed4;p=ceph.git osd mon: Add last_update to osd_stat_t heartbeat info Ignore old heartbeat info which hasn't updated Signed-off-by: David Zafman (cherry picked from commit ea20d3522aaf644cef989c565e11dd781e420e18) Conflicts: src/osd/osd_types.h (osd_stat_t location in file changed) --- diff --git a/src/mgr/ClusterState.cc b/src/mgr/ClusterState.cc index 79c71e9fb71..1a06881f8e9 100644 --- a/src/mgr/ClusterState.cc +++ b/src/mgr/ClusterState.cc @@ -16,6 +16,7 @@ #include "messages/MPGStats.h" #include "mgr/ClusterState.h" +#include #include #define dout_context g_ceph_context @@ -231,6 +232,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& std::array min; std::array max; uint32_t last; + uint32_t last_update; bool operator<(const mgr_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) @@ -269,6 +271,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; + item.last_update = j.second.last_update; sorted.emplace(item); } @@ -290,6 +293,7 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; item.back = false; + item.last_update = j.second.last_update; sorted.emplace(item); } } @@ -303,6 +307,14 @@ bool ClusterState::asok_command(std::string_view admin_command, const cmdmap_t& ceph_assert(!value || sitem.pingtime >= value); f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = g_ceph_context->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", sitem.from); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 1d1cfce09be..de522a0bb81 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2582,9 +2582,10 @@ void PGMap::get_health_checks( // SLOW_PING_TIME auto warn_slow_ping_time = cct->_conf.get_val("mon_warn_on_slow_ping_time"); + auto grace = cct->_conf.get_val("osd_heartbeat_grace"); if (warn_slow_ping_time == 0) { double ratio = cct->_conf.get_val("mon_warn_on_slow_ping_ratio"); - warn_slow_ping_time = cct->_conf.get_val("osd_heartbeat_grace"); + warn_slow_ping_time = grace; warn_slow_ping_time *= 1000000 * ratio; // Seconds of grace to microseconds at ratio } if (warn_slow_ping_time > 0) { @@ -2614,6 +2615,10 @@ void PGMap::get_health_checks( for (auto i : osd_stat) { for (auto j : i.second.hb_pingtime) { + // Maybe source info is old + if (now.sec() - j.second.last_update > grace * 60) + continue; + mon_ping_item_t back; back.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]); back.pingtime = std::max(back.pingtime, j.second.back_pingtime[2]); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index f8b255e7c37..2af8ea73a42 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -2302,6 +2303,7 @@ will start to track new ops received afterwards."; std::array min; std::array max; uint32_t last; + uint32_t last_update; bool operator<(const osd_ping_time_t& rhs) const { if (pingtime < rhs.pingtime) @@ -2337,6 +2339,7 @@ will start to track new ops received afterwards."; item.max[2] = j.second.back_max[2]; item.last = j.second.back_last; item.back = true; + item.last_update = j.second.last_update; sorted.emplace(item); } if (j.second.front_last == 0) @@ -2355,6 +2358,7 @@ will start to track new ops received afterwards."; item.max[1] = j.second.front_max[1]; item.max[2] = j.second.front_max[2]; item.last = j.second.front_last; + item.last_update = j.second.last_update; item.back = false; sorted.emplace(item); } @@ -2368,6 +2372,14 @@ will start to track new ops received afterwards."; for (auto &sitem : boost::adaptors::reverse(sorted)) { ceph_assert(sitem.pingtime >= value); f->open_object_section("entry"); + + const time_t lu(sitem.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + auto stale = cct->_conf.get_val("osd_heartbeat_stale"); + f->dump_string("last update", lustr); + f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale); f->dump_int("from osd", whoami); f->dump_int("to osd", sitem.to); f->dump_string("interface", (sitem.back ? "back" : "front")); @@ -4827,6 +4839,7 @@ void OSD::handle_osd_ping(MOSDPing *m) { std::lock_guard l(service.stat_lock); + service.osd_stat.hb_pingtime[from].last_update = now.sec(); service.osd_stat.hb_pingtime[from].back_last = back_pingtime; uint32_t total = 0; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 092b5f759df..06ae7f530ec 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -363,6 +363,11 @@ void osd_stat_t::dump(Formatter *f) const for (auto &i : hb_pingtime) { f->open_object_section("entry"); f->dump_int("osd", i.first); + const time_t lu(i.second.last_update); + char buffer[26]; + string lustr(ctime_r(&lu, buffer)); + lustr.pop_back(); // Remove trailing \n + f->dump_string("last update", lustr); f->open_array_section("interfaces"); f->open_object_section("interface"); f->dump_string("interface", "back"); @@ -433,6 +438,7 @@ void osd_stat_t::encode(bufferlist &bl, uint64_t features) const encode((int)hb_pingtime.size(), bl); for (auto i : hb_pingtime) { encode(i.first, bl); // osd + encode(i.second.last_update, bl); encode(i.second.back_pingtime[0], bl); encode(i.second.back_pingtime[1], bl); encode(i.second.back_pingtime[2], bl); @@ -496,6 +502,7 @@ void osd_stat_t::decode(bufferlist::iterator &bl) int osd; decode(osd, bl); struct Interfaces ifs; + decode(ifs.last_update, bl); decode(ifs.back_pingtime[0],bl); decode(ifs.back_pingtime[1], bl); decode(ifs.back_pingtime[2], bl); @@ -537,11 +544,11 @@ void osd_stat_t::generate_test_instances(std::list& o) o.back()->snap_trim_queue_len = 8; o.back()->num_snap_trimming = 99; struct Interfaces gen_interfaces = { - { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, + 123456789, { 1000, 900, 800 }, { 990, 890, 790 }, { 1010, 910, 810 }, 1001, { 1100, 1000, 900 }, { 1090, 990, 890 }, { 1110, 1010, 910 }, 1101 }; o.back()->hb_pingtime[20] = gen_interfaces; gen_interfaces = { - { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; + 987654321, { 100, 200, 300 }, { 90, 190, 290 }, { 110, 210, 310 }, 101 }; o.back()->hb_pingtime[30] = gen_interfaces; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 6ffe6a0dcc6..b638c49eea3 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -934,6 +934,7 @@ struct osd_stat_t { uint32_t num_pgs = 0; struct Interfaces { + uint32_t last_update; // in seconds uint32_t back_pingtime[3]; uint32_t back_min[3]; uint32_t back_max[3];