From: su_nan@inspur.com Date: Fri, 6 Sep 2019 02:20:42 +0000 (+0800) Subject: mon/OSDMonitor: trim no-longer-exist failure reporters X-Git-Tag: v12.2.13~98^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=7a272c4396bf3b2a346be7a643aad0aff97deb63;p=ceph.git mon/OSDMonitor: trim no-longer-exist failure reporters Remove the report from failure_info in OSDMonitor::check_failure() if the reporter no longer exists in the osdmap. Otherwise, we will run into an assert() failure when trying to look up the OSD in the osdmap using `osdmap.get_xinfo()`. Fixes: https://tracker.ceph.com/issues/41680 Signed-off-by: NancySu05 (cherry picked from commit 99ccc166ede5f296b86abfe9db0a3d597100edde) Conflicts: src/mon/OSDMonitor.cc - ceph_assert, g_conf() - account for C++17ism by moving iter initialization out of if statement --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c0481c5b6e457..0a8d4669bec12 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -578,7 +578,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) share_map_with_random_osd(); update_logger(); - process_failures(); // make sure our feature bits reflect the latest map @@ -2208,23 +2207,27 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) // help us localize the grace correction to a subset of the system // (say, a rack with a bad switch) that is unhappy. assert(fi.reporters.size()); - for (map::iterator p = fi.reporters.begin(); - p != fi.reporters.end(); - ++p) { + for (auto p = fi.reporters.begin(); p != fi.reporters.end();) { // get the parent bucket whose type matches with "reporter_subtree_level". // fall back to OSD if the level doesn't exist. - map reporter_loc = osdmap.crush->get_full_location(p->first); - map::iterator iter = reporter_loc.find(reporter_subtree_level); - if (iter == reporter_loc.end()) { - reporters_by_subtree.insert("osd." 
+ to_string(p->first)); + if (osdmap.exists(p->first)) { + auto reporter_loc = osdmap.crush->get_full_location(p->first); + auto iter = reporter_loc.find(reporter_subtree_level); + if (iter == reporter_loc.end()) { + reporters_by_subtree.insert("osd." + to_string(p->first)); + } else { + reporters_by_subtree.insert(iter->second); + } + if (g_conf->mon_osd_adjust_heartbeat_grace) { + const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); + utime_t elapsed = now - xi.down_stamp; + double decay = exp((double)elapsed * decay_k); + peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + } + ++p; } else { - reporters_by_subtree.insert(iter->second); - } - if (g_conf->mon_osd_adjust_heartbeat_grace) { - const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); - utime_t elapsed = now - xi.down_stamp; - double decay = exp((double)elapsed * decay_k); - peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + fi.cancel_report(p->first);; + p = fi.reporters.erase(p); } }