From: su_nan@inspur.com Date: Fri, 6 Sep 2019 02:20:42 +0000 (+0800) Subject: mon/OSDMonitor: trim no-longer-exist failure reporters X-Git-Tag: v15.1.0~1409^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=99ccc166ede5f296b86abfe9db0a3d597100edde;p=ceph-ci.git mon/OSDMonitor: trim no-longer-exist failure reporters Remove the report from failure_info in OSDMonitor::check_failure() if the reporter no longer exists in the osdmap. Otherwise, we will hit an assert() failure when trying to look up the OSD in the osdmap using `osdmap.get_xinfo()`. Fixes: https://tracker.ceph.com/issues/41680 Signed-off-by: NancySu05 --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 9b08b0bba7b..e21753e4d86 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -876,7 +876,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) share_map_with_random_osd(); update_logger(); - process_failures(); // make sure our feature bits reflect the latest map @@ -3133,23 +3132,27 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) // help us localize the grace correction to a subset of the system // (say, a rack with a bad switch) that is unhappy. ceph_assert(fi.reporters.size()); - for (map::iterator p = fi.reporters.begin(); - p != fi.reporters.end(); - ++p) { + for (auto p = fi.reporters.begin(); p != fi.reporters.end();) { // get the parent bucket whose type matches with "reporter_subtree_level". // fall back to OSD if the level doesn't exist. - map reporter_loc = osdmap.crush->get_full_location(p->first); - map::iterator iter = reporter_loc.find(reporter_subtree_level); - if (iter == reporter_loc.end()) { - reporters_by_subtree.insert("osd." + to_string(p->first)); + if (osdmap.exists(p->first)) { + auto reporter_loc = osdmap.crush->get_full_location(p->first); + if (auto iter = reporter_loc.find(reporter_subtree_level); + iter == reporter_loc.end()) { + reporters_by_subtree.insert("osd." 
+ to_string(p->first)); + } else { + reporters_by_subtree.insert(iter->second); + } + if (g_conf()->mon_osd_adjust_heartbeat_grace) { + const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); + utime_t elapsed = now - xi.down_stamp; + double decay = exp((double)elapsed * decay_k); + peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + } + ++p; } else { - reporters_by_subtree.insert(iter->second); - } - if (g_conf()->mon_osd_adjust_heartbeat_grace) { - const osd_xinfo_t& xi = osdmap.get_xinfo(p->first); - utime_t elapsed = now - xi.down_stamp; - double decay = exp((double)elapsed * decay_k); - peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability; + fi.cancel_report(p->first);; + p = fi.reporters.erase(p); } }