git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/OSDMonitor: trim no-longer-exist failure reporters 30905/head
author su_nan@inspur.com <su_nan@inspur.com>
Fri, 6 Sep 2019 02:20:42 +0000 (10:20 +0800)
committer Nathan Cutler <ncutler@suse.com>
Tue, 15 Oct 2019 08:41:13 +0000 (10:41 +0200)
Remove the report from failure_info in OSDMonitor::check_failure() if
the reporter no longer exists in the osdmap. Otherwise, we will hit an
assert() failure when trying to look up the OSD in the osdmap using
`osdmap.get_xinfo()`.

Fixes: https://tracker.ceph.com/issues/41680
Signed-off-by: NancySu05 <su_nan@inspur.com>
(cherry picked from commit 99ccc166ede5f296b86abfe9db0a3d597100edde)

Conflicts:
src/mon/OSDMonitor.cc
- ceph_assert, g_conf()
- account for C++17ism by moving iter initialization out of if statement

src/mon/OSDMonitor.cc

index c0481c5b6e457d27ed78957c6d2c302bc0de511f..0a8d4669bec120e6b7851bab89c761b4695500d1 100644 (file)
@@ -578,7 +578,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
 
   share_map_with_random_osd();
   update_logger();
-
   process_failures();
 
   // make sure our feature bits reflect the latest map
@@ -2208,23 +2207,27 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
   // help us localize the grace correction to a subset of the system
   // (say, a rack with a bad switch) that is unhappy.
   assert(fi.reporters.size());
-  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
-       p != fi.reporters.end();
-       ++p) {
+  for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
     // get the parent bucket whose type matches with "reporter_subtree_level".
     // fall back to OSD if the level doesn't exist.
-    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
-    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
-    if (iter == reporter_loc.end()) {
-      reporters_by_subtree.insert("osd." + to_string(p->first));
+    if (osdmap.exists(p->first)) {
+      auto reporter_loc = osdmap.crush->get_full_location(p->first);
+      auto iter = reporter_loc.find(reporter_subtree_level);
+      if (iter == reporter_loc.end()) {
+        reporters_by_subtree.insert("osd." + to_string(p->first));
+      } else {
+        reporters_by_subtree.insert(iter->second);
+      }
+      if (g_conf->mon_osd_adjust_heartbeat_grace) {
+        const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
+        utime_t elapsed = now - xi.down_stamp;
+        double decay = exp((double)elapsed * decay_k);
+        peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
+      }
+      ++p;
     } else {
-      reporters_by_subtree.insert(iter->second);
-    }
-    if (g_conf->mon_osd_adjust_heartbeat_grace) {
-      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
-      utime_t elapsed = now - xi.down_stamp;
-      double decay = exp((double)elapsed * decay_k);
-      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
+      fi.cancel_report(p->first);;
+      p = fi.reporters.erase(p);
     }
   }