mon/OSDMonitor: drop stale failure_info

author Kefu Chai <kchai@redhat.com>

Thu, 11 Mar 2021 13:13:13 +0000 (21:13 +0800)

committer singuliere <singuliere@autistici.org>

Thu, 1 Apr 2021 21:06:54 +0000 (23:06 +0200)
author Kefu Chai <kchai@redhat.com>
Thu, 11 Mar 2021 13:13:13 +0000 (21:13 +0800)
committer singuliere <singuliere@autistici.org>
Thu, 1 Apr 2021 21:06:54 +0000 (23:06 +0200)
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc

index d776db14bec25e2bf8abfa91c4cfc890e77431cf..833e7c4cc134f6fce6c441901431b2c229842eeb 100644 (file)
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -3104,8 +3104,15 @@ bool OSDMonitor::check_failures(utime_t now)
      auto& [target_osd, fi] = *p;
      if (can_mark_down(target_osd)) {
        found_failure |= check_failure(now, target_osd, fi);
+      ++p;
+    } else if (is_failure_stale(now, fi)) {
+      dout(10) << " dropping stale failure_info for osd." << target_osd
+              << " from " << fi.reporters.size() << " reporters"
+              << dendl;
+      p = failure_info.erase(p);
+    } else {
+      ++p;
      }
-    ++p;
    }
    return found_failure;
  }
@@ -3209,6 +3216,17 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
    return false;
  }
  
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+  // if it takes too long to either cancel the report or mark the osd down,
+  // some reporters must have failed to cancel their reports. treat the report
+  // as stale once failed_for >= osd_heartbeat_grace + osd_heartbeat_stale.
+  const utime_t failed_for = now - fi.get_failed_since();
+  auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+  auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+  return failed_for >= (heartbeat_grace + heartbeat_stale);
+}
+
  void OSDMonitor::force_failure(int target_osd, int by)
  {
    // already pending failure?
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h

index 21e1c0fc2a0ef34d808f1be1e5248a01503c9db2..fb941a6a0e56ef740a4dd75f514b92e7a7491a13 100644 (file)
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -241,6 +241,7 @@ public:
    bool check_failures(utime_t now);
    bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
    utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
+  bool is_failure_stale(utime_t now, failure_info_t& fi) const;
    void force_failure(int target_osd, int by);
  
    bool _have_pending_crush();
author	Kefu Chai <kchai@redhat.com>
	Thu, 11 Mar 2021 13:13:13 +0000 (21:13 +0800)
committer	singuliere <singuliere@autistici.org>
	Thu, 1 Apr 2021 21:06:54 +0000 (23:06 +0200)
src/mon/OSDMonitor.cc		patch \| blob \| history
src/mon/OSDMonitor.h		patch \| blob \| history