mon/OSDMonitor: drop stale failure_info

author Kefu Chai <kchai@redhat.com>

Thu, 11 Mar 2021 13:13:13 +0000 (21:13 +0800)

committer Kefu Chai <kchai@redhat.com>

Fri, 7 May 2021 03:06:38 +0000 (11:06 +0800)
author Kefu Chai <kchai@redhat.com>
Thu, 11 Mar 2021 13:13:13 +0000 (21:13 +0800)
committer Kefu Chai <kchai@redhat.com>
Fri, 7 May 2021 03:06:38 +0000 (11:06 +0800)
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc

index cc3747b09d84532bcde431ce4a5e0d4423134650..848d94e8a2849c952a74f65e7d529753db7ac860 100644 (file)
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2906,8 +2906,15 @@ bool OSDMonitor::check_failures(utime_t now)
      auto& [target_osd, fi] = *p;
      if (can_mark_down(target_osd)) {
        found_failure |= check_failure(now, target_osd, fi);
+      ++p;
+    } else if (is_failure_stale(now, fi)) {
+      dout(10) << " dropping stale failure_info for osd." << target_osd
+              << " from " << fi.reporters.size() << " reporters"
+              << dendl;
+      p = failure_info.erase(p);
+    } else {
+      ++p;
      }
-    ++p;
    }
    return found_failure;
  }
@@ -3004,6 +3011,17 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
    return false;
  }
  
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+  // if it takes too long to either cancel the report or mark the osd down,
+  // some reporters must have failed to cancel their reports. let's just
+  // forget these reports.
+  const utime_t failed_for = now - fi.get_failed_since();
+  auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+  auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+  return failed_for >= (heartbeat_grace + heartbeat_stale);
+}
+
  void OSDMonitor::force_failure(int target_osd, int by)
  {
    // already pending failure?
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h

index b40b3e7f7f212ea26f0823577846db20b06a985e..3abba3bcfb61d14562ede831cd67cfa16682649b 100644 (file)
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -248,6 +248,7 @@ public:
  
    bool check_failures(utime_t now);
    bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
+  bool is_failure_stale(utime_t now, failure_info_t& fi) const;
    void force_failure(int target_osd, int by);
  
    bool _have_pending_crush();
author	Kefu Chai <kchai@redhat.com>
	Thu, 11 Mar 2021 13:13:13 +0000 (21:13 +0800)
committer	Kefu Chai <kchai@redhat.com>
	Fri, 7 May 2021 03:06:38 +0000 (11:06 +0800)
src/mon/OSDMonitor.cc		patch \| blob \| history
src/mon/OSDMonitor.h		patch \| blob \| history