auto& [target_osd, fi] = *p;
if (can_mark_down(target_osd)) {
found_failure |= check_failure(now, target_osd, fi);
+ ++p;
+ } else if (is_failure_stale(now, fi)) {
+ dout(10) << " dropping stale failure_info for osd." << target_osd
+ << " from " << fi.reporters.size() << " reporters"
+ << dendl;
+ p = failure_info.erase(p);
+ } else {
+ ++p;
}
- ++p;
}
return found_failure;
}
return false;
}
+bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
+{
+ // if it takes too long to either cancel the report or to mark the osd down,
+ // some reporters must have failed to cancel their reports. let's just
+ // forget these reports.
+ const utime_t failed_for = now - fi.get_failed_since();
+ auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
+ auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
+ return failed_for >= (heartbeat_grace + heartbeat_stale);
+}
+
void OSDMonitor::force_failure(int target_osd, int by)
{
// already pending failure?
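For reference, a minimal self-contained sketch (not part of the patch) of the erase-vs-advance iteration pattern the reworked loop in check_failures relies on; the map contents and the cutoff below are hypothetical stand-ins for failure_info, the report ages and the grace-plus-stale threshold:

#include <iostream>
#include <map>

int main() {
  // stand-in for failure_info: osd id -> seconds the failure report has been open
  std::map<int, int> failure_info{{1, 10}, {2, 700}, {3, 30}};
  const int cutoff = 620;  // hypothetical grace + stale threshold, in seconds

  for (auto p = failure_info.begin(); p != failure_info.end(); ) {
    if (p->second >= cutoff) {
      // erase() invalidates p but returns the iterator following the removed
      // element, so this branch must not also increment p afterwards
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  std::cout << failure_info.size() << " entries kept\n";  // prints "2 entries kept"
}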
bool check_failures(utime_t now);
bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const;
+ bool is_failure_stale(utime_t now, failure_info_t& fi) const;
void force_failure(int target_osd, int by);
bool _have_pending_crush();
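Below is an illustrative, standalone sketch of the staleness cutoff computed by is_failure_stale; only the two option names come from the patch, while get_val here is a hypothetical stand-in for cct->_conf.get_val<int64_t>() and the numbers are example values rather than shipped defaults:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// hypothetical stand-in for the monitor's config accessor
int64_t get_val(const std::string& key) {
  static const std::map<std::string, int64_t> conf{
    {"osd_heartbeat_grace", 20},   // example value
    {"osd_heartbeat_stale", 600},  // example value
  };
  return conf.at(key);
}

int main() {
  const int64_t failed_for = 700;  // seconds since the earliest failure report
  const int64_t cutoff =
      get_val("osd_heartbeat_grace") + get_val("osd_heartbeat_stale");
  // a report counts as stale once it has been outstanding longer than
  // grace + stale, i.e. it was neither cancelled nor led to a mark-down
  std::cout << std::boolalpha << (failed_for >= cutoff) << "\n";  // true (700 >= 620)
}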