From 422f70d69fa600585443c21de1fcd49c9d2bae5d Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 11 Mar 2021 21:13:13 +0800 Subject: [PATCH] mon/OSDMonitor: drop stale failure_info failure_info keeps strong references of the MOSDFailure messages sent by osd or peon monitors, whenever monitor starts to handle an MOSDFailure message, it registers it in its OpTracker. and the failure report message is unregistered when monitor acks them by either canceling them or replying the reporters with a new osdmap marking the target osd down. but if this does not happen, the failure reports just pile up in OpTracker. and monitor considers them as slow ops. and they are reported as SLOW_OPS health warning. in theory, it does not take long to mark an unresponsive osd down if we have enough reporters. but there is a chance that a reporter fails to cancel its report before it reboots, and the monitor also fails to collect enough reports and mark the target osd down. so the target osd never gets an osdmap marking it down, so it won't send an alive message to monitor to fix this. in this change, we check for the stale failure info in tick(), and simply drop the stale reports. so the messages can be released and marked "done". Fixes: https://tracker.ceph.com/issues/47380 Signed-off-by: Kefu Chai (cherry picked from commit a124ee85b03e15f4ea371358008ecac65f9f4e50) --- src/mon/OSDMonitor.cc | 20 +++++++++++++++++++- src/mon/OSDMonitor.h | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index d776db14bec..833e7c4cc13 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -3104,8 +3104,15 @@ bool OSDMonitor::check_failures(utime_t now) auto& [target_osd, fi] = *p; if (can_mark_down(target_osd)) { found_failure |= check_failure(now, target_osd, fi); + ++p; + } else if (is_failure_stale(now, fi)) { + dout(10) << " dropping stale failure_info for osd." 
<< target_osd + << " from " << fi.reporters.size() << " reporters" + << dendl; + p = failure_info.erase(p); + } else { + ++p; } - ++p; } return found_failure; } @@ -3209,6 +3216,17 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) return false; } +bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const +{ + // if it takes too long to either cancel the report or mark the osd down, + // some reporters must have failed to cancel their reports. let's just + // forget these reports. + const utime_t failed_for = now - fi.get_failed_since(); + auto heartbeat_grace = cct->_conf.get_val("osd_heartbeat_grace"); + auto heartbeat_stale = cct->_conf.get_val("osd_heartbeat_stale"); + return failed_for >= (heartbeat_grace + heartbeat_stale); +} + void OSDMonitor::force_failure(int target_osd, int by) { // already pending failure? diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 21e1c0fc2a0..fb941a6a0e5 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -241,6 +241,7 @@ public: bool check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); utime_t get_grace_time(utime_t now, int target_osd, failure_info_t& fi) const; + bool is_failure_stale(utime_t now, failure_info_t& fi) const; void force_failure(int target_osd, int by); bool _have_pending_crush(); -- 2.47.3