]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/OSDMonitor: drop stale failure_info even if can_mark_down() 41982/head
author: Kefu Chai <kchai@redhat.com>
Tue, 25 May 2021 06:17:34 +0000 (14:17 +0800)
committer: Neha Ojha <nojha@redhat.com>
Tue, 22 Jun 2021 22:43:52 +0000 (22:43 +0000)
In a124ee85b03e15f4ea371358008ecac65f9f4e50, we added a check to drop
stale failure_info reports. But if the osdmap does not prohibit us from
marking the osd in question down, the branch that checks for stale info
is never executed. In general, marking an osd down is allowed, so
the fix in a124ee85b03e15f4ea371358008ecac65f9f4e50 simply fails to
work.

In this change, we check for a stale failure report for the osd in question
whenever the osd is not marked down in the same function. This should
address the issue of slow ops seen with failure reports.

Fixes: https://tracker.ceph.com/issues/50964
Signed-off-by: Kefu Chai <kchai@redhat.com>
(cherry picked from commit df6916a56841f89d66fd211729a0a7adc13042cf)

src/mon/OSDMonitor.cc

index 3ed70fb8755755f1e627b8f20b0aef7e48c78b5e..4ccc3a4e503f55015ea4c8630a9c3d34b3046eaf 100644 (file)
@@ -3183,8 +3183,9 @@ bool OSDMonitor::check_failures(utime_t now)
   auto p = failure_info.begin();
   while (p != failure_info.end()) {
     auto& [target_osd, fi] = *p;
-    if (can_mark_down(target_osd)) {
-      found_failure |= check_failure(now, target_osd, fi);
+    if (can_mark_down(target_osd) &&
+       check_failure(now, target_osd, fi)) {
+      found_failure = true;
       ++p;
     } else if (is_failure_stale(now, fi)) {
       dout(10) << " dropping stale failure_info for osd." << target_osd