git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon/HealthMonitor: allow muted alert counts to decrease but not increase
author: Sage Weil <sage@redhat.com>
Wed, 31 Jul 2019 07:04:20 +0000 (02:04 -0500)
committer: Sage Weil <sage@redhat.com>
Thu, 15 Aug 2019 01:40:08 +0000 (20:40 -0500)
If the summary starts with a digit, parse a count.

If the count goes up, clear the mute.

If the count goes down, update the mute so that we ratchet the threshold
down.

Signed-off-by: Sage Weil <sage@redhat.com>
src/mon/HealthMonitor.cc

index f0ac5412f84c251d82ecea9bd433778fda5efafe..7c51e025baa28672b4659633cce31df2127c4272 100644 (file)
@@ -387,15 +387,44 @@ bool HealthMonitor::check_mutes()
                        << " cleared (passed TTL " << p->second.ttl << ")";
       p = pending_mutes.erase(p);
       changed = true;
-    } else if (!p->second.sticky &&
-              all.checks.count(p->first) == 0) {
-      mon->clog->info() << "Health alert mute " << p->first
-                       << " cleared (health alert cleared)";
-      p = pending_mutes.erase(p);
-      changed = true;
-    } else {
-      ++p;
+      continue;
+    }
+    if (!p->second.sticky) {
+      auto q = all.checks.find(p->first);
+      if (q == all.checks.end()) {
+       mon->clog->info() << "Health alert mute " << p->first
+                         << " cleared (health alert cleared)";
+       p = pending_mutes.erase(p);
+       changed = true;
+       continue;
+      }
+      if (p->second.summary.size() && std::isdigit(p->second.summary[0])) {
+       int64_t mute_val = atoll(p->second.summary.c_str());
+       int64_t cur_val = atoll(q->second.summary.c_str());
+       if (cur_val > mute_val) {
+         mon->clog->info() << "Health alert mute " << p->first
+                           << " cleared (count increased from " << mute_val
+                           << " to " << cur_val << ")";
+         p = pending_mutes.erase(p);
+         changed = true;
+         continue;
+       }
+       if (p->second.summary != q->second.summary) {
+         // update summary string for good measure
+         p->second.summary = q->second.summary;
+         changed = true;
+       }
+      } else {
+       if (p->second.summary != q->second.summary) {
+         mon->clog->info() << "Health alert mute " << p->first
+                           << " cleared (summary changed)";
+         p = pending_mutes.erase(p);
+         changed = true;
+         continue;
+       }
+      }
     }
+    ++p;
   }
   return changed;
 }