From 7385e917bbcc6805610c7b663133970b445d53b1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 31 Jul 2019 04:51:38 -0500 Subject: [PATCH] mon/HealthCheck: check mutes based on count, not parsing the summary string This is more explicit and robust, and works with the PG warnings, which don't conform to the "%d ..." form that the other messages do. Signed-off-by: Sage Weil --- src/mon/HealthMonitor.cc | 24 +++++++++++++++--------- src/mon/health_check.h | 4 ++++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index a750eba9ef0ec..dac93bb9438b9 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -297,6 +297,7 @@ bool HealthMonitor::prepare_command(MonOpRequestRef op) health_check_map_t all; gather_all_health_checks(&all); string summary; + int64_t count = 0; if (!sticky) { auto p = all.checks.find(code); if (p == all.checks.end()) { @@ -304,6 +305,7 @@ bool HealthMonitor::prepare_command(MonOpRequestRef op) ss << "health alert " << code << " is not currently raised"; goto out; } + count = p->second.count; summary = p->second.summary; } auto& m = pending_mutes[code]; @@ -311,6 +313,7 @@ bool HealthMonitor::prepare_command(MonOpRequestRef op) m.ttl = ttl; m.sticky = sticky; m.summary = summary; + m.count = count; } else if (prefix == "health unmute") { string code; if (cmd_getval(g_ceph_context, cmdmap, "code", code)) { @@ -398,23 +401,26 @@ bool HealthMonitor::check_mutes() changed = true; continue; } - if (p->second.summary.size() && std::isdigit(p->second.summary[0])) { - int64_t mute_val = atoll(p->second.summary.c_str()); - int64_t cur_val = atoll(q->second.summary.c_str()); - if (cur_val > mute_val) { + if (p->second.count) { + // count-based mute + if (q->second.count > p->second.count) { mon->clog->info() << "Health alert mute " << p->first - << " cleared (count increased from " << mute_val - << " to " << cur_val << ")"; + << " cleared (count increased from " << 
p->second.count + << " to " << q->second.count << ")"; p = pending_mutes.erase(p); changed = true; continue; } - if (p->second.summary != q->second.summary) { - // update summary string for good measure - p->second.summary = q->second.summary; + if (q->second.count < p->second.count) { + // ratchet down the mute + dout(10) << __func__ << " mute " << p->first << " count " + << p->second.count << " -> " << q->second.count + << dendl; + p->second.count = q->second.count; changed = true; } } else { + // summary-based mute if (p->second.summary != q->second.summary) { mon->clog->info() << "Health alert mute " << p->first << " cleared (summary changed)"; diff --git a/src/mon/health_check.h b/src/mon/health_check.h index 107f67293d356..bff9166b12da2 100644 --- a/src/mon/health_check.h +++ b/src/mon/health_check.h @@ -75,6 +75,7 @@ struct health_mute_t { utime_t ttl; bool sticky = false; string summary; + int64_t count; DENC(health_mute_t, v, p) { DENC_START(1, 1, p); @@ -82,6 +83,7 @@ struct health_mute_t { denc(v.ttl, p); denc(v.sticky, p); denc(v.summary, p); + denc(v.count, p); DENC_FINISH(p); } @@ -92,6 +94,7 @@ struct health_mute_t { } f->dump_bool("sticky", sticky); f->dump_string("summary", summary); + f->dump_int("count", count); } static void generate_test_instances(std::list& ls) { @@ -101,6 +104,7 @@ struct health_mute_t { ls.back()->ttl = utime_t(1, 2); ls.back()->sticky = true; ls.back()->summary = "foo bar"; + ls.back()->count = 2; } }; WRITE_CLASS_DENC(health_mute_t) -- 2.39.5