cancel_probe_timeout();
timecheck_finish();
health_events_cleanup();
+ health_check_log_times.clear();
scrub_event_cancel();
leader_since = utime_t();
if (!g_conf->mon_health_to_clog) {
return;
}
+
+ const utime_t now = ceph_clock_now();
+
// FIXME: log atomically as part of @t instead of using clog.
dout(10) << __func__ << " updated " << updated.checks.size()
<< " previous " << previous.checks.size()
<< dendl;
+ const auto min_log_period = g_conf->get_val<int64_t>(
+ "mon_health_log_update_period");
for (auto& p : updated.checks) {
auto q = previous.checks.find(p.first);
+ bool logged = false;
if (q == previous.checks.end()) {
// new
ostringstream ss;
ss << "Health check failed: " << p.second.summary << " ("
<< p.first << ")";
clog->health(p.second.severity) << ss.str();
+
+ logged = true;
} else {
if (p.second.summary != q->second.summary ||
p.second.severity != q->second.severity) {
- // summary or severity changed (ignore detail changes at this level)
- ostringstream ss;
+
+ auto status_iter = health_check_log_times.find(p.first);
+ if (status_iter != health_check_log_times.end()) {
+ if (p.second.severity == q->second.severity &&
+ now - status_iter->second.updated_at < min_log_period) {
+ // We already logged this recently and the severity is unchanged,
+ // so skip emitting an update of the summary string.
+ // We'll get an update out of tick() later if the check
+ // is still failing.
+ continue;
+ }
+ }
+
+ // summary or severity changed (ignore detail changes at this level)
+ ostringstream ss;
ss << "Health check update: " << p.second.summary << " (" << p.first << ")";
clog->health(p.second.severity) << ss.str();
+
+ logged = true;
+ }
+ }
+ // Record the time at which we last logged, so that we can check this
+ // when considering whether/when to print update messages.
+ if (logged) {
+ auto iter = health_check_log_times.find(p.first);
+ if (iter == health_check_log_times.end()) {
+ health_check_log_times.emplace(p.first, HealthCheckLogStatus(
+ p.second.severity, p.second.summary, now));
+ } else {
+ iter->second = HealthCheckLogStatus(
+ p.second.severity, p.second.summary, now);
}
}
}
clog->info() << "Health check cleared: " << p.first << " (was: "
<< p.second.summary << ")";
}
+
+ if (health_check_log_times.count(p.first)) {
+ health_check_log_times.erase(p.first);
+ }
}
}
{
// ok go.
dout(11) << "tick" << dendl;
+ const utime_t now = ceph_clock_now();
+ // Check whether we need to emit any delayed health check update messages
+ if (is_leader()) {
+ const auto min_period = g_conf->get_val<int64_t>(
+ "mon_health_log_update_period");
+ for (auto& svc : paxos_service) {
+ auto health = svc->get_health_checks();
+
+ for (const auto &i : health.checks) {
+ const std::string &code = i.first;
+ const std::string &summary = i.second.summary;
+ const health_status_t severity = i.second.severity;
+
+ auto status_iter = health_check_log_times.find(code);
+ if (status_iter == health_check_log_times.end()) {
+ continue;
+ }
+
+ auto &log_status = status_iter->second;
+ bool const changed = log_status.last_message != summary
+ || log_status.severity != severity;
+
+ if (changed && now - log_status.updated_at > min_period) {
+ log_status.last_message = summary;
+ log_status.updated_at = now;
+ log_status.severity = severity;
+
+ ostringstream ss;
+ ss << "Health check update: " << summary << " (" << code << ")";
+ clog->health(severity) << ss.str();
+ }
+ }
+ }
+ }
+
+
for (vector<PaxosService*>::iterator p = paxos_service.begin(); p != paxos_service.end(); ++p) {
(*p)->tick();
(*p)->maybe_trim();
}
// trim sessions
- utime_t now = ceph_clock_now();
{
Mutex::Locker l(session_map_lock);
auto p = session_map.sessions.begin();
const health_check_map_t& previous,
MonitorDBStore::TransactionRef t);
+protected:
+
+ class HealthCheckLogStatus {  // What was last written to the cluster log for one health check, and when.
+ public:
+ health_status_t severity;  // severity the check had when it was last logged
+ std::string last_message;  // summary string that was last logged for the check
+ utime_t updated_at = 0;  // time of the last log message; the constructor below always overwrites this default
+ HealthCheckLogStatus(health_status_t severity_,  // severity_: severity that was just logged
+ const std::string &last_message_,  // last_message_: summary text that was just logged
+ utime_t updated_at_)  // updated_at_: time at which the message was logged
+ : severity(severity_),
+ last_message(last_message_),
+ updated_at(updated_at_)
+ {}  // nothing to do beyond storing the three values
+ };
+ std::map<std::string, HealthCheckLogStatus> health_check_log_times;
+
+public:
+
void get_cluster_status(stringstream &ss, Formatter *f);
void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);