// add a report
failure_info_t& fi = failure_info[target_osd];
fi.add_report(reporter, failed_since);
+
+ utime_t grace(g_conf->osd_heartbeat_grace, 0);
+ utime_t max_failed_since = fi.get_failed_since();
+
dout(10) << " osd." << target_osd << " has "
<< fi.reporters.size() << " reporters and "
- << fi.num_reports << " reports" << dendl;
+ << fi.num_reports << " reports, "
+ << grace << " grace, max_failed_since " << max_failed_since
+ << dendl;
- if (((int)fi.reporters.size() >= g_conf->osd_min_down_reporters) &&
+ if (max_failed_since + grace < now &&
+ ((int)fi.reporters.size() >= g_conf->osd_min_down_reporters) &&
(fi.num_reports >= g_conf->osd_min_down_reports)) {
dout(1) << " we have enough reports/reporters to mark osd." << target_osd << " down" << dendl;
pending_inc.new_state[target_osd] = CEPH_OSD_UP;
// Default constructor: starts with num_reports set to 0.
failure_info_t() : num_reports(0) {}
+ // Returns max_failed_since.  If that value currently equals a
+ // default-constructed utime_t while reporters is non-empty, it is first
+ // recomputed as the largest failed_since found among the reporters and
+ // stored back into max_failed_since, so later calls reuse it.
+ utime_t get_failed_since() {
+   // Nothing to recompute: either a value is already cached or there
+   // are no reporters to scan.
+   if (!(max_failed_since == utime_t()) || reporters.empty())
+     return max_failed_since;
+
+   // the old max must have canceled; recalculate.
+   map<int, failure_reporter_t>::iterator r = reporters.begin();
+   while (r != reporters.end()) {
+     if (r->second.failed_since > max_failed_since)
+       max_failed_since = r->second.failed_since;
+     ++r;
+   }
+   return max_failed_since;
+ }
+
void add_report(int who, utime_t failed_since) {
map<int, failure_reporter_t>::iterator p = reporters.find(who);
if (p == reporters.end()) {