OPTION(mon_sync_debug_provider_fallback, OPT_INT, -1) // monitor to be used as fallback if sync provider fails
OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0) // inject N second delay on each get_chunk request
OPTION(mon_osd_min_down_reporters, OPT_INT, 1) // number of OSDs who need to report a down OSD for it to count
-OPTION(mon_osd_min_down_reports, OPT_INT, 3) // number of times a down OSD must be reported for it to count
OPTION(mon_osd_force_trim_to, OPT_INT, 0) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
OPTION(mon_mds_force_trim_to, OPT_INT, 0) // force mon to trim mdsmaps to this point (dangerous, use with care)
}
dout(10) << " osd." << target_osd << " has "
- << fi.reporters.size() << " reporters and "
- << fi.num_reports << " reports, "
- << grace << " grace (" << orig_grace << " + " << my_grace << " + " << peer_grace << "), max_failed_since " << max_failed_since
+ << fi.reporters.size() << " reporters, "
+ << grace << " grace (" << orig_grace << " + " << my_grace
+ << " + " << peer_grace << "), max_failed_since " << max_failed_since
<< dendl;
// already pending failure?
}
if (failed_for >= grace &&
- ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters) &&
- (fi.num_reports >= g_conf->mon_osd_min_down_reports)) {
- dout(1) << " we have enough reports/reporters to mark osd." << target_osd << " down" << dendl;
+ ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters)) {
+ dout(1) << " we have enough reporters to mark osd." << target_osd
+ << " down" << dendl;
pending_inc.new_state[target_osd] = CEPH_OSD_UP;
mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
- << fi.num_reports << " reports from " << (int)fi.reporters.size() << " peers after "
+ << (int)fi.reporters.size() << " reporters after "
<< failed_for << " >= grace " << grace << ")\n";
return true;
}
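To make the simplified condition concrete: once the failure has lasted at least the grace period, only the number of distinct reporting peers is compared against the mon_osd_min_down_reporters option declared above; a per-reporter report count no longer factors in. Below is a minimal standalone sketch of that test, with plain parameters standing in for the g_conf accessors and failure_info_t. It is illustrative only, not the OSDMonitor code.

#include <cassert>
#include <set>

// Illustrative model of the check above: failed_for must reach the
// (possibly adjusted) grace, and the number of distinct reporters must
// be at least mon_osd_min_down_reporters.
static bool enough_to_mark_down(double failed_for, double grace,
                                const std::set<int>& reporters,
                                int min_down_reporters)
{
  return failed_for >= grace &&
         (int)reporters.size() >= min_down_reporters;
}

int main()
{
  std::set<int> reporters = {3, 7};                        // two distinct peers
  assert(enough_to_mark_down(25.0, 20.0, reporters, 1));   // grace elapsed
  assert(!enough_to_mark_down(10.0, 20.0, reporters, 1));  // grace not yet up
  return 0;
}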
{
op->mark_osdmon_event(__func__);
MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
- dout(1) << "prepare_failure " << m->get_target() << " from " << m->get_orig_source_inst()
+ dout(1) << "prepare_failure " << m->get_target()
+ << " from " << m->get_orig_source_inst()
<< " is reporting failure:" << m->if_osd_failed() << dendl;
int target_osd = m->get_target().name.num();
// calculate failure time
utime_t now = ceph_clock_now(g_ceph_context);
- utime_t failed_since = m->get_recv_stamp() - utime_t(m->failed_for ? m->failed_for : g_conf->osd_heartbeat_grace, 0);
+ utime_t failed_since =
+ m->get_recv_stamp() -
+ utime_t(m->failed_for ? m->failed_for : g_conf->osd_heartbeat_grace, 0);
if (m->if_osd_failed()) {
// add a report
} else {
// remove the report
mon->clog->debug() << m->get_target() << " failure report canceled by "
- << m->get_orig_source_inst() << "\n";
+ << m->get_orig_source_inst() << "\n";
if (failure_info.count(target_osd)) {
failure_info_t& fi = failure_info[target_osd];
list<MonOpRequestRef> ls;
ls.pop_front();
}
if (fi.reporters.empty()) {
- dout(10) << " removing last failure_info for osd." << target_osd << dendl;
+ dout(10) << " removing last failure_info for osd." << target_osd
+ << dendl;
failure_info.erase(target_osd);
} else {
dout(10) << " failure_info for osd." << target_osd << " now "
- << fi.reporters.size() << " reporters and "
- << fi.num_reports << " reports" << dendl;
+ << fi.reporters.size() << " reporters" << dendl;
}
} else {
dout(10) << " no failure_info for osd." << target_osd << dendl;
/// information about a particular peer's failure reports for one osd
struct failure_reporter_t {
- int num_reports; ///< reports from this reporter
utime_t failed_since; ///< when they think it failed
- MonOpRequestRef op; ///< most recent failure op request
+ MonOpRequestRef op; ///< failure op request
- failure_reporter_t() : num_reports(0) {}
- failure_reporter_t(utime_t s) : num_reports(1), failed_since(s) {}
+ failure_reporter_t() {}
+ failure_reporter_t(utime_t s) : failed_since(s) {}
~failure_reporter_t() { }
};
/// information about all failure reports for one osd
struct failure_info_t {
- map<int, failure_reporter_t> reporters; ///< reporter -> # reports
+ map<int, failure_reporter_t> reporters; ///< reporter -> failed_since, op
utime_t max_failed_since; ///< most recent failed_since
- int num_reports;
- failure_info_t() : num_reports(0) {}
+ failure_info_t() {}
utime_t get_failed_since() {
if (max_failed_since == utime_t() && !reporters.empty()) {
// set the message for the latest report. return any old op request we had,
// if any, so we can discard it.
MonOpRequestRef add_report(int who, utime_t failed_since,
- MonOpRequestRef op) {
+ MonOpRequestRef op) {
map<int, failure_reporter_t>::iterator p = reporters.find(who);
if (p == reporters.end()) {
if (max_failed_since == utime_t())
  max_failed_since = failed_since;
else if (max_failed_since < failed_since)
  max_failed_since = failed_since;
p = reporters.insert(map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first;
- } else {
- p->second.num_reports++;
}
- num_reports++;
MonOpRequestRef ret = p->second.op;
p->second.op = op;
map<int, failure_reporter_t>::iterator p = reporters.find(who);
if (p == reporters.end())
return;
- num_reports -= p->second.num_reports;
reporters.erase(p);
if (reporters.empty())
max_failed_since = utime_t();
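Putting the trimmed-down bookkeeping together: add_report keeps one entry per reporter, updates max_failed_since only for a new reporter, and hands back whatever op it replaced so the caller can discard it; cancel_report erases the reporter and resets max_failed_since once none remain. The following toy model mirrors that behaviour under simplified types (ints stand in for utime_t and MonOpRequestRef); it is a sketch of the idea, not the real header.

#include <cassert>
#include <map>

// Toy model of the trimmed failure_info_t; only the bookkeeping
// relevant to this patch is shown.
struct toy_failure_info {
  std::map<int, int> reporters;   // reporter osd id -> latest op id
  int max_failed_since = 0;

  // Record a report; return the op it supersedes (0 if none) so the
  // caller can discard it, mirroring add_report() above.
  int add_report(int who, int failed_since, int op) {
    auto p = reporters.find(who);
    if (p == reporters.end()) {
      if (failed_since > max_failed_since)
        max_failed_since = failed_since;
      p = reporters.insert({who, 0}).first;
    }
    int old = p->second;
    p->second = op;
    return old;
  }

  void cancel_report(int who) {
    reporters.erase(who);
    if (reporters.empty())
      max_failed_since = 0;
  }
};

int main()
{
  toy_failure_info fi;
  assert(fi.add_report(3, 100, 1) == 0);  // first report from osd.3
  assert(fi.add_report(3, 100, 2) == 1);  // repeat: old op returned for discard
  fi.add_report(7, 101, 3);
  assert(fi.reporters.size() == 2);       // what the reporter threshold sees
  fi.cancel_report(3);
  fi.cancel_report(7);
  assert(fi.max_failed_since == 0);       // reset once the last reporter is gone
  return 0;
}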