OPTION(mon_sync_debug_provider, OPT_INT, -1) // monitor to be used as the sync provider
OPTION(mon_sync_debug_provider_fallback, OPT_INT, -1) // monitor to be used as fallback if sync provider fails
OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0) // inject N second delay on each get_chunk request
-OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs who need to report a down OSD for it to count
+OPTION(mon_osd_min_down_reporters, OPT_INT, 2) // number of OSDs from different subtrees who need to report a down OSD for it to count
+OPTION(mon_osd_reporter_subtree_level , OPT_STR, "host") // in which level of parent bucket the reporters are counted
OPTION(mon_osd_force_trim_to, OPT_INT, 0) // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
OPTION(mon_mds_force_trim_to, OPT_INT, 0) // force mon to trim mdsmaps to this point (dangerous, use with care)
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
+ set<string> reporters_by_subtree;
+ string reporter_subtree_level = g_conf->mon_osd_reporter_subtree_level;
utime_t orig_grace(g_conf->osd_heartbeat_grace, 0);
utime_t max_failed_since = fi.get_failed_since();
utime_t failed_for = now - max_failed_since;
for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
p != fi.reporters.end();
++p) {
+ // get the parent bucket whose type matches with "reporter_subtree_level".
+ // fall back to OSD if the level doesn't exist.
+ map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
+ map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
+ if (iter == reporter_loc.end()) {
+ reporters_by_subtree.insert("osd." + to_string(p->first));
+ } else {
+ reporters_by_subtree.insert(iter->second);
+ }
+
const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
utime_t elapsed = now - xi.down_stamp;
double decay = exp((double)elapsed * decay_k);
return true;
}
+
if (failed_for >= grace &&
- ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters)) {
+ (int)reporters_by_subtree.size() >= g_conf->mon_osd_min_down_reporters) {
dout(1) << " we have enough reporters to mark osd." << target_osd
<< " down" << dendl;
pending_inc.new_state[target_osd] = CEPH_OSD_UP;
mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
- << (int)fi.reporters.size() << " reporters after "
- << failed_for << " >= grace " << grace << ")\n";
+ << (int)reporters_by_subtree.size() << " reporters from different "
+ << reporter_subtree_level << " after "
+ << failed_for << " >= grace " << grace << ")\n";
return true;
}
return false;