From 83e58257a913bae88b32a57c57469262cd7d16b3 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Wed, 4 Aug 2010 16:02:17 -0700 Subject: [PATCH] osdmon: Adjust failure reporting. MOSDFailure can now be a failure or a not-failure report. If it is a failure, OSDMon will add it to a map of failure reports until a given OSD exceeds the minimum reports/reporters to qualify as failed, then marks it down. Also, only propose new actually marking an osd failed OSD behavior is not changed, and currently the min reports/reporters are set to 1 so external OSDMon behavior will also stay the same. --- src/config.cc | 2 + src/config.h | 2 + src/messages/MOSDFailure.h | 19 +++++--- src/mon/OSDMonitor.cc | 90 ++++++++++++++++++++++++++------------ src/mon/OSDMonitor.h | 1 + 5 files changed, 81 insertions(+), 33 deletions(-) diff --git a/src/config.cc b/src/config.cc index 9ab1614d7cc93..2418e86d07539 100644 --- a/src/config.cc +++ b/src/config.cc @@ -482,6 +482,8 @@ static struct config_option config_optionsp[] = { OPTION(osd_mon_heartbeat_interval, 0, OPT_INT, 30), // if no peers, ping monitor OPTION(osd_heartbeat_grace, 0, OPT_INT, 20), OPTION(osd_mon_report_interval, 0, OPT_INT, 5), // pg stats, failures, up_thru, boot. + OPTION(osd_min_down_reporters, 0, OPT_INT, 1), // number of OSDs who need to report a down OSD for it to count + OPTION(osd_min_down_reports, 0, OPT_INT, 1), // number of times a down OSD must be reported for it to count OPTION(osd_replay_window, 0, OPT_INT, 45), OPTION(osd_max_pull, 0, OPT_INT, 2), OPTION(osd_preserve_trimmed_log, 0, OPT_BOOL, true), diff --git a/src/config.h b/src/config.h index b8db0352f3296..3d42a347d45eb 100644 --- a/src/config.h +++ b/src/config.h @@ -343,6 +343,8 @@ struct md_config_t { int osd_mon_heartbeat_interval; int osd_heartbeat_grace; int osd_mon_report_interval; + int osd_min_down_reporters; + int osd_min_down_reports; int osd_replay_window; int osd_max_pull; bool osd_preserve_trimmed_log; diff --git a/src/messages/MOSDFailure.h b/src/messages/MOSDFailure.h index 111e3c1d7f3f0..0f68f5b51d61d 100644 --- a/src/messages/MOSDFailure.h +++ b/src/messages/MOSDFailure.h @@ -22,37 +22,44 @@ class MOSDFailure : public PaxosServiceMessage { public: ceph_fsid_t fsid; - entity_inst_t failed; + entity_inst_t target_osd; + __u8 is_failed; epoch_t epoch; MOSDFailure() : PaxosServiceMessage(MSG_OSD_FAILURE, 0) {} MOSDFailure(const ceph_fsid_t &fs, entity_inst_t f, epoch_t e) : PaxosServiceMessage(MSG_OSD_FAILURE, e), - fsid(fs), failed(f), epoch(e) {} + fsid(fs), target_osd(f), is_failed(true), epoch(e) {} private: ~MOSDFailure() {} public: - entity_inst_t get_failed() { return failed; } + entity_inst_t get_target() { return target_osd; } + bool if_osd_failed() { return is_failed; } epoch_t get_epoch() { return epoch; } void decode_payload() { bufferlist::iterator p = payload.begin(); paxos_decode(p); ::decode(fsid, p); - ::decode(failed, p); + ::decode(target_osd, p); ::decode(epoch, p); + if (header.version >=2) + ::decode(is_failed, p); + else is_failed = true; } void encode_payload() { + header.version = 2; paxos_encode(); ::encode(fsid, payload); - ::encode(failed, payload); + ::encode(target_osd, payload); ::encode(epoch, payload); + ::encode(is_failed, payload); } const char *get_type_name() { return "osd_failure"; } void print(ostream& out) { - out << "osd_failure(" << failed << " e" << epoch << " v" << version << ")"; + out << "osd_failure(" << target_osd << " e" << epoch << " v" << version << ")"; } }; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 6411137eb2934..bd8a6f9b7a25b 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -311,8 +311,8 @@ bool OSDMonitor::should_propose(double& delay) bool OSDMonitor::preprocess_failure(MOSDFailure *m) { - // who is failed - int badboy = m->get_failed().name.num(); + // who is target_osd + int badboy = m->get_target().name.num(); // check permissions MonSession *session = m->get_session(); @@ -329,14 +329,6 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m) goto didit; } - /* - * FIXME - * this whole thing needs a rework of some sort. we shouldn't - * be taking any failure report on faith. if A and B can't talk - * to each other either A or B should be killed, but we should - * make some attempt to make sure we choose the right one. - */ - // first, verify the reporting host is valid if (m->get_orig_source().is_osd()) { int from = m->get_orig_source().num(); @@ -352,13 +344,13 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m) // weird? if (!osdmap.have_inst(badboy)) { - dout(5) << "preprocess_failure dne(/dup?): " << m->get_failed() << ", from " << m->get_orig_source_inst() << dendl; + dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl; if (m->get_epoch() < osdmap.get_epoch()) send_incremental(m, m->get_epoch()+1); goto didit; } - if (osdmap.get_inst(badboy) != m->get_failed()) { - dout(5) << "preprocess_failure wrong osd: report " << m->get_failed() << " != map's " << osdmap.get_inst(badboy) + if (osdmap.get_inst(badboy) != m->get_target()) { + dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy) << ", from " << m->get_orig_source_inst() << dendl; if (m->get_epoch() < osdmap.get_epoch()) send_incremental(m, m->get_epoch()+1); @@ -366,13 +358,13 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m) } // already reported? if (osdmap.is_down(badboy)) { - dout(5) << "preprocess_failure dup: " << m->get_failed() << ", from " << m->get_orig_source_inst() << dendl; + dout(5) << "preprocess_failure dup: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl; if (m->get_epoch() < osdmap.get_epoch()) send_incremental(m, m->get_epoch()+1); goto didit; } - dout(10) << "preprocess_failure new: " << m->get_failed() << ", from " << m->get_orig_source_inst() << dendl; + dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl; return false; didit: @@ -383,27 +375,71 @@ bool OSDMonitor::preprocess_failure(MOSDFailure *m) bool OSDMonitor::prepare_failure(MOSDFailure *m) { stringstream ss; - dout(1) << "prepare_failure " << m->get_failed() << " from " << m->get_orig_source_inst() << dendl; + dout(1) << "prepare_failure " << m->get_target() << " from " << m->get_orig_source_inst() + << " is reporting failure:" << m->if_osd_failed() << dendl; - ss << m->get_failed() << " failed (by " << m->get_orig_source_inst() << ")"; + ss << m->get_target() << " failed (by " << m->get_orig_source_inst() << ")"; mon->get_logclient()->log(LOG_INFO, ss); - // FIXME - // take their word for it - int badboy = m->get_failed().name.num(); - assert(osdmap.is_up(badboy)); - assert(osdmap.get_addr(badboy) == m->get_failed().addr); - - pending_inc.new_down[badboy] = false; + int target_osd = m->get_target().name.num(); + int reporter = m->get_orig_source().num(); + assert(osdmap.is_up(target_osd)); + assert(osdmap.get_addr(target_osd) == m->get_target().addr); - paxos->wait_for_commit(new C_Reported(this, m)); + if (m->if_osd_failed()) { + int reports = 0; + int reporters = 0; + + if (failed_notes.count(target_osd)) { + multimap >::iterator i = failed_notes.lower_bound(target_osd); + while ((i != failed_notes.end()) && (i->first == target_osd)) { + if (i->second.first == reporter) { + ++i->second.second; + dout(10) << "adding new failure report from osd" << reporter + << " on osd" << target_osd << dendl; + reporter = -1; + } + ++reporters; + reports += i->second.second; + ++i; + } + } + if (reporter != -1) { //didn't get counted yet + failed_notes.insert(pair > + (target_osd, pair(reporter, 1))); + ++reporters; + ++reports; + dout(10) << "osd" << reporter + << " is adding failure report on osd" << target_osd << dendl; + } + + if ((reporters >= g_conf.osd_min_down_reporters) && + (reports >= g_conf.osd_min_down_reports)) { + dout(1) << "have enough reports/reporters to mark osd" << target_osd + << " as down" << dendl; + pending_inc.new_down[target_osd] = false; + paxos->wait_for_commit(new C_Reported(this, m)); + //clear out failure reports + failed_notes.erase(failed_notes.lower_bound(target_osd), + failed_notes.upper_bound(target_osd)); + return true; + } + } else { //remove the report + multimap >::iterator i = failed_notes.lower_bound(target_osd); + while ((i->first == target_osd) && (i->second.first != reporter)) + ++i; + if (i->second.first != reporter) + dout(0) << "got an OSD not-failed report from osd" << reporter + << " that hasn't reported failure! (or in previous epoch?)" << dendl; + else failed_notes.erase(i); + } - return true; + return false; } void OSDMonitor::_reported_failure(MOSDFailure *m) { - dout(7) << "_reported_failure on " << m->get_failed() << ", telling " << m->get_orig_source_inst() << dendl; + dout(7) << "_reported_failure on " << m->get_target() << ", telling " << m->get_orig_source_inst() << dendl; send_latest(m, m->get_epoch()); } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index f649aa331039c..1e98b2ed43d3c 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -45,6 +45,7 @@ private: // [leader] OSDMap::Incremental pending_inc; + multimap > failed_notes; // > map down_pending_out; // osd down -> out map osd_weight; -- 2.39.5