OPTION(osd_mon_heartbeat_interval, 0, OPT_INT, 30), // if no peers, ping monitor
OPTION(osd_heartbeat_grace, 0, OPT_INT, 20),
OPTION(osd_mon_report_interval, 0, OPT_INT, 5), // pg stats, failures, up_thru, boot.
+ OPTION(osd_min_down_reporters, 0, OPT_INT, 1), // number of OSDs who need to report a down OSD for it to count
+ OPTION(osd_min_down_reports, 0, OPT_INT, 1), // number of times a down OSD must be reported for it to count
OPTION(osd_replay_window, 0, OPT_INT, 45),
OPTION(osd_max_pull, 0, OPT_INT, 2),
OPTION(osd_preserve_trimmed_log, 0, OPT_BOOL, true),
class MOSDFailure : public PaxosServiceMessage {
public:
ceph_fsid_t fsid;
- entity_inst_t failed;
+ entity_inst_t target_osd;
+ __u8 is_failed;
epoch_t epoch;
MOSDFailure() : PaxosServiceMessage(MSG_OSD_FAILURE, 0) {}
MOSDFailure(const ceph_fsid_t &fs, entity_inst_t f, epoch_t e) :
PaxosServiceMessage(MSG_OSD_FAILURE, e),
- fsid(fs), failed(f), epoch(e) {}
+ fsid(fs), target_osd(f), is_failed(true), epoch(e) {}
private:
~MOSDFailure() {}
public:
- entity_inst_t get_failed() { return failed; }
+ entity_inst_t get_target() { return target_osd; }
+ bool if_osd_failed() { return is_failed; }
epoch_t get_epoch() { return epoch; }
void decode_payload() {
bufferlist::iterator p = payload.begin();
paxos_decode(p);
::decode(fsid, p);
- ::decode(failed, p);
+ ::decode(target_osd, p);
::decode(epoch, p);
+ if (header.version >=2)
+ ::decode(is_failed, p);
+ else is_failed = true;
}
void encode_payload() {
+ header.version = 2;
paxos_encode();
::encode(fsid, payload);
- ::encode(failed, payload);
+ ::encode(target_osd, payload);
::encode(epoch, payload);
+ ::encode(is_failed, payload);
}
const char *get_type_name() { return "osd_failure"; }
void print(ostream& out) {
- out << "osd_failure(" << failed << " e" << epoch << " v" << version << ")";
+ out << "osd_failure(" << target_osd << " e" << epoch << " v" << version << ")";
}
};
bool OSDMonitor::preprocess_failure(MOSDFailure *m)
{
- // who is failed
- int badboy = m->get_failed().name.num();
+ // who is target_osd
+ int badboy = m->get_target().name.num();
// check permissions
MonSession *session = m->get_session();
goto didit;
}
- /*
- * FIXME
- * this whole thing needs a rework of some sort. we shouldn't
- * be taking any failure report on faith. if A and B can't talk
- * to each other either A or B should be killed, but we should
- * make some attempt to make sure we choose the right one.
- */
-
// first, verify the reporting host is valid
if (m->get_orig_source().is_osd()) {
int from = m->get_orig_source().num();
// weird?
if (!osdmap.have_inst(badboy)) {
- dout(5) << "preprocess_failure dne(/dup?): " << m->get_failed() << ", from " << m->get_orig_source_inst() << dendl;
+ dout(5) << "preprocess_failure dne(/dup?): " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
if (m->get_epoch() < osdmap.get_epoch())
send_incremental(m, m->get_epoch()+1);
goto didit;
}
- if (osdmap.get_inst(badboy) != m->get_failed()) {
- dout(5) << "preprocess_failure wrong osd: report " << m->get_failed() << " != map's " << osdmap.get_inst(badboy)
+ if (osdmap.get_inst(badboy) != m->get_target()) {
+ dout(5) << "preprocess_failure wrong osd: report " << m->get_target() << " != map's " << osdmap.get_inst(badboy)
<< ", from " << m->get_orig_source_inst() << dendl;
if (m->get_epoch() < osdmap.get_epoch())
send_incremental(m, m->get_epoch()+1);
}
// already reported?
if (osdmap.is_down(badboy)) {
- dout(5) << "preprocess_failure dup: " << m->get_failed() << ", from " << m->get_orig_source_inst() << dendl;
+ dout(5) << "preprocess_failure dup: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
if (m->get_epoch() < osdmap.get_epoch())
send_incremental(m, m->get_epoch()+1);
goto didit;
}
- dout(10) << "preprocess_failure new: " << m->get_failed() << ", from " << m->get_orig_source_inst() << dendl;
+ dout(10) << "preprocess_failure new: " << m->get_target() << ", from " << m->get_orig_source_inst() << dendl;
return false;
didit:
bool OSDMonitor::prepare_failure(MOSDFailure *m)
{
stringstream ss;
- dout(1) << "prepare_failure " << m->get_failed() << " from " << m->get_orig_source_inst() << dendl;
+ dout(1) << "prepare_failure " << m->get_target() << " from " << m->get_orig_source_inst()
+ << " is reporting failure:" << m->if_osd_failed() << dendl;
- ss << m->get_failed() << " failed (by " << m->get_orig_source_inst() << ")";
+ ss << m->get_target() << " failed (by " << m->get_orig_source_inst() << ")";
mon->get_logclient()->log(LOG_INFO, ss);
- // FIXME
- // take their word for it
- int badboy = m->get_failed().name.num();
- assert(osdmap.is_up(badboy));
- assert(osdmap.get_addr(badboy) == m->get_failed().addr);
-
- pending_inc.new_down[badboy] = false;
+ int target_osd = m->get_target().name.num();
+ int reporter = m->get_orig_source().num();
+ assert(osdmap.is_up(target_osd));
+ assert(osdmap.get_addr(target_osd) == m->get_target().addr);
- paxos->wait_for_commit(new C_Reported(this, m));
+ if (m->if_osd_failed()) {
+ int reports = 0;
+ int reporters = 0;
+
+ if (failed_notes.count(target_osd)) {
+ multimap<int, pair<int, int> >::iterator i = failed_notes.lower_bound(target_osd);
+ while ((i != failed_notes.end()) && (i->first == target_osd)) {
+ if (i->second.first == reporter) {
+ ++i->second.second;
+ dout(10) << "adding new failure report from osd" << reporter
+ << " on osd" << target_osd << dendl;
+ reporter = -1;
+ }
+ ++reporters;
+ reports += i->second.second;
+ ++i;
+ }
+ }
+ if (reporter != -1) { //didn't get counted yet
+ failed_notes.insert(pair<int, pair<int, int> >
+ (target_osd, pair<int, int>(reporter, 1)));
+ ++reporters;
+ ++reports;
+ dout(10) << "osd" << reporter
+ << " is adding failure report on osd" << target_osd << dendl;
+ }
+
+ if ((reporters >= g_conf.osd_min_down_reporters) &&
+ (reports >= g_conf.osd_min_down_reports)) {
+ dout(1) << "have enough reports/reporters to mark osd" << target_osd
+ << " as down" << dendl;
+ pending_inc.new_down[target_osd] = false;
+ paxos->wait_for_commit(new C_Reported(this, m));
+ //clear out failure reports
+ failed_notes.erase(failed_notes.lower_bound(target_osd),
+ failed_notes.upper_bound(target_osd));
+ return true;
+ }
+ } else { //remove the report
+ multimap<int, pair<int, int> >::iterator i = failed_notes.lower_bound(target_osd);
+ while ((i->first == target_osd) && (i->second.first != reporter))
+ ++i;
+ if (i->second.first != reporter)
+ dout(0) << "got an OSD not-failed report from osd" << reporter
+ << " that hasn't reported failure! (or in previous epoch?)" << dendl;
+ else failed_notes.erase(i);
+ }
- return true;
+ return false;
}
void OSDMonitor::_reported_failure(MOSDFailure *m)
{
- dout(7) << "_reported_failure on " << m->get_failed() << ", telling " << m->get_orig_source_inst() << dendl;
+ dout(7) << "_reported_failure on " << m->get_target() << ", telling " << m->get_orig_source_inst() << dendl;
send_latest(m, m->get_epoch());
}