From 4f1792d7694e065e3bcdb48d94d66e936c390daf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Sep 2012 14:37:55 -0700 Subject: [PATCH] osd: include failed_for in MOSDFailure reports The monitor will need this to dynamically adjust the heartbeat grace. Closes: #3044 Signed-off-by: Sage Weil --- src/messages/MOSDFailure.h | 16 +++++++++++----- src/osd/OSD.cc | 17 +++++++++++------ src/osd/OSD.h | 5 +---- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/messages/MOSDFailure.h b/src/messages/MOSDFailure.h index 3b8ed0b4a2cce..807ab2b679f2a 100644 --- a/src/messages/MOSDFailure.h +++ b/src/messages/MOSDFailure.h @@ -21,18 +21,19 @@ class MOSDFailure : public PaxosServiceMessage { - static const int HEAD_VERSION = 2; + static const int HEAD_VERSION = 3; public: uuid_d fsid; entity_inst_t target_osd; __u8 is_failed; epoch_t epoch; + int32_t failed_for; // known to be failed since at least this long MOSDFailure() : PaxosServiceMessage(MSG_OSD_FAILURE, 0, HEAD_VERSION) { } - MOSDFailure(const uuid_d &fs, entity_inst_t f, epoch_t e) + MOSDFailure(const uuid_d &fs, entity_inst_t f, int duration, epoch_t e) : PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION), - fsid(fs), target_osd(f), is_failed(true), epoch(e) { } + fsid(fs), target_osd(f), is_failed(true), epoch(e), failed_for(duration) { } private: ~MOSDFailure() {} @@ -47,10 +48,14 @@ public: ::decode(fsid, p); ::decode(target_osd, p); ::decode(epoch, p); - if (header.version >=2) + if (header.version >= 2) ::decode(is_failed, p); else is_failed = true; + if (header.version >= 3) + ::decode(failed_for, p); + else + failed_for = 0; } void encode_payload(uint64_t features) { paxos_encode(); @@ -58,11 +63,12 @@ public: ::encode(target_osd, payload); ::encode(epoch, payload); ::encode(is_failed, payload); + ::encode(failed_for, payload); } const char *get_type_name() const { return "osd_failure"; } void print(ostream& out) const { - out << "osd_failure(" << target_osd << " e" << epoch << " v" << 
version << ")"; + out << "osd_failure(" << target_osd << " for " << failed_for << " e" << epoch << " v" << version << ")"; } }; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 5add675055892..675665ff8f472 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1957,16 +1957,19 @@ void OSD::heartbeat_check() derr << "heartbeat_check: no reply from osd." << p->first << " ever, first ping sent " << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl; + + // fail + failure_queue[p->first] = p->second.last_tx; } else { if (p->second.last_rx > cutoff) continue; // got recent reply derr << "heartbeat_check: no reply from osd." << p->first << " since " << p->second.last_rx << " (cutoff " << cutoff << ")" << dendl; - } - // fail! - queue_failure(p->first); + // fail + failure_queue[p->first] = p->second.last_rx; + } } } @@ -2459,10 +2462,12 @@ void OSD::send_failures() heartbeat_lock.Lock(); locked = true; } + utime_t now = ceph_clock_now(g_ceph_context); while (!failure_queue.empty()) { - int osd = *failure_queue.begin(); + int osd = failure_queue.begin()->first; + int failed_for = (int)(double)(now - failure_queue.begin()->second); entity_inst_t i = osdmap->get_inst(osd); - monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, osdmap->get_epoch())); + monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for, osdmap->get_epoch())); failure_pending[osd] = i; failure_queue.erase(osd); } @@ -2471,7 +2476,7 @@ void OSD::send_failures() void OSD::send_still_alive(epoch_t epoch, entity_inst_t i) { - MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, epoch); + MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch); m->is_failed = false; monc->send_mon_message(m); } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9cf4c43df70ae..a644cf6a811d9 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -825,13 +825,10 @@ protected: void send_alive(); // -- failures -- - set<int> failure_queue; + map<int,utime_t> failure_queue; map<int,entity_inst_t> failure_pending; - void 
queue_failure(int n) { - failure_queue.insert(n); - } void send_failures(); void send_still_alive(epoch_t epoch, entity_inst_t i); -- 2.39.5