From d979e48d934407e087d5608d67c9484b692dfd22 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Thu, 5 Aug 2010 09:48:41 -0700 Subject: [PATCH] osd: Adjust failure reporting. When a failure report is sent to the mon, the failed OSD is added to failure_pending. If the OSD later gets a heartbeat from an OSD in failure_pending, it sends an MOSDFailure message (with is_failed set to false) retracting the previous failure report. If an OSD has been queued as failed but its failure report hasn't been sent to the mon yet, it's simply removed from failure_queue. --- src/osd/OSD.cc | 16 ++++++++++++++++ src/osd/OSD.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 0d85a213386..a1848c3d913 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1334,6 +1334,12 @@ void OSD::handle_osd_ping(MOSDPing *m) } heartbeat_from_stamp[from] = g_clock.now(); // don't let _my_ lag interfere. + // remove from failure lists if needed + if (failure_pending.count(from)) { + send_still_alive(from); + failure_pending.erase(from); + } + failure_queue.erase(from); } else { dout(10) << " ignoring " << m->get_source_inst() << dendl; } @@ -1376,6 +1382,7 @@ void OSD::heartbeat_check() << " since " << heartbeat_from_stamp[p->first] << " (cutoff " << grace << ")" << dendl; queue_failure(p->first); + } } } @@ -1620,9 +1627,18 @@ void OSD::send_failures() int osd = *failure_queue.begin(); monc->send_mon_message(new MOSDFailure(monc->get_fsid(), osdmap->get_inst(osd), osdmap->get_epoch())); failure_queue.erase(osd); + failure_pending.insert(osd); } } +void OSD::send_still_alive(int osd) +{ + MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osdmap->get_inst(osd), + osdmap->get_epoch()); + m->is_failed = false; + monc->send_mon_message(m); +} + void OSD::send_pg_stats() { assert(osd_lock.is_locked()); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 25af40500ff..0c6a73d0559 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -546,10 +546,12 @@ protected: set failure_queue; set failure_pending; + void queue_failure(int n) { 
failure_queue.insert(n); } void send_failures(); + void send_still_alive(int osd); // -- pg stats -- Mutex pg_stat_queue_lock; -- 2.47.3