git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Adjust failure reporting.
author Greg Farnum <gregf@hq.newdream.net>
Thu, 5 Aug 2010 16:48:41 +0000 (09:48 -0700)
committer Greg Farnum <gregf@hq.newdream.net>
Thu, 5 Aug 2010 18:09:36 +0000 (11:09 -0700)
When a failure report is sent to the mon, the failed OSD is added to
failure_pending. If the OSD then gets a heartbeat from an OSD in failure_pending,
it sends an MOSDFailure message (with is_failed set to false) retracting the
previous failure report and removes that OSD from failure_pending.
If an OSD is queued in failure_queue but its report hasn't been sent yet, it's
simply removed from the queue.

src/osd/OSD.cc
src/osd/OSD.h

index 0d85a213386531a03bb91e0438f842b038d486c6..a1848c3d913525beffb1ca83e80385f458a9b8f2 100644 (file)
@@ -1334,6 +1334,12 @@ void OSD::handle_osd_ping(MOSDPing *m)
     }
 
     heartbeat_from_stamp[from] = g_clock.now();  // don't let _my_ lag interfere.
+    // remove from failure lists if needed
+    if (failure_pending.count(from)) {
+      send_still_alive(from);
+      failure_pending.erase(from);
+    }
+    failure_queue.erase(from);
   } else {
     dout(10) << " ignoring " << m->get_source_inst() << dendl;
   }
@@ -1376,6 +1382,7 @@ void OSD::heartbeat_check()
              << " since " << heartbeat_from_stamp[p->first]
              << " (cutoff " << grace << ")" << dendl;
       queue_failure(p->first);
+
     }
   }
 }
@@ -1620,9 +1627,18 @@ void OSD::send_failures()
     int osd = *failure_queue.begin();
     monc->send_mon_message(new MOSDFailure(monc->get_fsid(), osdmap->get_inst(osd), osdmap->get_epoch()));
     failure_queue.erase(osd);
+    failure_pending.insert(osd);
   }
 }
 
+void OSD::send_still_alive(int osd)
+{
+  MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osdmap->get_inst(osd),
+                                  osdmap->get_epoch());
+  m->is_failed = false;
+  monc->send_mon_message(m);
+}
+
 void OSD::send_pg_stats()
 {
   assert(osd_lock.is_locked());
index 25af40500ff05d378b2a8fe6da1b80bb2f761df1..0c6a73d055958afb9b03b0b24888a570839a3de2 100644 (file)
@@ -546,10 +546,12 @@ protected:
   set<int> failure_queue;
   set<int> failure_pending;
 
+
   void queue_failure(int n) {
     failure_queue.insert(n);
   }
   void send_failures();
+  void send_still_alive(int osd);
 
   // -- pg stats --
   Mutex pg_stat_queue_lock;