git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: resend pending failure reports with a new mon session
author Sage Weil <sage@redhat.com>
Fri, 18 Sep 2015 01:48:30 +0000 (21:48 -0400)
committer Sage Weil <sage@redhat.com>
Mon, 23 Nov 2015 13:36:14 +0000 (08:36 -0500)
Signed-off-by: Sage Weil <sage@redhat.com>
src/osd/OSD.cc
src/osd/OSD.h

index 8642179ef0383e71d1a7ef04fdaa8e6665d7fab2..b528914e39d72b4e071e521a2f6422014312356d 100644 (file)
@@ -3776,7 +3776,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
        }
        if (failure_pending.count(from)) {
          dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << from<< dendl;
-         send_still_alive(curmap->get_epoch(), failure_pending[from]);
+         send_still_alive(curmap->get_epoch(), failure_pending[from].second);
          failure_pending.erase(from);
        }
       }
@@ -4381,6 +4381,7 @@ void OSD::ms_handle_connect(Connection *con)
       // resend everything, it's a new session
       send_alive();
       service.send_pg_temp();
+      requeue_failures();
       send_failures();
       send_pg_stats(now);
 
@@ -4719,6 +4720,22 @@ void OSD::got_full_map(epoch_t e)
   }
 }
 
+void OSD::requeue_failures()  // Re-add every failure_pending entry to failure_queue.
+{  // failure_pending itself is not modified in this function.
+  assert(osd_lock.is_locked());  // caller must already hold osd_lock
+  Mutex::Locker l(heartbeat_lock);  // presumably a scoped lock held until return -- confirm
+  unsigned old_queue = failure_queue.size();  // queue size before requeue; only used for logging
+  unsigned old_pending = failure_pending.size();  // pending count; only used for logging
+  for (map<int,pair<utime_t,entity_inst_t> >::iterator p =  // walk all pending reports
+        failure_pending.begin();
+       p != failure_pending.end();
+       ++p) {
+    failure_queue[p->first] = p->second.first;  // .first is the utime_t send_failures() saved from failure_queue; replaces any existing queue entry for this key
+  }
+  dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "  // "<old queue> + <old pending> -> <new queue size>"
+          << failure_queue.size() << dendl;
+}
+
 void OSD::send_failures()
 {
   assert(osd_lock.is_locked());
@@ -4730,7 +4747,7 @@ void OSD::send_failures()
     entity_inst_t i = osdmap->get_inst(osd);
     monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
                                           osdmap->get_epoch()));
-    failure_pending[osd] = i;
+    failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
     failure_queue.erase(osd);
   }
 }
index 8aef344305edfba2404fcb1b6c426aa727fafa2c..7ca705a85f71767a8a5fa625b38fa580af310407 100644 (file)
@@ -2001,9 +2001,9 @@ protected:
 
   // -- failures --
   map<int,utime_t> failure_queue;
-  map<int,entity_inst_t> failure_pending;
-
+  map<int,pair<utime_t,entity_inst_t> > failure_pending;
 
+  void requeue_failures();
   void send_failures();
   void send_still_alive(epoch_t epoch, const entity_inst_t &i);