]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
SimpleMessenger: Pipe: do not block on takeover while holding global lock
authorGreg Farnum <gfarnum@redhat.com>
Tue, 28 Oct 2014 23:45:43 +0000 (16:45 -0700)
committerSage Weil <sage@redhat.com>
Mon, 12 Jan 2015 19:17:06 +0000 (11:17 -0800)
We previously were able to cause deadlocks:
1) Existing pipe is fast_dispatching
2) Replacement incoming pipe is accepted
  *) blocks on stop_and_wait() of existing Pipe
3) External things are blocked on SimpleMessenger::lock() while
  blocking completion of the fast dispatch.

To resolve this, if we detect that an existing Pipe we want to take over is
in the process of fast dispatching, we unlock our locks and wait on it to
finish. Then we go back to the lookup step and retry.

The effect of this should be safe:
1) We are not making any changes to the existing Pipe in new ways
2) We have not registered the new Pipe anywhere
3) We have not sent back any replies based on Messenger state to
   the remote endpoint.

Backport: giant
Fixes: #9921
Signed-off-by: Greg Farnum <gfarnum@redhat.com>
(cherry picked from commit 2d6980570af2226fdee0edfcfe5a8e7f60fae615)

src/msg/Pipe.cc
src/msg/Pipe.h

index 9dc5c349dbd9b8a0e12afc15a63172ef15135311..e73e2b1c8326f9094bc279c938e8422c5e1f5130 100644 (file)
@@ -85,7 +85,7 @@ Pipe::Pipe(SimpleMessenger *r, int st, PipeConnection *con)
     state(st),
     connection_state(NULL),
     reader_running(false), reader_needs_join(false),
-    reader_dispatching(false),
+    reader_dispatching(false), notify_on_dispatch_done(false),
     writer_running(false),
     in_q(&(r->dispatch_queue)),
     send_keepalive(false),
@@ -453,6 +453,7 @@ int Pipe::accept()
 
     ldout(msgr->cct,10) << "accept:  setting up session_security." << dendl;
 
+  retry_existing_lookup:
     msgr->lock.Lock();
     pipe_lock.Lock();
     if (msgr->dispatch_queue.stop)
@@ -464,6 +465,21 @@ int Pipe::accept()
     existing = msgr->_lookup_pipe(peer_addr);
     if (existing) {
       existing->pipe_lock.Lock(true);  // skip lockdep check (we are locking a second Pipe here)
+      if (existing->reader_dispatching) {
+       /** we need to wait, or we can deadlock if downstream
+        *  fast_dispatchers are (naughtily!) waiting on resources
+        *  held by somebody trying to make use of the SimpleMessenger lock.
+        *  So drop locks, wait, and retry. It just looks like a slow network
+        *  to everybody else.
+        */
+       pipe_lock.Unlock();
+       msgr->lock.Unlock();
+       existing->notify_on_dispatch_done = true;
+       while (existing->reader_dispatching)
+         existing->cond.Wait(existing->pipe_lock);
+       existing->pipe_lock.Unlock();
+       goto retry_existing_lookup;
+      }
 
       if (connect.global_seq < existing->peer_global_seq) {
        ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq
@@ -1591,8 +1607,11 @@ void Pipe::reader()
           in_q->fast_dispatch(m);
           pipe_lock.Lock();
          reader_dispatching = false;
-         if (state == STATE_CLOSED) // there might be somebody waiting
+         if (state == STATE_CLOSED ||
+             notify_on_dispatch_done) { // there might be somebody waiting
+           notify_on_dispatch_done = false;
            cond.Signal();
+         }
         } else {
           in_q->enqueue(m, m->get_priority(), conn_id);
         }
index 5496b5ce47d657190bea5bb4652bb68547b70696..347140359ab24e65bd05f14b41ff6849f11867b8 100644 (file)
@@ -188,6 +188,7 @@ class DispatchQueue;
 
     bool reader_running, reader_needs_join;
     bool reader_dispatching; /// reader thread is dispatching without pipe_lock
+    bool notify_on_dispatch_done; /// something wants a signal when dispatch done
     bool writer_running;
 
     map<int, list<Message*> > out_q;  // priority queue for outbound msgs