From: Greg Farnum Date: Tue, 28 Oct 2014 23:45:43 +0000 (-0700) Subject: SimpleMessenger: Pipe: do not block on takeover while holding global lock X-Git-Tag: v0.90~23^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2d6980570af2226fdee0edfcfe5a8e7f60fae615;p=ceph.git SimpleMessenger: Pipe: do not block on takeover while holding global lock We previously were able to cause deadlocks: 1) Existing pipe is fast_dispatching 2) Replacement incoming pipe is accepted *) blocks on stop_and_wait() of existing Pipe 3) External things are blocked on SimpleMessenger::lock() while blocking completion of the fast dispatch. To resolve this, if we detect that an existing Pipe we want to take over is in the process of fast dispatching, we unlock our locks and wait on it to finish. Then we go back to the lookup step and retry. The effect of this should be safe: 1) We are not making any changes to the existing Pipe in new ways 2) We have not registered the new Pipe anywhere 3) We have not sent back any replies based on Messenger state to the remote endpoint. Backport: giant Signed-off-by: Greg Farnum --- diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc index efd5e4d1ed26..4acabae1e7c6 100644 --- a/src/msg/simple/Pipe.cc +++ b/src/msg/simple/Pipe.cc @@ -87,7 +87,7 @@ Pipe::Pipe(SimpleMessenger *r, int st, PipeConnection *con) state(st), connection_state(NULL), reader_running(false), reader_needs_join(false), - reader_dispatching(false), + reader_dispatching(false), notify_on_dispatch_done(false), writer_running(false), in_q(&(r->dispatch_queue)), send_keepalive(false), @@ -461,6 +461,7 @@ int Pipe::accept() ldout(msgr->cct,10) << "accept: setting up session_security." << dendl; + retry_existing_lookup: msgr->lock.Lock(); pipe_lock.Lock(); if (msgr->dispatch_queue.stop) @@ -472,6 +473,21 @@ int Pipe::accept() existing = msgr->_lookup_pipe(peer_addr); if (existing) { existing->pipe_lock.Lock(true); // skip lockdep check (we are locking a second Pipe here) + if (existing->reader_dispatching) { + /** we need to wait, or we can deadlock if downstream + * fast_dispatchers are (naughtily!) waiting on resources + * held by somebody trying to make use of the SimpleMessenger lock. + * So drop locks, wait, and retry. It just looks like a slow network + * to everybody else. + */ + pipe_lock.Unlock(); + msgr->lock.Unlock(); + existing->notify_on_dispatch_done = true; + while (existing->reader_dispatching) + existing->cond.Wait(existing->pipe_lock); + existing->pipe_lock.Unlock(); + goto retry_existing_lookup; + } if (connect.global_seq < existing->peer_global_seq) { ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq @@ -1600,8 +1616,11 @@ void Pipe::reader() in_q->fast_dispatch(m); pipe_lock.Lock(); reader_dispatching = false; - if (state == STATE_CLOSED) // there might be somebody waiting + if (state == STATE_CLOSED || + notify_on_dispatch_done) { // there might be somebody waiting + notify_on_dispatch_done = false; cond.Signal(); + } } else { in_q->enqueue(m, m->get_priority(), conn_id); } diff --git a/src/msg/simple/Pipe.h b/src/msg/simple/Pipe.h index 3f33d31b7da2..59d7136f7956 100644 --- a/src/msg/simple/Pipe.h +++ b/src/msg/simple/Pipe.h @@ -197,6 +197,7 @@ class DispatchQueue; bool reader_running, reader_needs_join; bool reader_dispatching; /// reader thread is dispatching without pipe_lock + bool notify_on_dispatch_done; /// something wants a signal when dispatch done bool writer_running; map > out_q; // priority queue for outbound msgs