os/bluestore: fix deferred_aio deadlock 16051/head
author Sage Weil <sage@redhat.com>
Sat, 1 Jul 2017 14:56:58 +0000 (10:56 -0400)
committer Sage Weil <sage@redhat.com>
Sat, 1 Jul 2017 14:56:58 +0000 (10:56 -0400)
We submit aios while holding deferred_lock, and we take deferred_lock in
the aio completion handler (of which there is only one).  That means that
if the aio queue fills up, the submitter blocks while holding the lock,
and the finisher cannot take the lock, make progress, and drain completed
aios, leading to a deadlock.
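
A minimal sketch of the problematic pattern (hypothetical, simplified names;
not the actual BlueStore code) shows how the two paths contend:

#include <mutex>

struct DeadlockSketch {
  std::mutex deferred_lock;

  void submit_deferred() {
    std::lock_guard<std::mutex> l(deferred_lock);
    blocking_aio_submit();   // blocks while the aio queue is full ...
  }

  void aio_completion_thread() {
    std::lock_guard<std::mutex> l(deferred_lock);  // ... but this lock is held
    reap_completed_aios();                         // by the blocked submitter
  }

  // stand-ins for bdev->aio_submit() and the aio completion handling
  void blocking_aio_submit() {}
  void reap_completed_aios() {}
};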

Fix this by submitting aios *without* deferred_lock held.  Instead, demote
to a deferred_submit_lock.  Further, in the aio completion handler, submit
aios asynchronously via a Finisher (this only happens in deferred_aggressive
mode, which is rare and not performance-sensitive).
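
The resulting submit path, sketched below with hypothetical names
(DemoteSketch, blocking_aio_submit), mirrors the _deferred_submit_unlock()
change in the diff: assemble the batch under deferred_lock, then take
deferred_submit_lock and drop deferred_lock before the potentially blocking
aio submission, so the completion handler can always make progress:

#include <mutex>

struct DemoteSketch {
  std::mutex deferred_lock, deferred_submit_lock;

  // entered with deferred_lock held, as _deferred_submit_unlock() is
  void deferred_submit_unlock() {
    // ... batch assembly happens here, still under deferred_lock ...

    // demote to deferred_submit_lock, then drop deferred_lock
    std::lock_guard<std::mutex> l(deferred_submit_lock);
    deferred_lock.unlock();
    blocking_aio_submit();   // may block on a full aio queue, but safely now
  }

  void blocking_aio_submit() {}   // stand-in for bdev->aio_submit(&b->ioc)
};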

Signed-off-by: Sage Weil <sage@redhat.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index aa3d051b9299b863d648cc9752097fb8eb28d1cb..bef5a3311344445093eec59d1bdcd961a03e3261 100644 (file)
@@ -8020,9 +8020,11 @@ void BlueStore::_osr_drain_preceding(TransContext *txc)
   ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag?
   {
     // submit anything pending
-    std::lock_guard<std::mutex> l(deferred_lock);
+    deferred_lock.lock();
     if (osr->deferred_pending) {
-      _deferred_submit(osr);
+      _deferred_submit_unlock(osr);
+    } else {
+      deferred_lock.unlock();
     }
   }
   {
@@ -8049,8 +8051,7 @@ void BlueStore::_osr_drain_all()
   ++deferred_aggressive;
   {
     // submit anything pending
-    std::lock_guard<std::mutex> l(deferred_lock);
-    _deferred_try_submit();
+    deferred_try_submit();
   }
   {
     // wake up any previously finished deferred events
@@ -8455,10 +8456,9 @@ void BlueStore::_kv_finalize_thread()
       deferred_stable.clear();
 
       if (!deferred_aggressive) {
-       std::lock_guard<std::mutex> l(deferred_lock);
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle_deferred_bytes.past_midpoint()) {
-         _deferred_try_submit();
+         deferred_try_submit();
        }
       }
 
@@ -8485,7 +8485,7 @@ bluestore_deferred_op_t *BlueStore::_get_deferred_op(
 void BlueStore::_deferred_queue(TransContext *txc)
 {
   dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
-  std::lock_guard<std::mutex> l(deferred_lock);
+  deferred_lock.lock();
   if (!txc->osr->deferred_pending &&
       !txc->osr->deferred_running) {
     deferred_queue.push_back(*txc->osr);
@@ -8507,22 +8507,31 @@ void BlueStore::_deferred_queue(TransContext *txc)
   }
   if (deferred_aggressive &&
       !txc->osr->deferred_running) {
-    _deferred_submit(txc->osr.get());
+    _deferred_submit_unlock(txc->osr.get());
+  } else {
+    deferred_lock.unlock();
   }
 }
 
-void BlueStore::_deferred_try_submit()
+void BlueStore::deferred_try_submit()
 {
   dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
+  std::lock_guard<std::mutex> l(deferred_lock);
+  vector<OpSequencerRef> osrs;
+  osrs.reserve(deferred_queue.size());
   for (auto& osr : deferred_queue) {
-    if (!osr.deferred_running) {
-      _deferred_submit(&osr);
+    osrs.push_back(&osr);
+  }
+  for (auto& osr : osrs) {
+    if (!osr->deferred_running) {
+      _deferred_submit_unlock(osr.get());
+      deferred_lock.lock();
     }
   }
 }
 
-void BlueStore::_deferred_submit(OpSequencer *osr)
+void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
 {
   dout(10) << __func__ << " osr " << osr
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
@@ -8570,6 +8579,10 @@ void BlueStore::_deferred_submit(OpSequencer *osr)
     bl.claim_append(i->second.bl);
     ++i;
   }
+
+  // demote to deferred_submit_lock, then drop that too
+  std::lock_guard<std::mutex> l(deferred_submit_lock);
+  deferred_lock.unlock();
   bdev->aio_submit(&b->ioc);
 }
 
@@ -8587,7 +8600,10 @@ void BlueStore::_deferred_aio_finish(OpSequencer *osr)
       auto q = deferred_queue.iterator_to(*osr);
       deferred_queue.erase(q);
     } else if (deferred_aggressive) {
-      _deferred_submit(osr);
+      dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
+      finishers[0]->queue(new FunctionContext([&](int) {
+           deferred_try_submit();
+         }));
     }
   }
 
index 1e2543ee4aaff9134f28a01703dff5b336c4dcbc..bb85b0514494ecd768b1d4bb8134be8806afe529 100644 (file)
@@ -1831,7 +1831,7 @@ private:
   interval_set<uint64_t> bluefs_extents;  ///< block extents owned by bluefs
   interval_set<uint64_t> bluefs_extents_reclaiming; ///< currently reclaiming
 
-  std::mutex deferred_lock;
+  std::mutex deferred_lock, deferred_submit_lock;
   std::atomic<uint64_t> deferred_seq = {0};
   deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
   int deferred_queue_size = 0;         ///< num txc's queued across all osrs
@@ -2024,12 +2024,8 @@ private:
 
   bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
   void _deferred_queue(TransContext *txc);
-  void deferred_try_submit() {
-    std::lock_guard<std::mutex> l(deferred_lock);
-    _deferred_try_submit();
-  }
-  void _deferred_try_submit();
-  void _deferred_submit(OpSequencer *osr);
+  void deferred_try_submit();
+  void _deferred_submit_unlock(OpSequencer *osr);
   void _deferred_aio_finish(OpSequencer *osr);
   int _deferred_replay();