]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: allow multiple DeferredBatches in flight at once
authorSage Weil <sage@redhat.com>
Thu, 3 Aug 2017 14:03:45 +0000 (10:03 -0400)
committerSage Weil <sage@redhat.com>
Fri, 4 Aug 2017 02:54:18 +0000 (22:54 -0400)
The current code only allows two DeferredBatches per osr: one that is
accumulating new writes and one that is currently in flight to disk.  If
the previous batch is in flight ot disk, then the currently accumulating
one can't also be queued for disk.

This can cause problems, notably that described in
http://tracker.ceph.com/issues/20295, where one transaction is trying to
grab deferred_throttle but cannot due to other in-progress txcs that
include deferred IO.  The short version is that it cannot queue all of the
IO needed, and that later when the IO does complete it is awkward to
determine whether other IO queued behind it also needs to be queued
immediately.  And since it's not, this leads to a deadlock/stall.

Simply allowing multiple batches of IO to be in flight at once is a simple
fix.  Specifically, in queue_transactions(), if throttle_deferred_bytes
get_or_fail() fails, we can now deferred_submit_all() and be sure that
other IO will be submitted and thus complete and release the throttle that
we need to continue.

It is possible that the deferred_aggressive behavior could be simplified
now that the old restriction is dropped, but that needs a closer review
of the code.

Fixes: http://tracker.ceph.com/issues/20295
Signed-off-by: Sage Weil <sage@redhat.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 3d9b695ffa960a56c29b663569534dfed24a62a4..7276b60fa40a9754e6d731da0579d553b14e27e2 100644 (file)
@@ -8062,7 +8062,7 @@ void BlueStore::_txc_finish(TransContext *txc)
   if (submit_deferred) {
     // we're pinning memory; flush!  we could be more fine-grained here but
     // i'm not sure it's worth the bother.
-    deferred_try_submit();
+    deferred_submit_all();
   }
 
   if (empty && osr->zombie) {
@@ -8125,7 +8125,7 @@ void BlueStore::_osr_drain_all()
   ++deferred_aggressive;
   {
     // submit anything pending
-    deferred_try_submit();
+    deferred_submit_all();
   }
   {
     // wake up any previously finished deferred events
@@ -8535,7 +8535,7 @@ void BlueStore::_kv_finalize_thread()
       if (!deferred_aggressive) {
        if (deferred_queue_size >= deferred_batch_ops.load() ||
            throttle_deferred_bytes.past_midpoint()) {
-         deferred_try_submit();
+         deferred_submit_all();
        }
       }
 
@@ -8564,7 +8564,7 @@ void BlueStore::_deferred_queue(TransContext *txc)
   dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl;
   deferred_lock.lock();
   if (!txc->osr->deferred_pending &&
-      !txc->osr->deferred_running) {
+      txc->osr->deferred_running.empty()) {
     deferred_queue.push_back(*txc->osr);
   }
   if (!txc->osr->deferred_pending) {
@@ -8582,15 +8582,14 @@ void BlueStore::_deferred_queue(TransContext *txc)
        cct, wt.seq, e.offset, e.length, p);
     }
   }
-  if (deferred_aggressive &&
-      !txc->osr->deferred_running) {
+  if (deferred_aggressive) {
     _deferred_submit_unlock(txc->osr.get());
   } else {
     deferred_lock.unlock();
   }
 }
 
-void BlueStore::deferred_try_submit()
+void BlueStore::deferred_submit_all()
 {
   dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
           << deferred_queue_size << " txcs" << dendl;
@@ -8601,7 +8600,7 @@ void BlueStore::deferred_try_submit()
     osrs.push_back(&osr);
   }
   for (auto& osr : osrs) {
-    if (osr->deferred_pending && !osr->deferred_running) {
+    if (osr->deferred_pending) {
       _deferred_submit_unlock(osr.get());
       deferred_lock.lock();
     }
@@ -8610,17 +8609,15 @@ void BlueStore::deferred_try_submit()
 
 void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
 {
-  dout(10) << __func__ << " osr " << osr
+  auto b = osr->deferred_pending;
+  dout(10) << __func__ << " osr " << osr << " b " << b
           << " " << osr->deferred_pending->iomap.size() << " ios pending "
           << dendl;
-  assert(osr->deferred_pending);
-  assert(!osr->deferred_running);
-
-  auto b = osr->deferred_pending;
+  assert(b);
   deferred_queue_size -= b->seq_bytes.size();
   assert(deferred_queue_size >= 0);
 
-  osr->deferred_running = osr->deferred_pending;
+  osr->deferred_running.push_back(osr->deferred_pending);
   osr->deferred_pending = nullptr;
 
   uint64_t start = 0, pos = 0;
@@ -8663,24 +8660,21 @@ void BlueStore::_deferred_submit_unlock(OpSequencer *osr)
   bdev->aio_submit(&b->ioc);
 }
 
-void BlueStore::_deferred_aio_finish(OpSequencer *osr)
+void BlueStore::_deferred_aio_finish(DeferredBatch *b)
 {
-  dout(10) << __func__ << " osr " << osr << dendl;
-  assert(osr->deferred_running);
-  DeferredBatch *b = osr->deferred_running;
+  OpSequencer *osr = b->osr;
+  dout(10) << __func__ << " osr " << osr << " b " << b << dendl;
 
   {
     std::lock_guard<std::mutex> l(deferred_lock);
-    assert(osr->deferred_running == b);
-    osr->deferred_running = nullptr;
-    if (!osr->deferred_pending) {
+    auto p = std::find(osr->deferred_running.begin(),
+                      osr->deferred_running.end(),
+                      b);
+    assert(p != osr->deferred_running.end());
+    osr->deferred_running.erase(p);
+    if (osr->deferred_running.empty() && !osr->deferred_pending) {
       auto q = deferred_queue.iterator_to(*osr);
       deferred_queue.erase(q);
-    } else if (deferred_aggressive) {
-      dout(20) << __func__ << " queuing async deferred_try_submit" << dendl;
-      finishers[0]->queue(new FunctionContext([&](int) {
-           deferred_try_submit();
-         }));
     }
   }
 
@@ -8814,7 +8808,7 @@ int BlueStore::queue_transactions(
   if (txc->deferred_txn) {
     // ensure we do not block here because of deferred writes
     if (!throttle_deferred_bytes.get_or_fail(txc->cost)) {
-      deferred_try_submit();
+      deferred_submit_all();
       throttle_deferred_bytes.get(txc->cost);
     }
   }
index 06a5418a64bf9e835765802db9f7ddeaa4de2952..c3ddfae449a43cc9b67978a3c8a85597a85d83f3 100644 (file)
@@ -1629,7 +1629,7 @@ public:
                       bufferlist::const_iterator& p);
 
     void aio_finish(BlueStore *store) override {
-      store->_deferred_aio_finish(osr);
+      store->_deferred_aio_finish(this);
     }
   };
 
@@ -1647,7 +1647,7 @@ public:
 
     boost::intrusive::list_member_hook<> deferred_osr_queue_item;
 
-    DeferredBatch *deferred_running = nullptr;
+    deque<DeferredBatch*> deferred_running;
     DeferredBatch *deferred_pending = nullptr;
 
     Sequencer *parent;
@@ -2035,9 +2035,9 @@ private:
 
   bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o);
   void _deferred_queue(TransContext *txc);
-  void deferred_try_submit();
+  void deferred_submit_all();
   void _deferred_submit_unlock(OpSequencer *osr);
-  void _deferred_aio_finish(OpSequencer *osr);
+  void _deferred_aio_finish(DeferredBatch *b);
   int _deferred_replay();
 
 public: