From ca32d575eb2673737198a63643d5d1923151eba3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Aug 2017 10:03:45 -0400 Subject: [PATCH] os/bluestore: allow multiple DeferredBatches in flight at once The current code only allows two DeferredBatches per osr: one that is accumulating new writes and one that is currently in flight to disk. If the previous batch is in flight ot disk, then the currently accumulating one can't also be queued for disk. This can cause problems, notably that described in http://tracker.ceph.com/issues/20295, where one transaction is trying to grab deferred_throttle but cannot due to other in-progress txcs that include deferred IO. The short version is that it cannot queue all of the IO needed, and that later when the IO does complete it is awkward to determine whether other IO queued behind it also needs to be queued immediately. And since it's not, this leads to a deadlock/stall. Simply allowing multiple batches of IO to be in flight at once is a simple fix. Specifically, in queue_transactions(), if throttle_deferred_bytes get_or_fail() fails, we can now deferred_submit_all() and be sure that other IO will be submitted and thus complete and release the throttle that we need to continue. It is possible that the deferred_aggressive behavior could be simplified now that the old restriction is dropped, but that needs a closer review of the code. Fixes: http://tracker.ceph.com/issues/20295 Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 48 +++++++++++++++-------------------- src/os/bluestore/BlueStore.h | 8 +++--- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 3d9b695ffa960..7276b60fa40a9 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -8062,7 +8062,7 @@ void BlueStore::_txc_finish(TransContext *txc) if (submit_deferred) { // we're pinning memory; flush! we could be more fine-grained here but // i'm not sure it's worth the bother. - deferred_try_submit(); + deferred_submit_all(); } if (empty && osr->zombie) { @@ -8125,7 +8125,7 @@ void BlueStore::_osr_drain_all() ++deferred_aggressive; { // submit anything pending - deferred_try_submit(); + deferred_submit_all(); } { // wake up any previously finished deferred events @@ -8535,7 +8535,7 @@ void BlueStore::_kv_finalize_thread() if (!deferred_aggressive) { if (deferred_queue_size >= deferred_batch_ops.load() || throttle_deferred_bytes.past_midpoint()) { - deferred_try_submit(); + deferred_submit_all(); } } @@ -8564,7 +8564,7 @@ void BlueStore::_deferred_queue(TransContext *txc) dout(20) << __func__ << " txc " << txc << " osr " << txc->osr << dendl; deferred_lock.lock(); if (!txc->osr->deferred_pending && - !txc->osr->deferred_running) { + txc->osr->deferred_running.empty()) { deferred_queue.push_back(*txc->osr); } if (!txc->osr->deferred_pending) { @@ -8582,15 +8582,14 @@ void BlueStore::_deferred_queue(TransContext *txc) cct, wt.seq, e.offset, e.length, p); } } - if (deferred_aggressive && - !txc->osr->deferred_running) { + if (deferred_aggressive) { _deferred_submit_unlock(txc->osr.get()); } else { deferred_lock.unlock(); } } -void BlueStore::deferred_try_submit() +void BlueStore::deferred_submit_all() { dout(20) << __func__ << " " << deferred_queue.size() << " osrs, " << deferred_queue_size << " txcs" << dendl; @@ -8601,7 +8600,7 @@ void BlueStore::deferred_try_submit() osrs.push_back(&osr); } for (auto& osr : osrs) { - if (osr->deferred_pending && !osr->deferred_running) { + if (osr->deferred_pending) { _deferred_submit_unlock(osr.get()); deferred_lock.lock(); } @@ -8610,17 +8609,15 @@ void BlueStore::deferred_try_submit() void BlueStore::_deferred_submit_unlock(OpSequencer *osr) { - dout(10) << __func__ << " osr " << osr + auto b = osr->deferred_pending; + dout(10) << __func__ << " osr " << osr << " b " << b << " " << osr->deferred_pending->iomap.size() << " ios pending " << dendl; - assert(osr->deferred_pending); - assert(!osr->deferred_running); - - auto b = osr->deferred_pending; + assert(b); deferred_queue_size -= b->seq_bytes.size(); assert(deferred_queue_size >= 0); - osr->deferred_running = osr->deferred_pending; + osr->deferred_running.push_back(osr->deferred_pending); osr->deferred_pending = nullptr; uint64_t start = 0, pos = 0; @@ -8663,24 +8660,21 @@ void BlueStore::_deferred_submit_unlock(OpSequencer *osr) bdev->aio_submit(&b->ioc); } -void BlueStore::_deferred_aio_finish(OpSequencer *osr) +void BlueStore::_deferred_aio_finish(DeferredBatch *b) { - dout(10) << __func__ << " osr " << osr << dendl; - assert(osr->deferred_running); - DeferredBatch *b = osr->deferred_running; + OpSequencer *osr = b->osr; + dout(10) << __func__ << " osr " << osr << " b " << b << dendl; { std::lock_guard l(deferred_lock); - assert(osr->deferred_running == b); - osr->deferred_running = nullptr; - if (!osr->deferred_pending) { + auto p = std::find(osr->deferred_running.begin(), + osr->deferred_running.end(), + b); + assert(p != osr->deferred_running.end()); + osr->deferred_running.erase(p); + if (osr->deferred_running.empty() && !osr->deferred_pending) { auto q = deferred_queue.iterator_to(*osr); deferred_queue.erase(q); - } else if (deferred_aggressive) { - dout(20) << __func__ << " queuing async deferred_try_submit" << dendl; - finishers[0]->queue(new FunctionContext([&](int) { - deferred_try_submit(); - })); } } @@ -8814,7 +8808,7 @@ int BlueStore::queue_transactions( if (txc->deferred_txn) { // ensure we do not block here because of deferred writes if (!throttle_deferred_bytes.get_or_fail(txc->cost)) { - deferred_try_submit(); + deferred_submit_all(); throttle_deferred_bytes.get(txc->cost); } } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 06a5418a64bf9..c3ddfae449a43 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1629,7 +1629,7 @@ public: bufferlist::const_iterator& p); void aio_finish(BlueStore *store) override { - store->_deferred_aio_finish(osr); + store->_deferred_aio_finish(this); } }; @@ -1647,7 +1647,7 @@ public: boost::intrusive::list_member_hook<> deferred_osr_queue_item; - DeferredBatch *deferred_running = nullptr; + deque deferred_running; DeferredBatch *deferred_pending = nullptr; Sequencer *parent; @@ -2035,9 +2035,9 @@ private: bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, OnodeRef o); void _deferred_queue(TransContext *txc); - void deferred_try_submit(); + void deferred_submit_all(); void _deferred_submit_unlock(OpSequencer *osr); - void _deferred_aio_finish(OpSequencer *osr); + void _deferred_aio_finish(DeferredBatch *b); int _deferred_replay(); public: -- 2.39.5