From a4b90122686ff4d31bc07bfda93e629d6d64e5a4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@redhat.com>
Date: Thu, 9 Mar 2017 14:17:47 -0500
Subject: [PATCH] os/bluestore: batch up to bluestore_deferred_batch_ops
 before submitting

Allow several deferred writes to accumulate before we submit them.  In
general we have no time pressure, and on HDD (and perhaps sometimes SSD)
it is beneficial to accumulate and batch these so that they result in
fewer seeks.  On HDD this is particularly true of seeks away from the
journal, and on sequential workloads it can avoid seeks entirely.  It
may even allow the block layer or SSD firmware to merge IOs and perform
fewer writes.

Signed-off-by: Sage Weil <sage@redhat.com>
---
 src/common/config_opts.h      |  1 +
 src/os/bluestore/BlueStore.cc | 37 ++++++++++++++++++++++++++---------
 src/os/bluestore/BlueStore.h  |  5 +++--
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 27f8be5bd51..f29940b8a79 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -1108,6 +1108,7 @@ OPTION(bluestore_max_ops, OPT_U64, 512)
 OPTION(bluestore_max_bytes, OPT_U64, 64*1024*1024)
 OPTION(bluestore_deferred_max_ops, OPT_U64, 512)
 OPTION(bluestore_deferred_max_bytes, OPT_U64, 128*1024*1024)
+OPTION(bluestore_deferred_batch_ops, OPT_U64, 8)
 OPTION(bluestore_nid_prealloc, OPT_INT, 1024)
 OPTION(bluestore_blobid_prealloc, OPT_U64, 10240)
 OPTION(bluestore_clone_cow, OPT_BOOL, true)  // do copy-on-write for clones
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index c47894fb65f..1fa2ef1d1ed 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -7441,7 +7441,12 @@ void BlueStore::_osr_drain_all()
     s = osr_set;
   }
 
-  deferred_aggressive_cleanup = true;
+  deferred_aggressive = true;
+  {
+    // submit anything pending
+    std::lock_guard<std::mutex> l(deferred_lock);
+    _deferred_try_submit();
+  }
   {
     // wake up any previously finished deferred events
     std::lock_guard<std::mutex> l(kv_lock);
@@ -7451,7 +7456,7 @@ void BlueStore::_osr_drain_all()
     dout(20) << __func__ << " drain " << osr << dendl;
     osr->drain();
   }
-  deferred_aggressive_cleanup = false;
+  deferred_aggressive = false;
   dout(10) << __func__ << " done" << dendl;
 }
 
@@ -7609,6 +7614,13 @@ void BlueStore::_kv_sync_thread()
         deferred_cleaning.pop_front();
       }
 
+      if (!deferred_aggressive) {
+        std::lock_guard<std::mutex> l(deferred_lock);
+        if (deferred_queue_size >= (int)g_conf->bluestore_deferred_batch_ops) {
+          _deferred_try_submit();
+        }
+      }
+
       // this is as good a place as any ...
       _reap_collections();
 
@@ -7652,14 +7664,17 @@ void BlueStore::_deferred_queue(TransContext *txc)
     deferred_queue.push_back(*txc->osr);
   }
   txc->osr->deferred_pending.push_back(*txc);
-  if (txc->osr->deferred_running.empty()) {
+  ++deferred_queue_size;
+  if (deferred_aggressive &&
+      txc->osr->deferred_running.empty()) {
     _deferred_try_submit(txc->osr.get());
   }
 }
 
 void BlueStore::_deferred_try_submit()
 {
-  dout(20) << __func__ << " " << deferred_queue.size() << " osrs" << dendl;
+  dout(20) << __func__ << " " << deferred_queue.size() << " osrs, "
+           << deferred_queue_size << " txcs" << dendl;
   for (auto& osr : deferred_queue) {
     if (osr.deferred_running.empty()) {
       _deferred_try_submit(&osr);
@@ -7671,8 +7686,12 @@ void BlueStore::_deferred_try_submit(OpSequencer *osr)
 {
   dout(10) << __func__ << " osr " << osr << " " << osr->deferred_pending.size()
            << " pending " << dendl;
+  assert(!osr->deferred_pending.empty());
   assert(osr->deferred_running.empty());
-  osr->deferred_pending.swap(osr->deferred_running);
+
+  deferred_queue_size -= osr->deferred_pending.size();
+  assert(deferred_queue_size >= 0);
+  osr->deferred_running.swap(osr->deferred_pending);
 
   // attach all IO to the last in the batch
   TransContext *last = &osr->deferred_running.back();
@@ -7729,11 +7748,11 @@ int BlueStore::_deferred_finish(TransContext *txc)
     assert(txc->osr->deferred_txc == txc);
     txc->osr->deferred_blocks.clear();
     finished.swap(txc->osr->deferred_running);
-    if (!txc->osr->deferred_pending.empty()) {
-      _deferred_try_submit(txc->osr.get());
-    } else {
+    if (txc->osr->deferred_pending.empty()) {
       auto q = deferred_queue.iterator_to(*txc->osr);
       deferred_queue.erase(q);
+    } else if (deferred_aggressive) {
+      _deferred_try_submit(txc->osr.get());
     }
   }
 
@@ -7751,7 +7770,7 @@ int BlueStore::_deferred_finish(TransContext *txc)
 
   // in the normal case, do not bother waking up the kv thread; it will
   // catch us on the next commit anyway.
-  if (deferred_aggressive_cleanup) {
+  if (deferred_aggressive) {
     kv_cond.notify_one();
   }
   return 0;
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 3f316082e0b..8790e04bfc9 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1684,8 +1684,9 @@ private:
 
   std::mutex deferred_lock;
   std::atomic<uint64_t> deferred_seq = {0};
-  deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending
-  bool deferred_aggressive_cleanup = false; ///< aggressive wakeup of kv thread
+  deferred_osr_queue_t deferred_queue;      ///< osr's with deferred io pending
+  int deferred_queue_size = 0;              ///< num txc's queued across all osrs
+  bool deferred_aggressive = false;         ///< aggressive wakeup of kv thread
 
   int m_finisher_num = 1;
   vector<Finisher*> finishers;
-- 
2.39.5
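
For illustration, here is a minimal standalone sketch of the batching policy
this patch implements. The types (DeferredBatcher, plain int transactions)
are hypothetical stand-ins, not BlueStore's actual TransContext/OpSequencer
machinery; it only shows the accumulate-then-flush behavior under the
default batch size of 8.

// Sketch: deferred writes accumulate in a queue and are flushed either
// when enough have piled up to form one seek-friendly batch, or
// immediately when a drain forces "aggressive" submission.
#include <cstdio>
#include <mutex>
#include <vector>

struct DeferredBatcher {
  std::mutex lock;            // stands in for deferred_lock
  std::vector<int> pending;   // stands in for per-osr deferred_pending
  bool aggressive = false;    // stands in for deferred_aggressive
  const size_t batch_ops;     // stands in for bluestore_deferred_batch_ops

  explicit DeferredBatcher(size_t n) : batch_ops(n) {}

  // like _deferred_queue(): just accumulate; submit at once only in
  // aggressive mode
  void queue(int txc) {
    std::lock_guard<std::mutex> l(lock);
    pending.push_back(txc);
    if (aggressive)
      try_submit_locked();
  }

  // called periodically (the kv sync thread in the patch): submit only
  // once enough ops have accumulated to make one large batch
  void maybe_submit() {
    std::lock_guard<std::mutex> l(lock);
    if (pending.size() >= batch_ops)
      try_submit_locked();
  }

  // like _osr_drain_all(): force out whatever is pending, regardless of
  // batch size
  void drain() {
    std::lock_guard<std::mutex> l(lock);
    aggressive = true;
    try_submit_locked();
    aggressive = false;
  }

private:
  void try_submit_locked() {
    if (pending.empty())
      return;
    printf("submitting batch of %zu deferred writes\n", pending.size());
    pending.clear();  // the real code moves these to deferred_running
  }
};

int main() {
  DeferredBatcher b(8);    // default bluestore_deferred_batch_ops
  for (int i = 0; i < 20; i++) {
    b.queue(i);
    b.maybe_submit();      // batches of 8 go out; the remainder waits
  }
  b.drain();               // flush the remaining 4
}

The design point mirrors the patch: queueing is cheap and normally never
triggers IO by itself; the periodic maybe_submit() plays the role of the kv
sync thread's threshold check, and drain() mirrors the aggressive flush in
_osr_drain_all().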