From: Sage Weil Date: Thu, 9 Mar 2017 22:28:58 +0000 (-0500) Subject: os/bluestore: avoid extra dev flush on single device when all io is deferred X-Git-Tag: v12.0.1~12^2~17 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1fefeeb39ef9f81a63b0e9643fc4c3d37cea86ba;p=ceph-ci.git os/bluestore: avoid extra dev flush on single device when all io is deferred If we have no non-deferred IO to flush, and we are running bluefs on a single shared device, then we can rely on the bluefs flush to make our current batch of deferred ios stable. Separate deferred into a "done" and "stable" list. If we do sync, put everything from "done" onto "stable". Otherwise, after we do our kv commit via bluefs, move "done" to "stable" then. Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index e0dab54ab7b..39803670b1f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3939,6 +3939,7 @@ int BlueStore::_open_db(bool create) bluefs->get_block_device_size(BlueFS::BDEV_DB) - BLUEFS_START); } bluefs_shared_bdev = BlueFS::BDEV_SLOW; + bluefs_single_shared_device = false; } else { bluefs_shared_bdev = BlueFS::BDEV_DB; } @@ -3993,6 +3994,7 @@ int BlueStore::_open_db(bool create) BDEV_LABEL_BLOCK_SIZE); } cct->_conf->set_val("rocksdb_separate_wal_dir", "true"); + bluefs_single_shared_device = false; } else { cct->_conf->set_val("rocksdb_separate_wal_dir", "false"); } @@ -7486,7 +7488,9 @@ void BlueStore::_kv_sync_thread() std::unique_lock l(kv_lock); while (true) { assert(kv_committing.empty()); - if (kv_queue.empty() && deferred_cleanup_queue.empty()) { + if (kv_queue.empty() && + deferred_done_queue.empty() && + deferred_stable_queue.empty()) { if (kv_stop) break; dout(20) << __func__ << " sleep" << dendl; @@ -7495,22 +7499,59 @@ void BlueStore::_kv_sync_thread() dout(20) << __func__ << " wake" << dendl; } else { deque kv_submitting; - deque deferred_cleaning; + deque deferred_done, deferred_stable; dout(20) << __func__ << " committing " << kv_queue.size() << " submitting " << kv_queue_unsubmitted.size() - << " cleaning " << deferred_cleanup_queue.size() << dendl; + << " deferred done " << deferred_done_queue.size() + << " stable " << deferred_stable_queue.size() + << dendl; kv_committing.swap(kv_queue); kv_submitting.swap(kv_queue_unsubmitted); - deferred_cleaning.swap(deferred_cleanup_queue); + deferred_done.swap(deferred_done_queue); + deferred_stable.swap(deferred_stable_queue); utime_t start = ceph_clock_now(); l.unlock(); - dout(30) << __func__ << " committing txc " << kv_committing << dendl; - dout(30) << __func__ << " submitting txc " << kv_submitting << dendl; - dout(30) << __func__ << " deferred_cleaning txc " << deferred_cleaning << dendl; + dout(30) << __func__ << " committing " << kv_committing << dendl; + dout(30) << __func__ << " submitting " << kv_submitting << dendl; + dout(30) << __func__ << " deferred_done " << deferred_done << dendl; + dout(30) << __func__ << " deferred_stable " << deferred_stable << dendl; - // flush/barrier on block device - bdev->flush(); + int num_aios = 0; + for (auto txc : kv_committing) { + if (txc->had_ios) { + ++num_aios; + } + } + + bool force_flush = false; + // if bluefs is sharing the same device as data (only), then we + // can rely on the bluefs commit to flush the device and make + // deferred aios stable. that means that if we do have done deferred + // txcs AND we are not on a single device, we need to force a flush. + if (!bluefs || (!bluefs_single_shared_device && !deferred_done.empty())) { + force_flush = true; + } + if (kv_committing.empty() && kv_submitting.empty() && + deferred_stable.empty()) { + force_flush = true; // there's nothing else to commit! + } + if (deferred_aggressive) { + force_flush = true; + } + + if (num_aios || force_flush) { + dout(20) << __func__ << " num_aios=" << num_aios + << " force_flush=" << (int)force_flush + << ", flushing, deferred done->stable" << dendl; + // flush/barrier on block device + bdev->flush(); + + // if we flush then deferred done are now deferred stable + deferred_stable.insert(deferred_stable.end(), deferred_done.begin(), + deferred_done.end()); + deferred_done.clear(); + } // we will use one final transaction to force a sync KeyValueDB::Transaction synct = db->get_transaction(); @@ -7552,9 +7593,11 @@ void BlueStore::_kv_sync_thread() txc->osr->qcond.notify_all(); } } - for (auto txc : kv_committing) { - if (txc->had_ios) { - --txc->osr->txc_with_unstable_io; + if (num_aios) { + for (auto txc : kv_committing) { + if (txc->had_ios) { + --txc->osr->txc_with_unstable_io; + } } } @@ -7575,7 +7618,7 @@ void BlueStore::_kv_sync_thread() } // cleanup sync deferred keys - for (auto txc : deferred_cleaning) { + for (auto txc : deferred_stable) { bluestore_deferred_transaction_t& wt = *txc->deferred_txn; if (!wt.released.empty()) { // kraken replay compat only @@ -7606,7 +7649,7 @@ void BlueStore::_kv_sync_thread() utime_t finish = ceph_clock_now(); utime_t dur = finish - start; dout(20) << __func__ << " committed " << kv_committing.size() - << " cleaned " << deferred_cleaning.size() + << " cleaned " << deferred_stable.size() << " in " << dur << dendl; while (!kv_committing.empty()) { TransContext *txc = kv_committing.front(); @@ -7614,10 +7657,10 @@ void BlueStore::_kv_sync_thread() _txc_state_proc(txc); kv_committing.pop_front(); } - while (!deferred_cleaning.empty()) { - TransContext *txc = deferred_cleaning.front(); + while (!deferred_stable.empty()) { + TransContext *txc = deferred_stable.front(); _txc_state_proc(txc); - deferred_cleaning.pop_front(); + deferred_stable.pop_front(); } if (!deferred_aggressive) { @@ -7646,6 +7689,9 @@ void BlueStore::_kv_sync_thread() } l.lock(); + // previously deferred "done" are now "stable" by virtue of this + // commit cycle. + deferred_stable_queue.swap(deferred_done); } } dout(10) << __func__ << " finish" << dendl; @@ -7770,7 +7816,7 @@ int BlueStore::_deferred_finish(TransContext *txc) txc->osr->qcond.notify_all(); throttle_deferred_ops.put(txc->ops); throttle_deferred_bytes.put(txc->bytes); - deferred_cleanup_queue.push_back(txc); + deferred_done_queue.push_back(txc); } finished.clear(); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 8790e04bfc9..394f9e9d07b 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1653,6 +1653,7 @@ public: private: BlueFS *bluefs = nullptr; unsigned bluefs_shared_bdev = 0; ///< which bluefs bdev we are sharing + bool bluefs_single_shared_device = true; KeyValueDB *db = nullptr; BlockDevice *bdev = nullptr; std::string freelist_type; @@ -1698,7 +1699,8 @@ private: deque kv_queue; ///< ready, already submitted deque kv_queue_unsubmitted; ///< ready, need submit by kv thread deque kv_committing; ///< currently syncing - deque deferred_cleanup_queue; ///< deferred done, ready for cleanup + deque deferred_done_queue; ///< deferred ios done + deque deferred_stable_queue; ///< deferred ios done + stable PerfCounters *logger = nullptr;