From c10a794f72613ffdccdd64e24840dfce93174f30 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Thu, 14 Nov 2024 18:19:34 +0000 Subject: [PATCH] os/bluestore: Modify _deferred_replay Modify _deferred_replay to execute it directly, without involving BlueStore state machine. In result, kv-sync thread is not necessary. RocksDB L entries (deferred writes) are removed directly. Fixes: https://tracker.ceph.com/issues/68060, part responsible for stale L entries. Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueStore.cc | 40 ++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index c521ee8fbfc2..3330d8a1e7b9 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -14866,7 +14866,7 @@ void BlueStore::_kv_sync_thread() auto sync_start = mono_clock::now(); #endif // submit synct synchronously (block and wait for it to commit) - int r = db_was_opened_read_only || cct->_conf->bluestore_debug_omit_kv_commit ? + int r = cct->_conf->bluestore_debug_omit_kv_commit ? 0 : db->submit_transaction_sync(synct); ceph_assert(r == 0); @@ -15268,19 +15268,14 @@ int BlueStore::_deferred_replay() } ); } - CollectionRef ch = _get_collection(coll_t::meta()); - bool fake_ch = false; - if (!ch) { - // hmm, replaying initial mkfs? - ch = static_cast(create_new_collection(coll_t::meta()).get()); - fake_ch = true; - } - OpSequencer *osr = static_cast(ch->osr.get()); if (tracepoint_debug_deferred_replay_start) tracepoint_debug_deferred_replay_start(); + IOContext ioctx(cct, nullptr); + KeyValueDB::Transaction t = db->get_transaction(); KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED); for (it->lower_bound(string()); it->valid(); it->next(), ++count) { dout(20) << __func__ << " replay " << pretty_binary_string(it->key()) << dendl; + t->rmkey(PREFIX_DEFERRED, it->key()); bluestore_deferred_transaction_t *deferred_txn = new bluestore_deferred_transaction_t; bufferlist bl = it->value(); @@ -15297,22 +15292,29 @@ int BlueStore::_deferred_replay() bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents); if (has_some) { if (tracepoint_debug_deferred_replay_track) tracepoint_debug_deferred_replay_track(*deferred_txn); - TransContext *txc = _txc_create(ch.get(), osr, nullptr); - txc->deferred_txn = deferred_txn; - txc->set_state(TransContext::STATE_KV_DONE); - _txc_state_proc(txc); + for (auto& op: deferred_txn->ops) { + for (auto& e : op.extents) { + bufferlist t; + op.data.splice(0, e.length, &t); + bdev->aio_write(e.offset, t, &ioctx, false); + } + } } else { delete deferred_txn; } } out: - dout(20) << __func__ << " draining osr" << dendl; - _osr_register_zombie(osr); - _osr_drain_all(); - if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end(); - if (fake_ch) { - new_coll_map.clear(); + bdev->aio_submit(&ioctx); + dout(20) << __func__ << "waiting to complete IO" << dendl; + ioctx.aio_wait(); + dout(20) << __func__ << "wait done" << dendl; + if (!db_was_opened_read_only) { + db->submit_transaction_sync(t); + dout(20) << __func__ << "removed L keys" << dendl; + } else { + dout(10) << __func__ << "DB read-pnly, skipped L keys removal" << dendl; } + if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end(); dout(10) << __func__ << " completed " << count << " events" << dendl; return r; } -- 2.47.3