From f2b33bc9719fa5f7bafbbd0a4e591ae350fc85af Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Fri, 15 Nov 2024 15:11:33 +0000 Subject: [PATCH] os/bluestore: Split _deferred_replay into 1) apply IO and 2) remove keys Modify _deferred_replay to separate: - applying IO to the disk - DB transaction to remove keys Changed _open_db_and_around. It now calls _deferred_replay. Adapted callers, including fsck. Fixed: https://tracker.ceph.com/issues/68060, original report. Signed-off-by: Adam Kupczyk --- src/os/bluestore/BlueStore.cc | 113 +++++++++++++++++----------------- src/os/bluestore/BlueStore.h | 8 ++- 2 files changed, 64 insertions(+), 57 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 3330d8a1e7b9..99b6b2d7ab5b 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7668,9 +7668,19 @@ int BlueStore::_is_bluefs(bool create, bool* ret) * opens both DB and dependant super_meta, FreelistManager and allocator * in the proper order */ -int BlueStore::_open_db_and_around(bool read_only, bool to_repair) -{ - dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl; +int BlueStore::_open_db_and_around( + bool read_only, + bool to_repair, + bool apply_deferred, + bool remove_deferred) +{ + dout(5) << __func__ << "read_only=" << read_only + << ", to_repair=" << to_repair + << ", deferred=" << (apply_deferred?"apply;":"noapply;") + << (remove_deferred?"remove":"noremove") << dendl; + ceph_assert(remove_deferred == false || apply_deferred == true); + ceph_assert(read_only == false || remove_deferred == false); + std::vector keys_to_remove; { string type; int r = read_meta("type", &type); @@ -7730,6 +7740,11 @@ int BlueStore::_open_db_and_around(bool read_only, bool to_repair) if (bdev_label_multi) { _main_bdev_label_try_reserve(); } + // This is the place where we can apply deferred writes + // without risk of some interaction with RocksDB allocating. + if (apply_deferred) { + _deferred_replay(remove_deferred ? &keys_to_remove : nullptr); + } // Re-open in the proper mode(s). @@ -7747,6 +7762,14 @@ int BlueStore::_open_db_and_around(bool read_only, bool to_repair) _post_init_alloc(); } + if (remove_deferred && !keys_to_remove.empty()) { + KeyValueDB::Transaction deferred_keys_remove_txn = db->get_transaction(); + for (auto& s : keys_to_remove) { + deferred_keys_remove_txn->rmkey(PREFIX_DEFERRED, s); + } + db->submit_transaction_sync(deferred_keys_remove_txn); + } + // when function is called in repair mode (to_repair=true) we skip db->open()/create() // we can't change bluestore allocation so no need to invlidate allocation-file if (fm->is_null_manager() && !read_only && !to_repair) { @@ -9081,7 +9104,7 @@ int BlueStore::mount_readonly() } }); - r = _deferred_replay(); + r = _deferred_replay(nullptr); if (r < 0) { return r; } @@ -9217,8 +9240,7 @@ int BlueStore::_mount() return -EINVAL; } - dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl; - int r = _open_db_and_around(false); + int r = _open_db_and_around(false, false, true, true); if (r < 0) { return r; } @@ -9256,11 +9278,6 @@ int BlueStore::_mount() } }); - r = _deferred_replay(); - if (r < 0) { - return r; - } - mempool_thread.init(); if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) && @@ -10616,7 +10633,7 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair) // in deep mode we need R/W write access to be able to replay deferred ops const bool read_only = !(repair || depth == FSCK_DEEP); - int r = _open_db_and_around(read_only); + int r = _open_db_and_around(read_only, false, !read_only, !read_only); if (r < 0) { return r; } @@ -10642,17 +10659,6 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair) mempool_thread.shutdown(); _shutdown_cache(); }); - // we need finisher and kv_{sync,finalize}_thread *just* for replay - // enable in repair or deep mode modes only - if (!read_only) { - _kv_start(); - r = _deferred_replay(); - _kv_stop(); - } - - if (r < 0) { - return r; - } return _fsck_on_open(depth, repair); } @@ -14700,6 +14706,7 @@ void BlueStore::_kv_sync_thread() deque deferred_stable_queue; ///< deferred ios done + stable std::unique_lock l{kv_lock}; ceph_assert(!kv_sync_started); + ceph_assert(!db_was_opened_read_only); kv_sync_started = true; kv_cond.notify_all(); @@ -15254,7 +15261,7 @@ void BlueStore::_deferred_aio_finish(OpSequencer *osr) } } -int BlueStore::_deferred_replay() +int BlueStore::_deferred_replay(std::vector* keys_to_remove) { dout(10) << __func__ << " start" << dendl; int count = 0; @@ -15270,50 +15277,46 @@ int BlueStore::_deferred_replay() } if (tracepoint_debug_deferred_replay_start) tracepoint_debug_deferred_replay_start(); IOContext ioctx(cct, nullptr); - KeyValueDB::Transaction t = db->get_transaction(); KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED); - for (it->lower_bound(string()); it->valid(); it->next(), ++count) { + for (it->lower_bound(string()); it->valid(); /*iterator update outside*/) { dout(20) << __func__ << " replay " << pretty_binary_string(it->key()) << dendl; - t->rmkey(PREFIX_DEFERRED, it->key()); - bluestore_deferred_transaction_t *deferred_txn = - new bluestore_deferred_transaction_t; + if (keys_to_remove) { + keys_to_remove->push_back(it->key()); + } + bluestore_deferred_transaction_t deferred_txn; bufferlist bl = it->value(); auto p = bl.cbegin(); try { - decode(*deferred_txn, p); + decode(deferred_txn, p); + + bool has_some = _eliminate_outdated_deferred(&deferred_txn, bluefs_extents); + if (has_some) { + if (tracepoint_debug_deferred_replay_track) tracepoint_debug_deferred_replay_track(deferred_txn); + for (auto& op: deferred_txn.ops) { + for (auto& e : op.extents) { + bufferlist t; + op.data.splice(0, e.length, &t); + bdev->aio_write(e.offset, t, &ioctx, false); + } + } + } } catch (ceph::buffer::error& e) { derr << __func__ << " failed to decode deferred txn " << pretty_binary_string(it->key()) << dendl; - delete deferred_txn; r = -EIO; - goto out; } - bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents); - if (has_some) { - if (tracepoint_debug_deferred_replay_track) tracepoint_debug_deferred_replay_track(*deferred_txn); - for (auto& op: deferred_txn->ops) { - for (auto& e : op.extents) { - bufferlist t; - op.data.splice(0, e.length, &t); - bdev->aio_write(e.offset, t, &ioctx, false); - } - } - } else { - delete deferred_txn; + // update loop iteration here, so we can inject action + ++count; + it->next(); + if (ioctx.num_pending.load() > 100 || !it->valid()) { + dout(20) << __func__ << "submitting IO batch" << dendl; + bdev->aio_submit(&ioctx); + ioctx.aio_wait(); + dout(20) << __func__ << "wait done" << dendl; + ioctx.release_running_aios(); } } - out: - bdev->aio_submit(&ioctx); - dout(20) << __func__ << "waiting to complete IO" << dendl; - ioctx.aio_wait(); - dout(20) << __func__ << "wait done" << dendl; - if (!db_was_opened_read_only) { - db->submit_transaction_sync(t); - dout(20) << __func__ << "removed L keys" << dendl; - } else { - dout(10) << __func__ << "DB read-pnly, skipped L keys removal" << dendl; - } if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end(); dout(10) << __func__ << " completed " << count << " events" << dendl; return r; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 12a9140ff794..6fb602f85bfd 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2812,7 +2812,11 @@ private: * opens both DB and dependant super_meta, FreelistManager and allocator * in the proper order */ - int _open_db_and_around(bool read_only, bool to_repair = false); + int _open_db_and_around( + bool read_only, + bool to_repair = false, + bool apply_deferred = false, + bool remove_deferred = false); void _close_db_and_around(); void _close_around_db(); @@ -2937,7 +2941,7 @@ public: private: void _deferred_submit_unlock(OpSequencer *osr); void _deferred_aio_finish(OpSequencer *osr); - int _deferred_replay(); + int _deferred_replay(std::vector* keys_to_remove); bool _eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn, interval_set& bluefs_extents); -- 2.47.3