From 4735b0a4215d754ead759bd1e7f1f309cfc72a75 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Wed, 29 Jun 2022 10:31:43 +0200 Subject: [PATCH] os/bluestore: Fix deferred writes corrupting RocksDB Deferred writes can sometimes update regions that are no longer mapped to any object. This cannot happen when BlueStore is running, as blobs are being held, and allocations are not released until deferred op is executed. However in case of restart allocations that deferred is targetting are already freed. Deferred replay is done on BlueStore bootup, before any new object can be allocated, so no collision with object is possible. But BlueFS can allocate space from block with deferred ops still pending. Fixes: https://tracker.ceph.com/issues/54547 Signed-off-by: Adam Kupczyk (cherry picked from commit 2afb951a490aac9ca4d01614867396cbd06b793d) Conflicts: src/os/bluestore/BlueStore.cc Modified non-critical BlueFS call foreach_block_extents to get_block_extents. --- src/os/bluestore/BlueStore.cc | 87 +++++++++++++++++++++++++++++++++-- src/os/bluestore/BlueStore.h | 2 + 2 files changed, 85 insertions(+), 4 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 6c9e737b4568e..3239a825846d8 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -13918,6 +13918,10 @@ int BlueStore::_deferred_replay() dout(10) << __func__ << " start" << dendl; int count = 0; int r = 0; + interval_set bluefs_extents; + if (bluefs) { + bluefs->get_block_extents(bluefs_layout.shared_bdev, &bluefs_extents); + } CollectionRef ch = _get_collection(coll_t::meta()); bool fake_ch = false; if (!ch) { @@ -13943,10 +13947,15 @@ int BlueStore::_deferred_replay() r = -EIO; goto out; } - TransContext *txc = _txc_create(ch.get(), osr, nullptr); - txc->deferred_txn = deferred_txn; - txc->set_state(TransContext::STATE_KV_DONE); - _txc_state_proc(txc); + bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents); + if (has_some) { + TransContext *txc = _txc_create(ch.get(), osr, nullptr); + txc->deferred_txn = deferred_txn; + txc->set_state(TransContext::STATE_KV_DONE); + _txc_state_proc(txc); + } else { + delete deferred_txn; + } } out: dout(20) << __func__ << " draining osr" << dendl; @@ -13959,6 +13968,76 @@ int BlueStore::_deferred_replay() return r; } +bool BlueStore::_eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn, + interval_set& bluefs_extents) +{ + bool has_some = false; + dout(30) << __func__ << " bluefs_extents: " << std::hex << bluefs_extents << std::dec << dendl; + auto it = deferred_txn->ops.begin(); + while (it != deferred_txn->ops.end()) { + // We process a pair of _data_/_extents_ (here: it->data/it->extents) + // by eliminating _extents_ that belong to bluefs, removing relevant parts of _data_ + // example: + // +------------+---------------+---------------+---------------+ + // | data | aaaaaaaabbbbb | bbbbcccccdddd | ddddeeeeeefff | + // | extent | 40000 - 44000 | 50000 - 58000 | 58000 - 60000 | + // | in bluefs? | no | yes | no | + // +------------+---------------+---------------+---------------+ + // result: + // +------------+---------------+---------------+ + // | data | aaaaaaaabbbbb | ddddeeeeeefff | + // | extent | 40000 - 44000 | 58000 - 60000 | + // +------------+---------------+---------------+ + PExtentVector new_extents; + ceph::buffer::list new_data; + uint32_t data_offset = 0; // this tracks location of extent 'e' inside it->data + dout(30) << __func__ << " input extents: " << it->extents << dendl; + for (auto& e: it->extents) { + interval_set region; + region.insert(e.offset, e.length); + + auto mi = bluefs_extents.lower_bound(e.offset); + if (mi != bluefs_extents.begin()) { + --mi; + if (mi.get_end() <= e.offset) { + ++mi; + } + } + while (mi != bluefs_extents.end() && mi.get_start() < e.offset + e.length) { + // The interval_set does not like (asserts) when we erase interval that does not exist. + // Hence we do we implement (region-mi) by ((region+mi)-mi). + region.union_insert(mi.get_start(), mi.get_len()); + region.erase(mi.get_start(), mi.get_len()); + ++mi; + } + // 'region' is now a subset of e, without parts used by bluefs + // we trim coresponding parts from it->data (actally constructing new_data / new_extents) + for (auto ki = region.begin(); ki != region.end(); ki++) { + ceph::buffer::list chunk; + // A chunk from it->data; data_offset is a an offset where 'e' was located; + // 'ki.get_start() - e.offset' is an offset of ki inside 'e'. + chunk.substr_of(it->data, data_offset + (ki.get_start() - e.offset), ki.get_len()); + new_data.claim_append(chunk); + new_extents.emplace_back(bluestore_pextent_t(ki.get_start(), ki.get_len())); + } + data_offset += e.length; + } + dout(30) << __func__ << " output extents: " << new_extents << dendl; + if (it->data.length() != new_data.length()) { + dout(10) << __func__ << " trimmed deferred extents: " << it->extents << "->" << new_extents << dendl; + } + if (new_extents.size() == 0) { + it = deferred_txn->ops.erase(it); + } else { + has_some = true; + std::swap(it->extents, new_extents); + std::swap(it->data, new_data); + ++it; + } + } + return has_some; +} + // --------------------------- // transactions diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 10fdd0f721a0a..5f1b84d9100ac 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2684,6 +2684,8 @@ private: void _deferred_submit_unlock(OpSequencer *osr); void _deferred_aio_finish(OpSequencer *osr); int _deferred_replay(); + bool _eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn, + interval_set& bluefs_extents); public: using mempool_dynamic_bitset = -- 2.39.5