From: Adam Kupczyk Date: Tue, 1 Jul 2025 13:47:14 +0000 (+0000) Subject: os/bluestore: Add new onode recovery method X-Git-Tag: v21.0.1~8^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2bf2dddb5b7612c1d0503bb12ea1b6b8f96a4eab;p=ceph.git os/bluestore: Add new onode recovery method Added read_allocation_from_onodes_mt function (originally copied from read_allocation_from_onodes). Added Decoder_AllocationsAndStatFS class (originally copied from ExtentDecoderPartial). There are significant differences from the originals: - shared blobs are not scanned at all - to avoid accounting allocations more than once, collisions are detected at the SimpleBitmap level; only the first onode referencing a shared blob will mark the allocation - Blobs are not preserved - instead we remember only whether a blob or spanning blob was compressed The underlying goal is to make recovery faster and to prepare for a multithreaded refactor. Signed-off-by: Adam Kupczyk --- diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt index 18d9d8b99dc..6a227cfdb87 100644 --- a/src/crimson/os/alienstore/CMakeLists.txt +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -62,6 +62,7 @@ set(alien_store_srcs ${PROJECT_SOURCE_DIR}/src/os/bluestore/Compression.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore_debug.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueAdmin.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/OnodeScan.cc ${PROJECT_SOURCE_DIR}/src/os/memstore/MemStore.cc) add_library(crimson-alienstore STATIC diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 12cb8573f18..c411a1000fb 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -353,7 +353,7 @@ static void get_shared_blob_key(uint64_t sbid, string *key) _key_encode_u64(sbid, key); } -static int get_key_shared_blob(const string& key, uint64_t *sbid) +int get_key_shared_blob(const string& key, uint64_t *sbid) { const char *p = key.c_str(); if
(key.length() < sizeof(uint64_t)) @@ -439,7 +439,7 @@ static int _get_key_object(const char *p, ghobject_t *oid) } template -static int get_key_object(const S& key, ghobject_t *oid) +int get_key_object(const S& key, ghobject_t *oid) { if (key.length() < ENCODED_KEY_PREFIX_LEN) return -1; @@ -449,6 +449,8 @@ static int get_key_object(const S& key, ghobject_t *oid) return _get_key_object(p, oid); } +template int get_key_object(const string& key, ghobject_t *oid); + template static void _get_object_key(const ghobject_t& oid, S *key) { @@ -556,7 +558,7 @@ int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset) return 0; } -static bool is_extent_shard_key(const string& key) +bool is_extent_shard_key(const string& key) { return *key.rbegin() == EXTENT_SHARD_KEY_SUFFIX; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index a22a0f1d556..b7a3f253b3d 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -4133,6 +4133,7 @@ private: std::map actual_pool_vstatfs; volatile_statfs actual_store_vstatfs; }; + class Decoder_AllocationsAndStatFS; class ExtentDecoderPartial : public ExtentMap::ExtentDecoder { BlueStore& store; read_alloc_stats_t& stats; @@ -4214,6 +4215,7 @@ private: int read_allocation_from_drive_on_startup(); int reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats); int read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats); + int read_allocation_from_onodes_mt(SimpleBitmap *smbmp, read_alloc_stats_t& stats); int commit_freelist_type(); int commit_to_null_manager(); int commit_to_real_manager(); diff --git a/src/os/bluestore/CMakeLists.txt b/src/os/bluestore/CMakeLists.txt index 3ae5cd9987c..eea9bc53557 100644 --- a/src/os/bluestore/CMakeLists.txt +++ b/src/os/bluestore/CMakeLists.txt @@ -19,6 +19,7 @@ add_library(bluestore OBJECT HybridAllocator.cc Writer.cc Compression.cc + OnodeScan.cc BlueAdmin.cc BlueEnv.cc) diff --git 
a/src/os/bluestore/OnodeScan.cc b/src/os/bluestore/OnodeScan.cc
new file mode 100644
index 00000000000..fb474ed76d6
--- /dev/null
+++ b/src/os/bluestore/OnodeScan.cc
@@ -0,0 +1,288 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2025 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#include "BlueStore.h"
+#include "common/pretty_binary.h"
+#include "simple_bitmap.h"
+#include "common/debug.h"
+using namespace std;
+
+// kv store prefixes, copied from BlueStore.cc
+const string PREFIX_OBJ = "O"; // object name -> onode_t
+
+#undef dout_prefix
+#define dout_prefix *_dout << "bs.onode_scan "
+#undef dout_context
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+
+// Key helpers implemented in BlueStore.cc.
+int get_key_shared_blob(const string& key, uint64_t *sbid);
+bool is_extent_shard_key(const string& key);
+template <typename S>
+int get_key_object(const S& key, ghobject_t *oid);
+int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset);
+
+/*
+ * Growable set of non-negative positions that can be emptied cheaply.
+ *
+ * A position belongs to the set when its slot holds the current
+ * generation number, so reset() empties the set by bumping the
+ * generation; slots are only rewritten when the 8-bit counter wraps.
+ */
+struct bool_vector_t {
+  static constexpr uint8_t null_gen = 0; // slot value that never matches
+  std::vector<uint8_t> gen_markers;
+  uint8_t current_gen = 1;
+
+  // Add pos to the set, growing the table when pos is past its end.
+  // pos is 64-bit because callers pass uint64_t ids; a narrower
+  // parameter would silently alias large values onto small slots.
+  void mark(uint64_t pos) {
+    if (gen_markers.size() <= pos) {
+      // equals pos * 5 / 4 + 1, written so the multiplication cannot wrap
+      gen_markers.resize(pos + pos / 4 + 1);
+    }
+    gen_markers[pos] = current_gen;
+  }
+  // True if pos was marked since the last reset().
+  bool check(uint64_t pos) const {
+    return pos < gen_markers.size() && gen_markers[pos] == current_gen;
+  }
+  // Empty the set.
+  void reset() {
+    current_gen++;
+    if (current_gen == null_gen) {
+      // visited all possible generation markers, we need to clear table
+      gen_markers.assign(gen_markers.size(), null_gen);
+      current_gen++; // step out of invalid generation
+    }
+  }
+};
+
+/*
+ * Extent-map decoder that marks the physical extents referenced by an
+ * onode in a SimpleBitmap and accumulates the per-pool statfs counters.
+ * Decoded blobs are not retained; only whether a local or spanning blob
+ * was compressed is remembered, so later references by blob id can be
+ * charged to compressed_original.
+ */
+class BlueStore::Decoder_AllocationsAndStatFS : public BlueStore::ExtentMap::ExtentDecoder {
+  using Extent = BlueStore::Extent;
+  BlueStore &store;
+  read_alloc_stats_t &stats;
+  SimpleBitmap &sbmap;
+  uint8_t min_alloc_size_order;
+  Extent extent; // scratch object handed out by get_next_extent()
+  ghobject_t oid; // object set by the last reset()
+  volatile_statfs *per_pool_statfs = nullptr;
+  bool_vector_t is_local_blob_compressed;
+  bool_vector_t is_spanning_blob_compressed;
+
+  void _consume_new_blob(bool spanning, uint64_t extent_no, uint64_t sbid, BlobRef b);
+
+protected:
+  void consume_blobid(Extent *, bool spanning, uint64_t blobid) override;
+  void consume_blob(Extent *le, uint64_t extent_no, uint64_t sbid, BlobRef b) override;
+  void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+  // Extents are not collected: the same scratch object is recycled.
+  Extent *get_next_extent() override {
+    ++stats.extent_count;
+    extent = Extent();
+    return &extent;
+  }
+  void add_extent(Extent *) override {}
+
+public:
+  Decoder_AllocationsAndStatFS(
+    BlueStore &_store,
+    read_alloc_stats_t &_stats,
+    SimpleBitmap &_sbmap,
+    uint8_t _min_alloc_size_order)
+    : store(_store), stats(_stats), sbmap(_sbmap), min_alloc_size_order(_min_alloc_size_order) {}
+  const ghobject_t &get_oid() const { return oid; }
+  // Start decoding a new onode whose statistics go to _per_pool_statfs.
+  void reset(const ghobject_t &_oid, volatile_statfs *_per_pool_statfs);
+  // Start decoding the next extent shard of the current onode.
+  void reset_new_shard();
+};
+
+// Account for a blob seen for the first time within the current onode
+// (local blob) or among its spanning blobs: remember whether it is
+// compressed, mark its physical extents in the bitmap and update statfs.
+void BlueStore::Decoder_AllocationsAndStatFS::_consume_new_blob(
+  bool spanning,
+  uint64_t extent_no,
+  uint64_t sbid,
+  BlobRef b)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  ceph_assert(per_pool_statfs);
+  ceph_assert(oid != ghobject_t());
+
+  auto &blob = b->get_blob();
+  bool compressed = blob.is_compressed();
+  if (spanning) {
+    dout(20) << __func__ << " spanning " << b->id << dendl;
+    ceph_assert(b->id >= 0);
+    if (compressed) {
+      is_spanning_blob_compressed.mark(b->id);
+    }
+    ++stats.spanning_blob_count;
+  } else {
+    dout(20) << __func__ << " local " << extent_no << dendl;
+    if (compressed) {
+      is_local_blob_compressed.mark(extent_no);
+    }
+  }
+
+  // Charge only what set_atomic() returns, so extents already marked via
+  // another onode (shared blobs) are not accounted twice.
+  // NOTE(review): assumes it returns the count of newly set units - confirm.
+  uint64_t new_marked_allocations = 0;
+  for (auto &pe : blob.get_extents()) {
+    if (pe.offset != bluestore_pextent_t::INVALID_OFFSET) {
+      new_marked_allocations += sbmap.set_atomic(
+        pe.offset >> min_alloc_size_order, pe.length >> min_alloc_size_order);
+    }
+  }
+  per_pool_statfs->allocated() += new_marked_allocations
+                                  << min_alloc_size_order;
+  if (compressed) {
+    per_pool_statfs->compressed_allocated() +=
+      new_marked_allocations << min_alloc_size_order;
+    per_pool_statfs->compressed() += blob.get_compressed_payload_length();
+    ++stats.compressed_blob_count;
+  }
+}
+
+// An extent refers by id to a blob decoded earlier; only the stored
+// bytes (and compressed_original, if that blob was compressed) change.
+void BlueStore::Decoder_AllocationsAndStatFS::consume_blobid(
+  Extent* le, bool spanning, uint64_t blobid)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  ceph_assert(per_pool_statfs); // reset() must have been called first
+  dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+  auto& vec = spanning ? is_spanning_blob_compressed : is_local_blob_compressed;
+  per_pool_statfs->stored() += le->length;
+  if (vec.check(blobid)) {
+    per_pool_statfs->compressed_original() += le->length;
+  }
+}
+
+// An extent introduces a new local blob: account the blob, then the extent.
+void BlueStore::Decoder_AllocationsAndStatFS::consume_blob(
+  Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b)
+{
+  _consume_new_blob(false, extent_no, sbid, b);
+  per_pool_statfs->stored() += le->length;
+  if (b->get_blob().is_compressed()) {
+    per_pool_statfs->compressed_original() += le->length;
+  }
+}
+
+// Account a spanning blob; the extent number is unused on that path.
+void BlueStore::Decoder_AllocationsAndStatFS::consume_spanning_blob(
+  uint64_t sbid, BlobRef b)
+{
+  _consume_new_blob(true, 0/*doesn't matter*/, sbid, b);
+}
+
+// Switch to a new onode: both compressed-blob sets are emptied.
+void BlueStore::Decoder_AllocationsAndStatFS::reset(
+  const ghobject_t &_oid, volatile_statfs* _per_pool_statfs)
+{
+  oid = _oid;
+  per_pool_statfs = _per_pool_statfs;
+  is_local_blob_compressed.reset();
+  is_spanning_blob_compressed.reset();
+}
+
+// Switch to the next shard of the same onode: only the local set is
+// emptied, the spanning set is kept.
+void BlueStore::Decoder_AllocationsAndStatFS::reset_new_shard() {
+  is_local_blob_compressed.reset();
+}
+
+
+/*
+ * Walk every key under PREFIX_OBJ, decode onodes and their extent
+ * shards, and record the referenced allocations in sbmap and the
+ * per-pool statistics in stats.
+ *
+ * Returns 0 on success, -ENOENT if the iterator cannot be created and
+ * -EIO if a key cannot be decoded.
+ */
+int BlueStore::read_allocation_from_onodes_mt(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
+{
+  auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+  if (!it) {
+    derr << "failed getting onode's iterator" << dendl;
+    return -ENOENT;
+  }
+
+  uint64_t kv_count = 0;
+  constexpr uint64_t count_interval = 1'000'000;
+  Decoder_AllocationsAndStatFS edecoder(*this, stats, *sbmap, min_alloc_size_order);
+
+  // iterate over all ONodes stored in RocksDB
+  for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
+    // trace an event after every million processed objects (typically every 5-10 seconds)
+    if (kv_count && (kv_count % count_interval == 0)) {
+      dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+    }
+
+    const auto key = it->key(); // fetched once and reused below
+    dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+    ghobject_t oid;
+    if (!is_extent_shard_key(key)) {
+      int r = get_key_object(key, &oid);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode key = "
+             << pretty_binary_string(key) << dendl;
+        return -EIO;
+      }
+      edecoder.reset(oid,
+                     &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+      Onode dummy_on(cct);
+      Onode::decode_raw(&dummy_on,
+                        it->value(),
+                        edecoder,
+                        segment_size != 0);
+      ++stats.onode_count;
+    } else {
+      edecoder.reset_new_shard();
+      string okey = key; // receives the owning onode's key
+      uint32_t offset;
+      int r = get_key_extent_shard(key, &okey, &offset);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode extent key = "
+             << pretty_binary_string(key) << dendl;
+        return -EIO;
+      }
+      r = get_key_object(okey, &oid);
+      if (r != 0) {
+        derr << __func__
+             << " failed to decode onode key= " << pretty_binary_string(okey)
+             << " from extent key= " << pretty_binary_string(key)
+             << dendl;
+        return -EIO;
+      }
+      // a shard must belong to the onode given to the last reset()
+      ceph_assert(oid == edecoder.get_oid());
+      edecoder.decode_some(it->value(), nullptr);
+      ++stats.shard_count;
+    }
+  }
+  return 0;
+}