From: Adam Kupczyk Date: Thu, 3 Jul 2025 08:04:01 +0000 (+0000) Subject: os/bluestore: Multithreaded allocation recovery X-Git-Tag: v21.0.1~8^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bfd4e18eaadcdd5d562328f20039941a5b22a1d3;p=ceph.git os/bluestore: Multithreaded allocation recovery Added multithreading processing for allocation recovery. Added new config "bluestore_allocation_recovery_threads". Signed-off-by: Adam Kupczyk --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 5e583932b4c..3460267c155 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -5375,6 +5375,19 @@ options: desc: Remove allocation info from RocksDB and store the info in a new allocation file default: true with_legacy: true +- name: bluestore_allocation_recovery_threads + type: uint + level: basic + desc: Amount of threads for allocation recovery after OSD crash. + long_desc: The optimal value varies. For NVMe DBs may be as large as 8. + Value 0 selects legacy recovery. + Value 1 denotes new recovery but on single thread. + default: 0 + with_legacy: false + flags: + - startup + see_also: + - bluestore_allocation_from_file - name: bluestore_debug_inject_allocation_from_file_failure type: float level: dev diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5db636e37cb..3a64fb42b50 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -20889,7 +20889,12 @@ int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t & stats.extent_count++; // then set all space taken by Objects - int ret = read_allocation_from_onodes(sbmap, stats); + int ret; + if (cct->_conf.get_val("bluestore_allocation_recovery_threads") == 0) { + ret = read_allocation_from_onodes(sbmap, stats); + } else { + ret = read_allocation_from_onodes_mt(sbmap, stats); + } if (ret < 0) { derr << "failed read_allocation_from_onodes()" << dendl; return ret; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 537f8e605a7..1f9dc8e69fb 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -4224,6 +4224,8 @@ private: int reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats); int read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats); int read_allocation_from_onodes_mt(SimpleBitmap *smbmp, read_alloc_stats_t& stats); + class OnodeScanMT; + friend OnodeScanMT; int commit_freelist_type(); int commit_to_null_manager(); int commit_to_real_manager(); diff --git a/src/os/bluestore/OnodeScan.cc b/src/os/bluestore/OnodeScan.cc index fb474ed76d6..e6f148ce4a7 100644 --- a/src/os/bluestore/OnodeScan.cc +++ b/src/os/bluestore/OnodeScan.cc @@ -176,65 +176,175 @@ void BlueStore::Decoder_AllocationsAndStatFS::reset_new_shard() { } -int BlueStore::read_allocation_from_onodes_mt(SimpleBitmap *sbmap, read_alloc_stats_t& stats) -{ - auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); - if (!it) { - derr << "failed getting onode's iterator" << dendl; - return -ENOENT; +class BlueStore::OnodeScanMT { + BlueStore& store; + SimpleBitmap *sbmap; + read_alloc_stats_t& stats; + vector chunks; + uint32_t chunk_pos = 0; + uint64_t total = 0; + uint64_t interval = 1'000'000; + ceph::mutex lock = ceph::make_mutex("BlueStore::OnodeScanMT::lock"); + void report_progress(uint32_t thread_no, uint64_t no_completed) { + auto &cct = store.cct; + std::lock_guard l(lock); + if (total / interval != (total + no_completed) / interval) { + dout(5) << __func__ << " processed objects count = " + << (total + no_completed) / interval * interval << dendl; + } + total += no_completed; } - uint64_t kv_count = 0; - uint64_t count_interval = 1'000'000; - Decoder_AllocationsAndStatFS edecoder(*this, stats, *sbmap, min_alloc_size_order); + bool ask_for_work( + string& start_key, + string& upper_bound_key) { + std::lock_guard l(lock); + if (chunk_pos < chunks.size()) { + start_key = chunks[chunk_pos].first_key; + upper_bound_key = chunks[chunk_pos].upper_bound; + chunk_pos++; + return true; + } else { + return false; + } + } - // iterate over all ONodes stored in RocksDB - for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) { - // trace an even after every million processed objects (typically every 5-10 seconds) - if (kv_count && (kv_count % count_interval == 0) ) { - dout(5) << __func__ << " processed objects count = " << kv_count << dendl; + int scan_onodes_range( + read_alloc_stats_t& stats, + const string& start_key, + const string& upper_bound_key) + { + auto& db = store.db; + auto& cct = store.cct; + auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); + if (!it) { + derr << "failed getting onode's iterator" << dendl; + return -ENOENT; } - auto key = it->key(); - auto okey = key; - dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl; - ghobject_t oid; - if (!is_extent_shard_key(it->key())) { - int r = get_key_object(okey, &oid); - if (r != 0) { - derr << __func__ << " failed to decode onode key = " - << pretty_binary_string(okey) << dendl; - return -EIO; + uint64_t kv_count = 0; + uint64_t last_completed = 0; + uint64_t count_interval = 100'000; + Decoder_AllocationsAndStatFS edecoder(store, stats, *sbmap, + store.min_alloc_size_order); + it->lower_bound(start_key); + // skip to first key that is beginning of Onode + while (it->valid() && is_extent_shard_key(it->key())) { + it->next(); + } + // iterate over all onodes in requested range + for (; it->valid(); it->next(), kv_count++) { + if (kv_count && (kv_count % count_interval == 0)) { + report_progress(0, kv_count - last_completed); + last_completed = kv_count; } - edecoder.reset(oid, - &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]); - Onode dummy_on(cct); - Onode::decode_raw(&dummy_on, - it->value(), - edecoder, - segment_size != 0); - ++stats.onode_count; - } else { - edecoder.reset_new_shard(); - uint32_t offset; - int r = get_key_extent_shard(key, &okey, &offset); - if (r != 0) { - derr << __func__ << " failed to decode onode extent key = " - << pretty_binary_string(key) << dendl; - return -EIO; + auto key = it->key(); + auto okey = key; + dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl; + ghobject_t oid; + if (!is_extent_shard_key(it->key())) { + if (it->key() >= upper_bound_key) { + // Drag iteration after upper bound until new onode is found. + break; + } + int r = get_key_object(okey, &oid); + if (r != 0) { + derr << __func__ + << " failed to decode onode key = " << pretty_binary_string(okey) + << dendl; + continue; + } + edecoder.reset(oid, + &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]); + Onode dummy_on(cct); + Onode::decode_raw(&dummy_on, it->value(), edecoder, store.segment_size != 0); + ++stats.onode_count; + } else { + edecoder.reset_new_shard(); + uint32_t offset; + get_key_extent_shard(key, &okey, &offset); + int r = get_key_object(okey, &oid); + if (r != 0) { + derr << __func__ + << " failed to decode onode key= " << pretty_binary_string(okey) + << " from extent key= " << pretty_binary_string(key) << dendl; + continue; + } + if (oid != edecoder.get_oid()) { + continue; + } + edecoder.decode_some(it->value(), nullptr); + ++stats.shard_count; } - r = get_key_object(okey, &oid); - if (r != 0) { - derr << __func__ - << " failed to decode onode key= " << pretty_binary_string(okey) - << " from extent key= " << pretty_binary_string(key) - << dendl; - return -EIO; + } + report_progress(0, kv_count - last_completed); + return 0; + } + + void scanner_thread( + uint32_t thread_no, + read_alloc_stats_t& stats) + { + [[maybe_unused]] auto& cct = store.cct; + string start_key; + string upper_bound_key; + while(ask_for_work(start_key, upper_bound_key)) { + dout(10) << "thread " << thread_no << " runs: " << pretty_binary_string(start_key) + << "..." << pretty_binary_string(upper_bound_key) << dendl; + scan_onodes_range(stats, start_key, upper_bound_key); + } + } + +public: + OnodeScanMT(BlueStore& store, SimpleBitmap *sbmap, read_alloc_stats_t& stats) + : store(store), sbmap(sbmap), stats(stats) {} + + int scan() { + [[maybe_unused]] auto& cct = store.cct; + std::vector thr_stats; + std::vector thr; + + //bluestore_allocation_recovery_threads + size_t num_threads = cct->_conf.get_val("bluestore_allocation_recovery_threads"); + ceph_assert(num_threads > 0); + std::vector timers; + store.db->util_divide_key_range( + PREFIX_OBJ, "", string(100, '\377'), num_threads, 50'000'000, 0.05, chunks); + for (size_t i = 0; i < chunks.size(); i++) { + dout(10) << i << ": " << pretty_binary_string(chunks[i].first_key) + << "..." << pretty_binary_string(chunks[i].upper_bound) << dendl; + } + + num_threads = std::min(num_threads, chunks.size()); + thr_stats.resize(num_threads); + thr.resize(num_threads); + timers.resize(num_threads); + for (size_t i = 0; i < num_threads; i++) { + thr[i] = std::thread( + &BlueStore::OnodeScanMT::scanner_thread, this, i, std::ref(thr_stats[i])); + } + for (size_t i = 0; i < num_threads; i++) { + thr[i].join(); + stats.onode_count += thr_stats[i].onode_count; + stats.shard_count += thr_stats[i].shard_count; + stats.shared_blob_count += thr_stats[i].shared_blob_count; + stats.compressed_blob_count += thr_stats[i].compressed_blob_count; + stats.spanning_blob_count += thr_stats[i].spanning_blob_count; + stats.skipped_illegal_extent += thr_stats[i].skipped_illegal_extent; + stats.extent_count += thr_stats[i].extent_count; + stats.insert_count += thr_stats[i].insert_count; + for (auto &k : thr_stats[i].actual_pool_vstatfs) { + stats.actual_pool_vstatfs[k.first] += k.second; } - ceph_assert(oid == edecoder.get_oid()); - edecoder.decode_some(it->value(), nullptr); - ++stats.shard_count; } + return 0; } +}; + +int BlueStore::read_allocation_from_onodes_mt(SimpleBitmap *sbmap, read_alloc_stats_t& stats) +{ + OnodeScanMT scaner(*this, sbmap, stats); + scaner.scan(); return 0; } +