--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2025 IBM
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+#include "BlueStore.h"
+#include "common/pretty_binary.h"
+#include "simple_bitmap.h"
+#include "common/debug.h"
+using namespace std;
+
+// kv store prefixes, copied from BlueStore.cc
+const string PREFIX_OBJ = "O"; // object name -> onode_t
+
+// Logging setup for this file: every dout/derr line starts with
+// "bs.onode_scan ", takes its context from the `cct` visible at the point of
+// use, and is attributed to the bluestore debug subsystem.
+#undef dout_prefix
+#define dout_prefix *_dout << "bs.onode_scan "
+#undef dout_context
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+
+// Key parsing helpers that are only declared here.
+// NOTE(review): presumably defined in BlueStore.cc -- confirm they are
+// linkable from this translation unit.
+// get_key_shared_blob is not referenced in the visible part of this file.
+int get_key_shared_blob(const string& key, uint64_t *sbid);
+bool is_extent_shard_key(const string& key);
+template<typename S>
+ int get_key_object(const S& key, ghobject_t *oid);
+int get_key_extent_shard(const string& key, string *onode_key, uint32_t *offset);
+
+// Growable set of uint32_t positions that can be emptied without touching
+// the underlying table.
+//
+// Every slot remembers the generation in which it was last marked; a position
+// belongs to the set only while its slot equals current_gen. reset() advances
+// the generation, and the table is wiped only when the 8-bit generation
+// counter wraps around.
+struct bool_vector_t {
+  // Never a live generation, so zero-filled slots always read as "not marked".
+  static constexpr uint8_t null_gen = 0;
+  std::vector<uint8_t> gen_markers;
+  uint8_t current_gen = 1;
+
+  // Add pos to the set, growing the table if pos is beyond its end.
+  void mark(uint32_t pos) {
+    if (gen_markers.size() <= pos) {
+      // Grow to ~125% of pos. The arithmetic is done in 64 bits: in uint32_t
+      // `pos * 5` wraps for large pos, which would shrink the table and turn
+      // the store below into an out-of-bounds write.
+      const uint64_t new_size = static_cast<uint64_t>(pos) * 5 / 4 + 1;
+      gen_markers.resize(static_cast<std::size_t>(new_size));
+    }
+    gen_markers[pos] = current_gen;
+  }
+
+  // True when pos was marked since the last reset().
+  bool check(uint32_t pos) const {
+    return pos < gen_markers.size() && gen_markers[pos] == current_gen;
+  }
+
+  // Empty the set by moving on to the next generation.
+  void reset() {
+    current_gen++;
+    if (current_gen == null_gen) {
+      // visited all possible generation markers, we need to clear table;
+      // assign() is also well defined when the table is still empty
+      gen_markers.assign(gen_markers.size(), null_gen);
+      current_gen++; // step out of invalid generation
+    }
+  }
+};
+
+// Extent-map decoder that does not build an in-memory extent map. For every
+// decoded blob it marks the blob's physical extents in a SimpleBitmap and
+// accumulates statfs counters for the object selected by the last reset().
+class BlueStore::Decoder_AllocationsAndStatFS : public BlueStore::ExtentMap::ExtentDecoder {
+private:
+  using Extent = BlueStore::Extent;
+
+  BlueStore &store;
+  read_alloc_stats_t &stats;
+  SimpleBitmap &sbmap;
+  uint8_t min_alloc_size_order;
+  // single scratch extent, recycled by get_next_extent()
+  Extent extent;
+  // object currently being decoded, set by reset()
+  ghobject_t oid;
+  // statfs record that receives the counters, set by reset()
+  volatile_statfs *per_pool_statfs = nullptr;
+  // ids of blobs that were seen compressed, split by blob kind
+  bool_vector_t is_local_blob_compressed;
+  bool_vector_t is_spanning_blob_compressed;
+
+  // shared accounting for local and spanning blobs
+  void _consume_new_blob(bool spanning, uint64_t extent_no, uint64_t sbid, BlobRef b);
+
+protected:
+  void consume_blobid(Extent *, bool spanning, uint64_t blobid) override;
+  void consume_blob(Extent *le, uint64_t extent_no, uint64_t sbid, BlobRef b) override;
+  void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+
+  // Hands out the same scratch extent every time, freshly default-constructed,
+  // and counts the request.
+  Extent *get_next_extent() override {
+    extent = Extent();
+    ++stats.extent_count;
+    return &extent;
+  }
+
+  // Decoded extents are not stored anywhere.
+  void add_extent(Extent *) override {}
+
+public:
+  Decoder_AllocationsAndStatFS(
+    BlueStore &_store,
+    read_alloc_stats_t &_stats,
+    SimpleBitmap &_sbmap,
+    uint8_t _min_alloc_size_order)
+    : store(_store),
+      stats(_stats),
+      sbmap(_sbmap),
+      min_alloc_size_order(_min_alloc_size_order)
+  {}
+
+  // Object passed to the most recent reset().
+  const ghobject_t &get_oid() const { return oid; }
+
+  // Start decoding a new object.
+  void reset(const ghobject_t _oid, volatile_statfs *_per_pool_statfs);
+  // Start decoding another extent shard of the current object.
+  void reset_new_shard();
+};
+
+// Common accounting for a blob whose definition has just been decoded.
+//
+// Remembers whether the blob is compressed (keyed by b->id for spanning blobs,
+// by extent_no for local ones), marks its valid physical extents in the bitmap
+// and adds the result to the current pool's statfs. Only the units that
+// set_atomic() reports are added to the allocated counters.
+// NOTE(review): set_atomic() presumably returns the number of units that were
+// not set before, so shared allocations are counted once -- confirm.
+// sbid is not used here.
+void BlueStore::Decoder_AllocationsAndStatFS::_consume_new_blob(
+  bool spanning,
+  uint64_t extent_no,
+  uint64_t sbid,
+  BlobRef b)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  ceph_assert(per_pool_statfs);
+  ceph_assert(oid != ghobject_t());
+
+  auto &blob = b->get_blob();
+  const bool blob_compressed = blob.is_compressed();
+
+  if (!spanning) {
+    dout(20) << __func__ << " local " << extent_no << dendl;
+    if (blob_compressed) {
+      is_local_blob_compressed.mark(extent_no);
+    }
+  } else {
+    dout(20) << __func__ << " spanning " << b->id << dendl;
+    ceph_assert(b->id >= 0);
+    if (blob_compressed) {
+      is_spanning_blob_compressed.mark(b->id);
+    }
+    ++stats.spanning_blob_count;
+  }
+
+  // mark every valid physical extent, in allocation units
+  uint64_t newly_marked_units = 0;
+  for (auto &pext : blob.get_extents()) {
+    if (pext.offset == bluestore_pextent_t::INVALID_OFFSET) {
+      continue;
+    }
+    newly_marked_units += sbmap.set_atomic(
+      pext.offset >> min_alloc_size_order, pext.length >> min_alloc_size_order);
+  }
+
+  // back from allocation units to bytes
+  const uint64_t newly_marked_bytes = newly_marked_units << min_alloc_size_order;
+  per_pool_statfs->allocated() += newly_marked_bytes;
+  if (blob_compressed) {
+    per_pool_statfs->compressed_allocated() += newly_marked_bytes;
+    per_pool_statfs->compressed() += blob.get_compressed_payload_length();
+    ++stats.compressed_blob_count;
+  }
+}
+
+// Called for an extent that refers to a blob by id only. Adds the extent
+// length to `stored`, and also to `compressed_original` when that blob id was
+// recorded as compressed.
+void BlueStore::Decoder_AllocationsAndStatFS::consume_blobid(
+  Extent* le, bool spanning, uint64_t blobid)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+
+  const auto extent_len = le->length;
+  per_pool_statfs->stored() += extent_len;
+
+  // spanning and local blob ids are tracked in separate tables
+  bool known_compressed;
+  if (spanning) {
+    known_compressed = is_spanning_blob_compressed.check(blobid);
+  } else {
+    known_compressed = is_local_blob_compressed.check(blobid);
+  }
+  if (known_compressed) {
+    per_pool_statfs->compressed_original() += extent_len;
+  }
+}
+
+// Called for an extent that carries the definition of a local blob: account
+// for the blob itself, then for the logical bytes of the extent.
+void BlueStore::Decoder_AllocationsAndStatFS::consume_blob(
+  Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b)
+{
+  _consume_new_blob(false, extent_no, sbid, b);
+
+  const auto extent_len = le->length;
+  per_pool_statfs->stored() += extent_len;
+  const bool blob_compressed = b->get_blob().is_compressed();
+  if (blob_compressed) {
+    per_pool_statfs->compressed_original() += extent_len;
+  }
+}
+
+// Called for every spanning blob definition; no extent is involved here, so
+// only the blob itself is accounted for.
+void BlueStore::Decoder_AllocationsAndStatFS::consume_spanning_blob(
+  uint64_t sbid, BlobRef b)
+{
+  // extent_no is ignored for spanning blobs, b->id is used instead
+  const uint64_t ignored_extent_no = 0;
+  _consume_new_blob(true, ignored_extent_no, sbid, b);
+}
+
+// Prepare for decoding a new object: forget every blob id recorded so far and
+// remember the object together with the statfs record to update.
+void BlueStore::Decoder_AllocationsAndStatFS::reset(
+  const ghobject_t _oid, volatile_statfs* _per_pool_statfs)
+{
+  is_spanning_blob_compressed.reset();
+  is_local_blob_compressed.reset();
+  per_pool_statfs = _per_pool_statfs;
+  oid = _oid;
+}
+
+// Prepare for decoding another extent shard of the current object. Only the
+// local blob ids are forgotten; the spanning blob table is kept.
+// NOTE(review): presumably because local blob ids restart in every shard
+// while spanning blobs are defined once per onode -- confirm.
+void BlueStore::Decoder_AllocationsAndStatFS::reset_new_shard()
+{
+  is_local_blob_compressed.reset();
+}
+
+
+// Walks every key under PREFIX_OBJ and feeds onodes and extent shards to a
+// Decoder_AllocationsAndStatFS, which marks allocations in *sbmap and fills
+// stats (including stats.actual_pool_vstatfs).
+//
+// @param sbmap  bitmap receiving the allocations; dereferenced unconditionally
+// @param stats  counters updated while scanning
+// @return 0 on success, -ENOENT when no iterator could be obtained, -EIO when
+//         a key cannot be decoded
+//
+// An extent shard whose object differs from the last decoded onode trips a
+// ceph_assert.
+int BlueStore::read_allocation_from_onodes_mt(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
+{
+  auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+  if (!it) {
+    derr << "failed getting onode's iterator" << dendl;
+    return -ENOENT;
+  }
+
+  uint64_t kv_count = 0;
+  constexpr uint64_t count_interval = 1'000'000;
+  Decoder_AllocationsAndStatFS edecoder(*this, stats, *sbmap, min_alloc_size_order);
+
+  // iterate over all onodes and extent shards stored under PREFIX_OBJ
+  for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
+    // trace an event after every million processed keys (typically every 5-10 seconds)
+    if (kv_count && (kv_count % count_interval == 0)) {
+      dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+    }
+
+    // fetch the key once per iteration; it is reused by every check below
+    const auto key = it->key();
+    dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+    ghobject_t oid;
+    if (!is_extent_shard_key(key)) {
+      // plain onode: the key itself is the object key
+      int r = get_key_object(key, &oid);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode key = "
+             << pretty_binary_string(key) << dendl;
+        return -EIO;
+      }
+      edecoder.reset(oid,
+                     &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+      Onode dummy_on(cct);
+      Onode::decode_raw(&dummy_on,
+                        it->value(),
+                        edecoder,
+                        segment_size != 0);
+      ++stats.onode_count;
+    } else {
+      // extent shard: recover the owning onode key before decoding
+      edecoder.reset_new_shard();
+      string okey;
+      uint32_t offset;
+      int r = get_key_extent_shard(key, &okey, &offset);
+      if (r != 0) {
+        derr << __func__ << " failed to decode onode extent key = "
+             << pretty_binary_string(key) << dendl;
+        return -EIO;
+      }
+      r = get_key_object(okey, &oid);
+      if (r != 0) {
+        derr << __func__
+             << " failed to decode onode key= " << pretty_binary_string(okey)
+             << " from extent key= " << pretty_binary_string(key)
+             << dendl;
+        return -EIO;
+      }
+      // a shard must belong to the onode decoded just before it
+      ceph_assert(oid == edecoder.get_oid());
+      edecoder.decode_some(it->value(), nullptr);
+      ++stats.shard_count;
+    }
+  }
+  return 0;
+}