From: Jaya Prakash
Date: Tue, 20 Jan 2026 13:52:44 +0000 (+0000)
Subject: os/bluestore: Adding Runtime and Static Fragmentation Score to BlueStore
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=051833ce94ae1e5c218af8f6bd772e5f0c7f0092;p=ceph.git

os/bluestore: Adding Runtime and Static Fragmentation Score to BlueStore

Object fragmentation can increase metadata overhead and negatively
impact performance, but BlueStore currently lacks visibility into
object-level fragmentation.

This change adds object-level fragmentation metrics based on disjoint
physical extents. Runtime tracking counts physical extents accessed per
read, while scrub-time tracking records static extent runs per object
and aggregates at the collection level.

Fixes: https://tracker.ceph.com/issues/74012
Signed-off-by: Jaya Prakash
---
Review notes (not part of the commit message):
 - This copy of the patch had lost its line breaks and every template
   argument list written in angle brackets. The arguments were restored
   from usage: std::unordered_set<uint64_t>, std::atomic<uint64_t>,
   and -- TODO confirm against the tree -- std::unordered_set<BlobRef>
   and std::unique_ptr<Estimator>.
 - Context-line indentation and blank lines are reconstructed; the
   "index" hashes are the original ones and do not describe the
   modified post-image. Hunk line counts were recomputed.
 - _measure_runtime_frag: dropped the unused mono_clock::now() call and
   the per-request copy in the inner loop.
 - _measure_static_frag: replaced the load / compare / store first-sample
   path with unconditional fetch_add. If a reset of object_read_samples
   is ever added, it has to reset static_frag_score as well.
 - FragMetric::note: zero-length ranges are ignored.

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 3eb318215b7d..3f211162cf70 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -5061,6 +5061,88 @@ void BlueStore::Onode::finish_write(TransContext* txc, uint32_t offset, uint32_t
   ldout(c->store->cct, 10) << __func__ << " done " << txc << dendl;
 }
 
+struct FragMetric {
+  // Computes fragmentation as the number of disjoint segments
+  // produced by a stream of mapped ranges.
+  // frag_score == current disjoint segment count.
+  //
+  // Only segment endpoints are remembered, so the count is exact only
+  // while the noted ranges do not overlap one another. Ranges that
+  // touch end-to-start are merged into a single segment.
+
+  std::unordered_set<uint64_t> endpoints;
+  uint64_t frag_score = 0;
+
+  FragMetric() {}
+
+  // Record the range [offset, offset + length).
+  inline void note(uint64_t offset, uint64_t length) {
+    if (length == 0) {
+      // An empty range covers nothing; letting it through would count
+      // it as a segment, or un-count an existing one.
+      return;
+    }
+    bool merge_left = endpoints.count(offset);
+    bool merge_right = endpoints.count(offset + length);
+    if (merge_left && merge_right) {
+      // Both ends are known endpoints: two segments become one.
+      endpoints.erase(offset);
+      endpoints.erase(offset + length);
+      frag_score--;
+    } else if (merge_left) {
+      // Starts at a known endpoint: move that endpoint to the new end.
+      endpoints.erase(offset);
+      endpoints.insert(offset + length);
+    } else if (merge_right) {
+      // Ends at a known endpoint: move that endpoint to the new start.
+      endpoints.erase(offset + length);
+      endpoints.insert(offset);
+    } else {
+      // Touches no known endpoint: a new segment.
+      endpoints.insert(offset);
+      endpoints.insert(offset + length);
+      frag_score++;
+    }
+  }
+};
+
+// Returns the number of disjoint physical segments backing this onode,
+// computed over the extents currently present in extent_map. A compressed
+// blob is mapped once, over its whole on-disk length, no matter how many
+// extents reference it.
+int BlueStore::Onode::get_fragmentation_score()
+{
+  FragMetric frag;
+
+  std::unordered_set<BlobRef> visited_compressed_blobs;
+
+  for (const auto& e : extent_map.extent_map) {
+    if (e.blob->get_blob().is_compressed()) {
+      if (visited_compressed_blobs.insert(e.blob).second) {
+        e.blob->get_blob().map(
+          0, e.blob->get_blob().get_ondisk_length(),
+          [&](uint64_t offset, uint64_t length) {
+            frag.note(offset, length);
+            return 0;
+          }
+        );
+      }
+    } else {
+      e.blob->get_blob().map(
+        e.blob_offset,
+        e.length,
+        [&](uint64_t phys_offset, uint64_t len) {
+          frag.note(phys_offset, len);
+          return 0;
+        }
+      );
+    }
+  }
+  // frag_score is 64-bit; make the narrowing to the declared return
+  // type explicit.
+  return static_cast<int>(frag.frag_score);
+}
+
 // =======================================================
 // WriteContext
 
@@ -12988,6 +13070,49 @@ int BlueStore::_generate_read_result_bl(
   return 0;
 }
 
+// Runtime fragmentation sample: counts the disjoint physical segments
+// touched by the given blob reads and, if there are any, adds the count
+// to the collection's running totals.
+void BlueStore::_measure_runtime_frag(
+  Collection *c,
+  const blobs2read_t& blobs2read)
+{
+  FragMetric frag;
+  for (const auto& p : blobs2read) {
+    const BlobRef& bptr = p.first;
+    const regions2read_t& r2r = p.second;
+    // Bind by reference: only r_off/r_len are read, so copying each
+    // request would be wasted work on the read path.
+    for (const auto& req : r2r) {
+      bptr->get_blob().map(
+        req.r_off, req.r_len,
+        [&](uint64_t offset, uint64_t length) {
+          frag.note(offset, length);
+          return 0;
+        });
+    }
+  }
+  if (frag.frag_score > 0) {
+    c->runtime_read_samples.fetch_add(1, std::memory_order_relaxed);
+    c->runtime_frag_count.fetch_add(frag.frag_score,
+                                    std::memory_order_relaxed);
+  }
+}
+
+// Static fragmentation sample: adds this object's fragmentation score to
+// the collection's totals and counts the object as sampled.
+void BlueStore::_measure_static_frag(
+  Collection *c,
+  const OnodeRef& o)
+{
+  const auto frag_score = o->get_fragmentation_score();
+  // Both counters start at zero, so the first sample needs no special
+  // case; a load-then-store path for it would let concurrent callers
+  // overwrite each other's update.
+  c->static_frag_score.fetch_add(frag_score, std::memory_order_relaxed);
+  c->object_read_samples.fetch_add(1, std::memory_order_relaxed);
+}
+
 int BlueStore::_do_read(
   Collection *c,
   OnodeRef& o,
@@ -13090,6 +13215,23 @@ int BlueStore::_do_read(
     );
   }
 
+  if (cct->_conf->bluestore_frag_runtime) {
+    _measure_runtime_frag(c, blobs2read);
+  }
+
+  if ((op_flags & CEPH_OSD_OP_FLAG_SCRUB) && cct->_conf->bluestore_frag_static) {
+    if (!o->extent_map.extent_map.empty()) {
+      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+      auto it = o->extent_map.extent_map.begin();
+      uint64_t first_extent_offset = it->logical_offset;
+      // Score the object only from the read whose range contains the
+      // start of its first extent, not from every read of the object.
+      if (offset <= first_extent_offset && first_extent_offset < offset + length) {
+        _measure_static_frag(c, o);
+      }
+    }
+  }
+
   bool csum_error = false;
   r = _generate_read_result_bl(o, offset, length, ready_regions,
                                compressed_blob_bls, blobs2read,
@@ -13453,6 +13595,9 @@ int BlueStore::_do_readv(
     // we always issue aio for reading, so errors other than EIO are not allowed
     if (r < 0)
       return r;
+    if (cct->_conf->bluestore_frag_runtime) {
+      _measure_runtime_frag(c, std::get<2>(raw_results[i]));
+    }
   }
 
   auto num_ios = m.size();
@@ -13485,6 +13630,26 @@ int BlueStore::_do_readv(
     );
   }
 
+  if ((op_flags & CEPH_OSD_OP_FLAG_SCRUB) && cct->_conf->bluestore_frag_static) {
+    if (!o->extent_map.extent_map.empty()) {
+      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+      auto it = o->extent_map.extent_map.begin();
+      uint64_t first_extent_offset = it->logical_offset;
+      // Score the object only if one of the requested intervals contains
+      // the start of its first extent, and then only once.
+      for (const auto& p : m) {
+        uint64_t off = p.first;
+        uint64_t len = p.second;
+
+        if (off <= first_extent_offset &&
+            first_extent_offset < off + len) {
+          _measure_static_frag(c, o);
+          break;
+        }
+      }
+    }
+  }
+
   ceph_assert(raw_results.size() == (size_t)m.num_intervals());
   i = 0;
   for (auto p = m.begin(); p != m.end(); p++, i++) {
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 95103d73f62a..6beecfa17ae3 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1452,6 +1452,9 @@ public:
 
     void finish_write(TransContext* txc, uint32_t offset, uint32_t length);
 
+    /// Number of disjoint physical segments backing this object's extents.
+    int get_fragmentation_score();
+
     struct printer : public BlueStore::printer {
       const Onode &onode;
       uint16_t mode;
@@ -1682,6 +1685,15 @@ public:
     ContextQueue *commit_queue;
     std::unique_ptr<Estimator> estimator;
 
+    // Fragmentation statistics, updated with relaxed atomic operations
+    // by _measure_runtime_frag() and _measure_static_frag().
+    // runtime: sum of per-read segment counts / number of reads sampled.
+    std::atomic<uint64_t> runtime_frag_count{0};
+    std::atomic<uint64_t> runtime_read_samples{0};
+    // static: sum of per-object scores / number of objects sampled.
+    std::atomic<uint64_t> static_frag_score{0};
+    std::atomic<uint64_t> object_read_samples{0};
+
     OnodeCacheShard* get_onode_cache() const {
       return onode_space.cache;
     }
@@ -3302,6 +3314,12 @@ private:
     bool* csum_error,
     ceph::buffer::list& bl);
 
+  /// Sample runtime fragmentation for the given blob reads.
+  void _measure_runtime_frag(Collection *c, const blobs2read_t& blobs2read);
+
+  /// Add one object's fragmentation score to its collection's totals.
+  void _measure_static_frag(Collection *c, const OnodeRef& o);
+
   int _do_read(
     Collection *c,
     OnodeRef& o,