]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: Adding Runtime and Static Fragmentation Score to BlueStore
author Jaya Prakash <jayaprakash@ibm.com>
Tue, 20 Jan 2026 13:52:44 +0000 (13:52 +0000)
committer Jaya Prakash <jayaprakash@ibm.com>
Thu, 9 Apr 2026 16:59:53 +0000 (16:59 +0000)
Object fragmentation can increase metadata overhead and negatively impact
performance, but BlueStore currently lacks visibility into object-level
fragmentation.

This change adds object-level fragmentation metrics based on disjoint physical extents.
Runtime tracking counts physical extents accessed per read, while scrub-time
tracking records static extent runs per object and aggregates at the collection level.

Fixes: https://tracker.ceph.com/issues/74012

Signed-off-by: Jaya Prakash <jayaprakash@ibm.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 3eb318215b7db9241d95a4f4edcf2a1ea09188c8..3f211162cf70e01893d702b3907e687fbc1c316a 100644 (file)
@@ -5061,6 +5061,68 @@ void BlueStore::Onode::finish_write(TransContext* txc, uint32_t offset, uint32_t
   ldout(c->store->cct, 10) << __func__ << " done " << txc << dendl;
 }
 
+struct FragMetric {
+  // Computes fragmentation as the number of disjoint segments
+  // produced by a stream of mapped ranges.
+  // frag_score == current disjoint segment count.
+
+  std::unordered_set<uint64_t> endpoints;
+  uint64_t frag_score = 0;
+
+  FragMetric() {}
+
+  inline void note(uint64_t offset, uint64_t length) {
+    bool merge_left = endpoints.count(offset);
+    bool merge_right = endpoints.count(offset + length);
+    if (merge_left && merge_right) {
+      endpoints.erase(offset);
+      endpoints.erase(offset + length);
+      frag_score--;
+    } else if (merge_left) {
+      endpoints.erase(offset);
+      endpoints.insert(offset + length);
+    } else if (merge_right) {
+      endpoints.erase(offset + length);
+      endpoints.insert(offset);
+    } else {
+      endpoints.insert(offset);
+      endpoints.insert(offset + length);
+      frag_score++;
+    }
+  }
+};
+
+int BlueStore::Onode::get_fragmentation_score()
+{
+  FragMetric frag;
+
+  std::unordered_set<BlobRef> visited_compressed_blobs;
+
+  for (const auto& e : extent_map.extent_map) {
+    if (e.blob->get_blob().is_compressed()) {
+      if (visited_compressed_blobs.insert(e.blob).second) {
+        e.blob->get_blob().map(
+          0, e.blob->get_blob().get_ondisk_length(),
+          [&](uint64_t offset, uint64_t length) {
+            frag.note(offset, length);
+            return 0;
+          }
+        );
+      }
+    } else {
+      e.blob->get_blob().map(
+        e.blob_offset,
+        e.length,
+        [&](uint64_t phys_offset, uint64_t len) {
+          frag.note(phys_offset, len);
+          return 0;
+        }
+      );
+    }
+  }
+  return frag.frag_score;
+}
+
 // =======================================================
 // WriteContext
  
@@ -12988,6 +13050,45 @@ int BlueStore::_generate_read_result_bl(
   return 0;
 }
 
+void BlueStore::_measure_runtime_frag(
+  Collection *c,
+  const blobs2read_t& blobs2read)
+{
+  auto start = mono_clock::now();
+  FragMetric frag;
+  for (auto& p : blobs2read) {
+    const BlobRef& bptr = p.first;
+    const regions2read_t& r2r = p.second;
+    for (auto req : r2r) {
+      bptr->get_blob().map(
+        req.r_off, req.r_len,
+        [&](uint64_t offset, uint64_t length) {
+          frag.note(offset, length);
+          return 0;
+        });
+    }
+  }
+  if (frag.frag_score > 0) {
+    c->runtime_read_samples.fetch_add(1, std::memory_order_relaxed);
+    c->runtime_frag_count.fetch_add(frag.frag_score, std::memory_order_relaxed);
+  }
+}
+
+// Fold one object's fragmentation score into the collection totals.
+// If the collection has no recorded samples, the totals are overwritten
+// (score = this object's score, samples = 1); otherwise both are
+// incremented.
+void BlueStore::_measure_static_frag(
+  Collection *c,
+  const OnodeRef& o)
+{
+  const auto prior_samples =
+    c->object_read_samples.load(std::memory_order_relaxed);
+  const auto score = o->get_fragmentation_score();
+  if (prior_samples != 0) {
+    c->static_frag_score.fetch_add(score, std::memory_order_relaxed);
+    c->object_read_samples.fetch_add(1, std::memory_order_relaxed);
+    return;
+  }
+  c->static_frag_score.store(score, std::memory_order_relaxed);
+  c->object_read_samples.store(1, std::memory_order_relaxed);
+}
+
 int BlueStore::_do_read(
   Collection *c,
   OnodeRef& o,
@@ -13090,6 +13191,21 @@ int BlueStore::_do_read(
     );
   }
 
+  if (cct->_conf->bluestore_frag_runtime) {
+    _measure_runtime_frag(c, blobs2read);
+  }
+
+  if ((op_flags & CEPH_OSD_OP_FLAG_SCRUB) && cct->_conf->bluestore_frag_static) {
+    if (!o->extent_map.extent_map.empty()) {
+      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+      auto it = o->extent_map.extent_map.begin();
+      uint64_t first_extent_offset = it->logical_offset;
+      if (offset <= first_extent_offset && first_extent_offset < offset + length) {
+        _measure_static_frag(c, o);
+      }
+    }
+  }
+
   bool csum_error = false;
   r = _generate_read_result_bl(o, offset, length, ready_regions,
                               compressed_blob_bls, blobs2read,
@@ -13453,6 +13569,9 @@ int BlueStore::_do_readv(
     // we always issue aio for reading, so errors other than EIO are not allowed
     if (r < 0)
       return r;
+    if (cct->_conf->bluestore_frag_runtime) {
+      _measure_runtime_frag(c, std::get<2>(raw_results[i]));
+    }
   }
 
   auto num_ios = m.size();
@@ -13485,6 +13604,24 @@ int BlueStore::_do_readv(
     );
   }
 
+  if ((op_flags & CEPH_OSD_OP_FLAG_SCRUB) && cct->_conf->bluestore_frag_static) {
+    if (!o->extent_map.extent_map.empty()) {
+      o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+      auto it = o->extent_map.extent_map.begin();
+      uint64_t first_extent_offset = it->logical_offset;
+      for (auto& p : m) {
+        uint64_t off = p.first;
+        uint64_t len = p.second;
+
+        if (off <= first_extent_offset &&
+            first_extent_offset < off + len) {
+          _measure_static_frag(c, o);
+          break;
+        }
+      }
+    }
+  }
+
   ceph_assert(raw_results.size() == (size_t)m.num_intervals());
   i = 0;
   for (auto p = m.begin(); p != m.end(); p++, i++) {
index 95103d73f62a756d0fc4d74ae9d66034c6fa24f3..6beecfa17ae3d85e510d0f8a45f103e97e3b44fa 100644 (file)
@@ -1452,6 +1452,8 @@ public:
 
     void finish_write(TransContext* txc, uint32_t offset, uint32_t length);
 
+    int get_fragmentation_score();
+
     struct printer : public BlueStore::printer {
       const Onode &onode;
       uint16_t mode;
@@ -1682,6 +1684,11 @@ public:
     ContextQueue *commit_queue;
     std::unique_ptr<Estimator> estimator;
 
+    std::atomic<uint64_t> runtime_frag_count{0};
+    std::atomic<uint64_t> runtime_read_samples{0};
+    std::atomic<uint64_t> static_frag_score{0};
+    std::atomic<uint64_t> object_read_samples{0};
+
     OnodeCacheShard* get_onode_cache() const {
       return onode_space.cache;
     }
@@ -3302,6 +3309,10 @@ private:
     bool* csum_error,
     ceph::buffer::list& bl);
 
+  void _measure_runtime_frag(Collection *c, const blobs2read_t& blobs2read);
+
+  void _measure_static_frag(Collection *c, const OnodeRef& o);
+
   int _do_read(
     Collection *c,
     OnodeRef& o,