os/bluestore: Add data segmentation

author Adam Kupczyk <akupczyk@ibm.com>

Mon, 2 Oct 2023 10:41:31 +0000 (10:41 +0000)

committer Adam Kupczyk <akupczyk@ibm.com>

Mon, 31 Mar 2025 07:32:43 +0000 (07:32 +0000)
author Adam Kupczyk <akupczyk@ibm.com>
Mon, 2 Oct 2023 10:41:31 +0000 (10:41 +0000)
committer Adam Kupczyk <akupczyk@ibm.com>
Mon, 31 Mar 2025 07:32:43 +0000 (07:32 +0000)
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in

index e44838fc00c6bf0a1409a36bd848679bc338b627..686102b308c682f4d34da4a29e7d651539fdb6ee 100644 (file)
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -6688,6 +6688,24 @@ options:
    desc: How long cleaner should sleep before re-checking utilization
    default: 5
    with_legacy: true
+- name: bluestore_onode_segment_size
+  type: size
+  level: advanced
+  desc: Size of segment for onode.
+  long_desc: When an object grows too large, BlueStore splits its allocation metadata into
+    smaller RocksDB keys (shards). When multiple blobs overlap each other,
+    some of them might belong to more than one shard. The encoding for such blobs
+    (spanning blobs) is inefficient. Segmentation of data prevents blobs from crossing
+    specific separation lines, preventing spanning blobs altogether.
+    Smaller values give a better split into onode shards.
+    Larger values minimize space lost to padding in compression.
+    Recommended values are 256K, 512K, 1024K. Value 0 disables segmentation.
+    Actual segment size cannot be smaller than "compression_max_blob_size" pool option, if set.
+  default: 0
+  see_also:
+  - bluestore_extent_map_shard_max_size
+  - bluestore_extent_map_shard_target_size
+  with_legacy: false
  - name: jaeger_tracing_enable
    type: bool
    level: advanced
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc

index 7917e03d2e05f2c8e3df968cf1d86e3c8a59ecc3..805a4d5b1408ac74817f783f8d40b92f8278b81b 100644 (file)
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -3572,12 +3572,13 @@ bid_t BlueStore::ExtentMap::allocate_spanning_blob_id()
  
  void BlueStore::ExtentMap::reshard(
    KeyValueDB *db,
-  KeyValueDB::Transaction t)
+  KeyValueDB::Transaction t,
+  uint32_t segment_size)
  {
    auto cct = onode->c->store->cct; // used by dout
  
    dout(10) << __func__ << " 0x[" << std::hex << needs_reshard_begin << ","
-          << needs_reshard_end << ")" << std::dec
+          << needs_reshard_end << ") segment 0x" << segment_size << std::dec
            << " of " << onode->onode.extent_map_shards.size()
            << " shards on " << onode->oid << dendl;
    for (auto& p : spanning_blob_map) {
@@ -3613,6 +3614,10 @@ void BlueStore::ExtentMap::reshard(
    }
  
    fault_range(db, needs_reshard_begin, (needs_reshard_end - needs_reshard_begin));
+  uint64_t data_reshard_end = needs_reshard_end;
+  if (needs_reshard_end == OBJECT_MAX_SIZE && !extent_map.empty()) {
+    data_reshard_end = extent_map.rbegin()->blob_end();
+  }
  
    // we may need to fault in a larger interval later must have all
    // referring extents for spanning blobs loaded in order to have
@@ -3650,6 +3655,9 @@ void BlueStore::ExtentMap::reshard(
    dout(20) << __func__ << "  extent_avg " << extent_avg << ", target " << target
            << ", slop " << slop << dendl;
  
+  uint32_t next_boundary = segment_size;
+  uint32_t encoded_segment_estimate = bytes * segment_size / (data_reshard_end - needs_reshard_begin);
+
    // reshard
    unsigned estimate = 0;
    unsigned offset = needs_reshard_begin;
@@ -3664,10 +3672,28 @@ void BlueStore::ExtentMap::reshard(
      }
      dout(30) << " extent " << *extent << dendl;
  
-    // disfavor shard boundaries that span a blob
-    bool would_span = (extent->logical_offset < max_blob_end) || extent->blob_offset;
-    if (estimate &&
-       estimate + extent_avg > target + (would_span ? slop : 0)) {
+    bool make_shard_here = false;
+    if (segment_size != 0) { //onode data has strict boundaries
+      if (extent->blob_start() >= next_boundary) {
+        // beginning of the extent is a place that might be a shard boundary
+        // we want to decide whether to continue streaming to the current shard
+        // or move to the next one
+       if ((estimate >= target /*we have enough already*/) ||
+           (estimate + encoded_segment_estimate >= (target * 3 / 2))
+           /*we will be too large if we wait for next segment*/) {
+         make_shard_here = true;
+       }
+       next_boundary = p2roundup(extent->blob_end(), segment_size);
+      }
+    } else {
+      // disfavor shard boundaries that span a blob
+      bool would_span = (extent->logical_offset < max_blob_end) || (extent->blob_offset != 0);
+      if ((estimate > 0)
+          && (estimate + extent_avg > target + (would_span ? slop : 0))) {
+       make_shard_here = true;
+      }
+    }
+    if (make_shard_here) {
        // new shard
        if (offset == needs_reshard_begin) {
         new_shard_info.emplace_back(bluestore_onode_t::shard_info());
@@ -5838,7 +5864,8 @@ std::vector<std::string> BlueStore::get_tracked_keys() const noexcept
      "bluestore_warn_on_legacy_statfs"s,
      "bluestore_warn_on_no_per_pool_omap"s,
      "bluestore_warn_on_no_per_pg_omap"s,
-    "bluestore_max_defer_interval"s
+    "bluestore_max_defer_interval"s,
+    "bluestore_onode_segment_size"s
    };
  }
  
@@ -5864,6 +5891,9 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf,
        _set_compression();
      }
    }
+  if (changed.count("bluestore_onode_segment_size")) {
+    segment_size = (cct->_conf.get_val<Option::size_t>("bluestore_onode_segment_size"));
+  }
    if (changed.count("bluestore_max_blob_size") ||
        changed.count("bluestore_max_blob_size_ssd") ||
        changed.count("bluestore_max_blob_size_hdd")) {
@@ -5974,11 +6004,11 @@ void BlueStore::_set_compression()
      def_compressor_alg = Compressor::COMP_ALG_NONE;
      alg_name = "(none)";
    }
-
    dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode)
            << " alg " << alg_name
            << " min_blob " << comp_min_blob_size
            << " max_blob " << comp_max_blob_size
+           << " segment_size " << segment_size
            << dendl;
  }
  
@@ -17373,8 +17403,20 @@ void BlueStore::_do_write_data(
      if (head_length) {
        _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
      }
-
-    _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
+    uint32_t segment_size = this->segment_size.load();
+    if (segment_size) {
+      // split data to chunks
+      uint64_t write_offset = middle_offset;
+      while (write_offset < middle_offset + middle_length) {
+       uint64_t segment_end = std::min(
+         p2roundup<uint64_t>(write_offset + 1, segment_size),
+         middle_offset + middle_length);
+       _do_write_big(txc, c, o, write_offset, segment_end - write_offset, p, wctx);
+       write_offset = segment_end;
+      }
+    } else {
+      _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
+    }
  
      if (tail_length) {
        _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
@@ -17651,6 +17693,9 @@ int BlueStore::_do_write_v2(
      // if we have compression, skip to write_v1
      return _do_write(txc, c, o, offset, length, bl, fadvise_flags);
    }
+  if (segment_size != 0 && wctx.target_blob_size > segment_size) {
+    wctx.target_blob_size = segment_size;
+  }
    if (bl.length() != length) {
      bl.splice(length, bl.length() - length);
    }
@@ -19125,7 +19170,7 @@ void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn)
    // finalize extent_map shards
    o->extent_map.update(txn, false);
    if (o->extent_map.needs_reshard()) {
-    o->extent_map.reshard(db, txn);
+    o->extent_map.reshard(db, txn, segment_size);
      o->extent_map.update(txn, true);
      if (o->extent_map.needs_reshard()) {
        dout(20) << __func__ << " warning: still wants reshard, check options?"
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h

index 9ca48cea44136e96fcc0ce5dfa9d11a20d3b4398..2064a9f5416c96472c6e69e99f622140efd2a507 100644 (file)
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1101,7 +1101,8 @@ public:
      decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
      void reshard(
        KeyValueDB *db,
-      KeyValueDB::Transaction t);
+      KeyValueDB::Transaction t,
+      uint32_t segment_size);
  
      /// initialize Shards from the onode
      void init_shards(bool loaded, bool dirty);
@@ -2521,6 +2522,7 @@ private:
    std::atomic<uint64_t> comp_max_blob_size = {0};
  
    std::atomic<uint64_t> max_blob_size = {0};  ///< maximum blob size
+  std::atomic<uint32_t> segment_size = {0};  ///< snapshot of conf value "bluestore_onode_segment_size"
  
    uint64_t kv_ios = 0;
    uint64_t kv_throttle_costs = 0;
author	Adam Kupczyk <akupczyk@ibm.com>
	Mon, 2 Oct 2023 10:41:31 +0000 (10:41 +0000)
committer	Adam Kupczyk <akupczyk@ibm.com>
	Mon, 31 Mar 2025 07:32:43 +0000 (07:32 +0000)
src/common/options/global.yaml.in		patch \| blob \| history
src/os/bluestore/BlueStore.cc		patch \| blob \| history
src/os/bluestore/BlueStore.h		patch \| blob \| history