From: Adam Kupczyk
Date: Mon, 2 Oct 2023 10:41:31 +0000 (+0000)
Subject: os/bluestore: Add data segmentation
X-Git-Tag: v20.3.0~140^2~8
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5ec786a0d6039cd957325a1b3a47f40863c885cd;p=ceph.git

os/bluestore: Add data segmentation

Split object data into segments of conf.bluestore_onode_segment_size bytes,
so that no blob ever belongs to two segments at the same time.
Modified the reshard function to prefer segment separation lines as shard
boundaries. As a result, no spanning blobs are created.

Signed-off-by: Adam Kupczyk
---

diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index e44838fc00c6..686102b308c6 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -6688,6 +6688,24 @@ options:
   desc: How long cleaner should sleep before re-checking utilization
   default: 5
   with_legacy: true
+- name: bluestore_onode_segment_size
+  type: size
+  level: advanced
+  desc: Size of a data segment for an onode.
+  long_desc: When an object grows too large, BlueStore splits its allocation metadata into
+    smaller RocksDB keys (shards). When multiple blobs overlap each other,
+    some of them might belong to more than one shard. The encoding for such a case
+    is inefficient (spanning blobs). Segmentation of data prevents blobs from crossing
+    specific separation lines, preventing spanning blobs altogether.
+    Smaller values give a better split into onode shards.
+    Larger values minimize the space lost to padding in compression.
+    Recommended values are 256K, 512K and 1024K. A value of 0 disables segmentation.
+    The actual segment size cannot be smaller than the "compression_max_blob_size" pool option, if set.
+  default: 0
+  see_also:
+  - bluestore_extent_map_shard_max_size
+  - bluestore_extent_map_shard_target_size
+  with_legacy: false
 - name: jaeger_tracing_enable
   type: bool
   level: advanced
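For readers who want to see the splitting rule in isolation, the following is a minimal, self-contained C++ sketch of how a write range is chopped at segment boundaries, mirroring the loop added to _do_write_data() below. It is illustration only, not BlueStore code; p2roundup is reimplemented locally and the 512 KiB segment size is just an example:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Local stand-in for Ceph's p2roundup() (include/intarith.h): round x up
    // to the next multiple of align, where align is a power of two.
    static uint64_t p2roundup(uint64_t x, uint64_t align) {
      return -(-x & -align);
    }

    // Split [offset, offset + length) into chunks that never cross a
    // segment_size line; each chunk corresponds to one _do_write_big() call.
    static std::vector<std::pair<uint64_t, uint64_t>>
    split_into_segments(uint64_t offset, uint64_t length, uint64_t segment_size) {
      std::vector<std::pair<uint64_t, uint64_t>> chunks;
      uint64_t write_offset = offset;
      while (write_offset < offset + length) {
        uint64_t segment_end = std::min(
          p2roundup(write_offset + 1, segment_size),  // next line above us...
          offset + length);                           // ...or the end of the write
        chunks.emplace_back(write_offset, segment_end - write_offset);
        write_offset = segment_end;
      }
      return chunks;
    }

    int main() {
      // Example: a write of 1124 KiB at offset 300 KiB with 512 KiB segments
      // yields three chunks, cut at the 512 KiB and 1024 KiB lines.
      for (auto [off, len] : split_into_segments(300 * 1024, 1124 * 1024, 512 * 1024)) {
        std::printf("chunk at 0x%llx len 0x%llx\n",
                    (unsigned long long)off, (unsigned long long)len);
      }
    }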
<< " extent " << *extent << dendl; - // disfavor shard boundaries that span a blob - bool would_span = (extent->logical_offset < max_blob_end) || extent->blob_offset; - if (estimate && - estimate + extent_avg > target + (would_span ? slop : 0)) { + bool make_shard_here = false; + if (segment_size != 0) { //onode data has strict boundaries + if (extent->blob_start() >= next_boundary) { + // beginning of the extent is a place that might be a shard boundary + // we want to decide whether to continue streaming to the current shard + // or move to the next one + if ((estimate >= target /*we have enough already*/) || + (estimate + encoded_segment_estimate >= (target * 3 / 2)) + /*we will be too large if we wait for next segment*/) { + make_shard_here = true; + } + next_boundary = p2roundup(extent->blob_end(), segment_size); + } + } else { + // disfavor shard boundaries that span a blob + bool would_span = (extent->logical_offset < max_blob_end) || (extent->blob_offset != 0); + if ((estimate > 0) + && (estimate + extent_avg > target + (would_span ? slop : 0))) { + make_shard_here = true; + } + } + if (make_shard_here) { // new shard if (offset == needs_reshard_begin) { new_shard_info.emplace_back(bluestore_onode_t::shard_info()); @@ -5838,7 +5864,8 @@ std::vector BlueStore::get_tracked_keys() const noexcept "bluestore_warn_on_legacy_statfs"s, "bluestore_warn_on_no_per_pool_omap"s, "bluestore_warn_on_no_per_pg_omap"s, - "bluestore_max_defer_interval"s + "bluestore_max_defer_interval"s, + "bluestore_onode_segment_size"s }; } @@ -5864,6 +5891,9 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf, _set_compression(); } } + if (changed.count("bluestore_onode_segment_size")) { + segment_size = (cct->_conf.get_val("bluestore_onode_segment_size")); + } if (changed.count("bluestore_max_blob_size") || changed.count("bluestore_max_blob_size_ssd") || changed.count("bluestore_max_blob_size_hdd")) { @@ -5974,11 +6004,11 @@ void BlueStore::_set_compression() def_compressor_alg = Compressor::COMP_ALG_NONE; alg_name = "(none)"; } - dout(10) << __func__ << " mode " << Compressor::get_comp_mode_name(comp_mode) << " alg " << alg_name << " min_blob " << comp_min_blob_size << " max_blob " << comp_max_blob_size + << " segment_size " << segment_size << dendl; } @@ -17373,8 +17403,20 @@ void BlueStore::_do_write_data( if (head_length) { _do_write_small(txc, c, o, head_offset, head_length, p, wctx); } - - _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx); + uint32_t segment_size = this->segment_size.load(); + if (segment_size) { + // split data to chunks + uint64_t write_offset = middle_offset; + while (write_offset < middle_offset + middle_length) { + uint64_t segment_end = std::min( + p2roundup(write_offset + 1, segment_size), + middle_offset + middle_length); + _do_write_big(txc, c, o, write_offset, segment_end - write_offset, p, wctx); + write_offset = segment_end; + } + } else { + _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx); + } if (tail_length) { _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx); @@ -17651,6 +17693,9 @@ int BlueStore::_do_write_v2( // if we have compression, skip to write_v1 return _do_write(txc, c, o, offset, length, bl, fadvise_flags); } + if (segment_size != 0 && wctx.target_blob_size > segment_size) { + wctx.target_blob_size = segment_size; + } if (bl.length() != length) { bl.splice(length, bl.length() - length); } @@ -19125,7 +19170,7 @@ void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn) // finalize extent_map shards 
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 9ca48cea4413..2064a9f5416c 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1101,7 +1101,8 @@ public:
     decltype(BlueStore::Blob::id) allocate_spanning_blob_id();
     void reshard(
       KeyValueDB *db,
-      KeyValueDB::Transaction t);
+      KeyValueDB::Transaction t,
+      uint32_t segment_size);
 
     /// initialize Shards from the onode
     void init_shards(bool loaded, bool dirty);
@@ -2521,6 +2522,7 @@ private:
   std::atomic<uint64_t> comp_max_blob_size = {0};
 
   std::atomic<uint64_t> max_blob_size = {0};  ///< maximum blob size
+  std::atomic<uint32_t> segment_size = {0};   ///< snapshot of conf value "bluestore_onode_segment_size"
 
   uint64_t kv_ios = 0;
   uint64_t kv_throttle_costs = 0;
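The small clamp added to _do_write_v2() is worth a standalone note: a blob larger than one segment could not avoid crossing a segment separation line, so the write path caps the target blob size at the segment size. A minimal sketch of the idea, with WriteContextLite standing in for BlueStore's real WriteContext:

    #include <atomic>
    #include <cstdint>

    // Why _do_write_v2() clamps the blob target: a blob larger than one
    // segment would necessarily cross a segment separation line, recreating
    // the spanning-blob problem segmentation is meant to remove.
    // WriteContextLite is a stand-in for BlueStore's WriteContext.
    struct WriteContextLite {
      uint32_t target_blob_size = 0;
    };

    // Snapshot of "bluestore_onode_segment_size", refreshed on conf change.
    std::atomic<uint32_t> segment_size{0};

    void clamp_blob_to_segment(WriteContextLite& wctx) {
      uint32_t seg = segment_size.load();
      if (seg != 0 && wctx.target_blob_size > seg) {
        wctx.target_blob_size = seg;  // a blob may never outgrow its segment
      }
    }

Because segment_size is an atomic snapshot refreshed by handle_conf_change(), the option takes effect at runtime without restarting the OSD.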