From: Adam Kupczyk Date: Mon, 9 Dec 2024 19:28:57 +0000 (+0000) Subject: os/bluestore: Expand bluestore_onode_t, v=2 -> v=3 X-Git-Tag: v20.3.0~140^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=999e204a604c2467535097f412c63eb6c2aa51f0;p=ceph.git os/bluestore: Expand bluestore_onode_t, v=2 -> v=3 Add a segment_size field to bluestore_onode_t. It is more reliable for an onode to keep its own segment size. Upgraded v2 to v3. Object creation on v3 (new BlueStore): The object gets its segment_size field initialized from bluestore_onode_segment_size. However, if the pool opt compression_max_blob_size is set and is larger, it will be used instead. Upgrade: An upgraded object has segment_size = 0 by default, so the new BlueStore does not use segmentation for it. Spanning blobs can be created. Downgrade: A downgraded object (one written by an older BlueStore) loses its segment_size setting. The older BlueStore will have no problem understanding it. Signed-off-by: Adam Kupczyk --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index b47bb61ef7c..b7fad7e75f3 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4860,6 +4860,13 @@ BlueStore::Onode* BlueStore::Onode::create_decode( } else { on->extent_map.init_shards(false, false); } + } else { + // init segment_size + uint32_t segment_size = c->store->segment_size.load(); + if (c->comp_max_blob_size.has_value() && segment_size < c->comp_max_blob_size.value()) { + segment_size = c->comp_max_blob_size.value(); // compression larger than global segment_size, use it + } + on->onode.segment_size = segment_size; } return on; } @@ -17418,7 +17425,7 @@ void BlueStore::_do_write_data( if (head_length) { _do_write_small(txc, c, o, head_offset, head_length, p, wctx); } - uint32_t segment_size = this->segment_size.load(); + uint32_t segment_size = o->onode.segment_size; if (segment_size) { // split data to chunks uint64_t write_offset = middle_offset; @@ -17708,8 +17715,8 
@@ int BlueStore::_do_write_v2( // if we have compression, skip to write_v1 return _do_write(txc, c, o, offset, length, bl, fadvise_flags); } - if (segment_size != 0 && wctx.target_blob_size > segment_size) { - wctx.target_blob_size = segment_size; + if (o->onode.segment_size != 0 && wctx.target_blob_size > o->onode.segment_size) { + wctx.target_blob_size = o->onode.segment_size; } if (bl.length() != length) { bl.splice(length, bl.length() - length); @@ -19185,7 +19192,7 @@ void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn) // finalize extent_map shards o->extent_map.update(txn, false); if (o->extent_map.needs_reshard()) { - o->extent_map.reshard(db, txn, segment_size); + o->extent_map.reshard(db, txn, o->onode.segment_size); o->extent_map.update(txn, true); if (o->extent_map.needs_reshard()) { dout(20) << __func__ << " warning: still wants reshard, check options?" diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index a7dc6cc52f4..a05d2ae9c97 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -1135,6 +1135,7 @@ struct bluestore_onode_t { uint32_t expected_object_size = 0; uint32_t expected_write_size = 0; uint32_t alloc_hint_flags = 0; + uint32_t segment_size = 0; ///< mandatory segment lines to never cross; helps with sharding uint8_t flags = 0; @@ -1214,7 +1215,7 @@ struct bluestore_onode_t { } DENC(bluestore_onode_t, v, p) { - DENC_START(2, 1, p); + DENC_START(3, 1, p); denc_varint(v.nid, p); denc_varint(v.size, p); denc(v.attrs, p); @@ -1226,6 +1227,9 @@ struct bluestore_onode_t { if (struct_v >= 2) { denc(v.zone_offset_refs, p); } + if (struct_v >= 3) { + denc(v.segment_size, p); + } DENC_FINISH(p); } void dump(ceph::Formatter *f) const; diff --git a/src/tools/ceph-dencoder/osd_types.h b/src/tools/ceph-dencoder/osd_types.h index 749233e3b7d..35bb5a45c93 100644 --- a/src/tools/ceph-dencoder/osd_types.h +++ b/src/tools/ceph-dencoder/osd_types.h @@ -118,7 +118,7 @@ 
TYPE(bluestore_blob_use_tracker_t) // approach. // TYPE_FEATUREFUL(bluestore_blob_t) TYPE(bluestore_shared_blob_t) -TYPE(bluestore_onode_t) +TYPE_FEATUREFUL(bluestore_onode_t) TYPE(bluestore_onode_t::shard_info) using shard_info = bluestore_onode_t::shard_info; TYPE(shard_info)