From: Adam Kupczyk Date: Tue, 11 Feb 2025 06:45:54 +0000 (+0000) Subject: os/bluestore: Create conf bluestore_onode_segmentation X-Git-Tag: v20.3.0~140^2~4 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=244491ffbd26415c8294b1ebec1de9e0449644c8;p=ceph.git os/bluestore: Create conf bluestore_onode_segmentation This debug grade conf selects bluestore_onode_t v2 or v3. In v2 mode it can read v3 but drops onode_segmentation and pretends it does not exist. It will act as a version that does not handle v3. Also it will write onodes in v2 back to DB, clearing segmentation. Signed-off-by: Adam Kupczyk --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index 686102b308c..6bbbca0cf38 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -5103,7 +5103,7 @@ options: type: bool level: advanced desc: Random selection of write path mode - long_desc: For testing purposes. If true, value of bluestore_write_v2 is randomly selected. + long_desc: For testing purposes. If true, value of bluestore_write_v2 is randomly selected on each mount. default: false see_also: - bluestore_write_v2 @@ -6705,6 +6705,19 @@ options: see_also: - bluestore_extent_map_shard_max_size - bluestore_extent_map_shard_target_size + - bluestore_debug_onode_segmentation_random + with_legacy: false +- name: bluestore_debug_onode_segmentation_random + type: bool + level: dev + desc: Random selection of onode segmentation + long_desc: For testing purposes. On each mount 50% roll decides whether to use + bluestore_onode_segment_size or set it to 0 (disable). 
+ default: false + see_also: + - bluestore_onode_segment_size + flags: + - startup with_legacy: false - name: jaeger_tracing_enable type: bool diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index b7fad7e75f3..d337d671154 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4814,11 +4814,12 @@ void BlueStore::Onode::put() void BlueStore::Onode::decode_raw( BlueStore::Onode* on, const bufferlist& v, - BlueStore::ExtentMap::ExtentDecoder& edecoder) + BlueStore::ExtentMap::ExtentDecoder& edecoder, + bool use_onode_segmentation) { on->exists = true; auto p = v.front().begin_deep(); - on->onode.decode(p); + on->onode.decode(p, use_onode_segmentation ? 0 : bluestore_onode_t::FLAG_DEBUG_FORCE_V2); // initialize extent_map edecoder.decode_spanning_blobs(p, on->c); @@ -4840,14 +4841,15 @@ BlueStore::Onode* BlueStore::Onode::create_decode( const ghobject_t& oid, const string& key, const bufferlist& v, - bool allow_empty) + bool allow_empty, + bool use_onode_segmentation) { ceph_assert(v.length() || allow_empty); Onode* on = new Onode(c.get(), oid, (const mempool::bluestore_cache_meta::string)(key)); if (v.length()) { ExtentMap::ExtentDecoderFull edecoder(on->extent_map); - decode_raw(on, v, edecoder); + decode_raw(on, v, edecoder, use_onode_segmentation); for (auto& i : on->onode.attrs) { i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta); @@ -5239,7 +5241,7 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode( } // new object, load onode if available - on = Onode::create_decode(this, oid, key, v, true); + on = Onode::create_decode(this, oid, key, v, true, store->segment_size != 0); o.reset(on); return onode_space.add_onode(oid, o); } @@ -9421,9 +9423,15 @@ int BlueStore::_mount() } use_write_v2 = cct->_conf.get_val("bluestore_write_v2"); if (cct->_conf.get_val("bluestore_write_v2_random")) { - srand(time(NULL)); + srand(time(NULL) * 11 + 3); use_write_v2 = rand() % 2; - 
cct->_conf.set_val("bluestore_write_v2", std::to_string(use_write_v2)); + } + segment_size = (cct->_conf.get_val("bluestore_onode_segment_size")); + if (cct->_conf.get_val("bluestore_debug_onode_segmentation_random")) { + srand(time(NULL) * 13 + 5); + if (rand() % 2) { + segment_size = 0; + } } _kv_only = false; if (cct->_conf->bluestore_fsck_on_mount) { @@ -9768,7 +9776,7 @@ void BlueStore::_fsck_foreach_shared_blob( << dendl; OnodeRef o; - o.reset(Onode::create_decode(c, oid, it->key(), it->value())); + o.reset(Onode::create_decode(c, oid, it->key(), it->value(), false, segment_size != 0)); o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); _dump_onode<30>(cct, *o); @@ -9924,7 +9932,7 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow( dout(10) << __func__ << " " << oid << dendl; OnodeRef o; - o.reset(Onode::create_decode(c, oid, key, value)); + o.reset(Onode::create_decode(c, oid, key, value, false, segment_size != 0)); ++num_objects; ++pool_fsck_stat->num_objects; num_spanning_blobs += o->extent_map.spanning_blob_map.size(); @@ -11312,7 +11320,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) << " obj:" << oid << dendl; OnodeRef o; - o.reset(Onode::create_decode(c, oid, it->key(), it->value())); + o.reset(Onode::create_decode(c, oid, it->key(), it->value(), false, segment_size != 0)); o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); mempool::bluestore_fsck::set blobs; @@ -12088,6 +12096,8 @@ void BlueStore::collect_metadata(map *pm) (*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size); (*pm)["bluestore_allocation_from_file"] = stringify(fm && fm->is_null_manager()); (*pm)["bluestore_allocator"] = alloc ? alloc->get_type() : "null"; + (*pm)["bluestore_write_mode"] = use_write_v2 ? "new" : "classic"; + (*pm)["bluestore_onode_segmentation"] = segment_size == 0 ?
"inactive" : "active"; } int BlueStore::get_numa_node( @@ -19204,7 +19214,8 @@ void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn) // bound encode size_t bound = 0; - denc(o->onode, bound); + uint64_t flag = segment_size != 0 ? 0 : bluestore_onode_t::FLAG_DEBUG_FORCE_V2; + denc(o->onode, bound, flag); o->extent_map.bound_encode_spanning_blobs(bound); if (o->onode.extent_map_shards.empty()) { denc(o->extent_map.inline_bl, bound); @@ -19215,7 +19226,7 @@ void BlueStore::_record_onode(OnodeRef& o, KeyValueDB::Transaction &txn) unsigned onode_part, blob_part, extent_part; { auto p = bl.get_contiguous_appender(bound, true); - denc(o->onode, p); + denc(o->onode, p, flag); onode_part = p.get_logical_offset(); o->extent_map.encode_spanning_blobs(p); blob_part = p.get_logical_offset() - onode_part; @@ -20637,7 +20648,8 @@ int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats Onode dummy_on(cct); Onode::decode_raw(&dummy_on, it->value(), - edecoder); + edecoder, + segment_size != 0); ++stats.onode_count; } else { uint32_t offset; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index a2cf4705cf6..058c79923da 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1404,14 +1404,16 @@ public: static void decode_raw( BlueStore::Onode* on, const bufferlist& v, - ExtentMap::ExtentDecoder& dencoder); + ExtentMap::ExtentDecoder& dencoder, + bool use_onode_segmentation); static Onode* create_decode( CollectionRef c, const ghobject_t& oid, const std::string& key, const ceph::buffer::list& v, - bool allow_empty = false); + bool allow_empty, + bool use_onode_segmentation); void dump(ceph::Formatter* f) const; @@ -2503,7 +2505,9 @@ private: std::atomic comp_max_blob_size = {0}; std::atomic max_blob_size = {0}; ///< maximum blob size - std::atomic segment_size = {0}; ///< snapshot of conf value "bluestore_onode_segment_size" + std::atomic segment_size = {0}; ///< snapshot of conf value
"bluestore_onode_segment_size" + /// When 0, bluestore_onode_t v2 is in force; otherwise v3 is used. + /// Ability to disable is important for efficient testing. uint64_t kv_ios = 0; uint64_t kv_throttle_costs = 0; diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index a05d2ae9c97..435455edc2a 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -1214,8 +1214,11 @@ struct bluestore_onode_t { FLAG_PERPG_OMAP); } - DENC(bluestore_onode_t, v, p) { - DENC_START(3, 1, p); + template + friend std::enable_if_t || + std::is_same_v> + _denc_friend(T& v, P& p, __u8& struct_v) + { denc_varint(v.nid, p); denc_varint(v.size, p); denc(v.attrs, p); @@ -1230,13 +1233,63 @@ struct bluestore_onode_t { if (struct_v >= 3) { denc(v.segment_size, p); } + } + + enum { + FLAG_DEBUG_FORCE_V2 = 1, // debug runtime flag to test transitions v2 <-> v3 + }; + + // Creation: + // Object created on Tentacle+, gets v3 version. + // Object gets its segment_size field initialized from bluestore_onode_segment_size. + // If pool opt `compression_max_blob_size` is set and it is larger, it will be used. + // + // Upgrade: + // Object created on earlier versions, when read on Tentacle+ gets segment_size = 0. + // This disables segmentation for the object. Tentacle will operate in legacy mode. + // When object is written, it will be encoded in v3, with segment_size = 0. + // In this mode spanning blobs are expected to be created. + // + // Downgrade: + // When older BlueStore reads an object it skips v3 specific segment_size setting. + // There is no change in any other encoding, object will be read without troubles. + // Object that is only read, does not lose its v3 version. + // When object is written back, it is encoded in v2, losing its segment_size setting.
+ DENC_HELPERS + void bound_encode(size_t& p, uint64_t features) const { + __u8 struct_v_to_use = 3; + if ((features & FLAG_DEBUG_FORCE_V2) != 0) { + struct_v_to_use = 2; + } + DENC_START(struct_v_to_use, 1, p); + _denc_friend(*this, p, struct_v_to_use); DENC_FINISH(p); } + void encode(::ceph::buffer::list::contiguous_appender& p, uint64_t features) const { + __u8 struct_v_to_use = 3; + if ((features & FLAG_DEBUG_FORCE_V2) != 0) { + struct_v_to_use = 2; + } + DENC_START(struct_v_to_use, 1, p); + DENC_DUMP_PRE(Type); + _denc_friend(*this, p, struct_v_to_use); + DENC_FINISH(p); + } + void decode(::ceph::buffer::ptr::const_iterator& p, uint64_t features = 0) { + DENC_START(3, 1, p); + _denc_friend(*this, p, struct_v); // decode whatever version is stored + if ((features & FLAG_DEBUG_FORCE_V2) != 0) { + this->segment_size = 0; + } + DENC_FINISH(p); + } + void dump(ceph::Formatter *f) const; static void generate_test_instances(std::list& o); }; WRITE_CLASS_DENC(bluestore_onode_t::shard_info) -WRITE_CLASS_DENC(bluestore_onode_t) +WRITE_CLASS_DENC_FEATURED(bluestore_onode_t) std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);