+ // Decode the raw onode value `v` into `on`.
+ // use_onode_segmentation == false forces the legacy v2 onode decode path
+ // (FLAG_DEBUG_FORCE_V2 zeroes segment_size after decode) so that
+ // v2 <-> v3 transitions can be exercised in tests.
void BlueStore::Onode::decode_raw(
BlueStore::Onode* on,
const bufferlist& v,
- BlueStore::ExtentMap::ExtentDecoder& edecoder)
+ BlueStore::ExtentMap::ExtentDecoder& edecoder,
+ bool use_onode_segmentation)
{
on->exists = true;
auto p = v.front().begin_deep();
- on->onode.decode(p);
+ on->onode.decode(p, use_onode_segmentation ? 0 : bluestore_onode_t::FLAG_DEBUG_FORCE_V2); // flag drops segment_size after decode
// initialize extent_map
edecoder.decode_spanning_blobs(p, on->c);
const ghobject_t& oid,
const string& key,
const bufferlist& v,
- bool allow_empty)
+ bool allow_empty,
+ bool use_onode_segmentation)
{
ceph_assert(v.length() || allow_empty);
Onode* on = new Onode(c.get(), oid, (const mempool::bluestore_cache_meta::string)(key));
if (v.length()) {
ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
- decode_raw(on, v, edecoder);
+ decode_raw(on, v, edecoder, use_onode_segmentation);
for (auto& i : on->onode.attrs) {
i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
}
// new object, load onode if available
- on = Onode::create_decode(this, oid, key, v, true);
+ on = Onode::create_decode(this, oid, key, v, true, store->segment_size != 0);
o.reset(on);
return onode_space.add_onode(oid, o);
}
}
use_write_v2 = cct->_conf.get_val<bool>("bluestore_write_v2");
if (cct->_conf.get_val<bool>("bluestore_write_v2_random")) {
- srand(time(NULL));
+ srand(time(NULL) * 11 + 3); // affine-transformed seed so this debug coin flip differs from the one below
use_write_v2 = rand() % 2;
- cct->_conf.set_val("bluestore_write_v2", std::to_string(use_write_v2));
+ } // NOTE(review): the randomized choice is no longer written back via set_val — confirm nothing reads "bluestore_write_v2" from conf afterwards
+ segment_size = (cct->_conf.get_val<Option::size_t>("bluestore_onode_segment_size"));
+ if (cct->_conf.get_val<bool>("bluestore_debug_onode_segmentation_random")) {
+ srand(time(NULL) * 13 + 5); // re-seed with a different transform of the same timestamp
+ if (rand() % 2) {
+ segment_size = 0; // randomly disable segmentation to widen test coverage
+ }
}
_kv_only = false;
if (cct->_conf->bluestore_fsck_on_mount) {
<< dendl;
OnodeRef o;
- o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
+ o.reset(Onode::create_decode(c, oid, it->key(), it->value(), false, segment_size != 0));
o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
_dump_onode<30>(cct, *o);
dout(10) << __func__ << " " << oid << dendl;
OnodeRef o;
- o.reset(Onode::create_decode(c, oid, key, value));
+ o.reset(Onode::create_decode(c, oid, key, value, false, segment_size != 0));
++num_objects;
++pool_fsck_stat->num_objects;
num_spanning_blobs += o->extent_map.spanning_blob_map.size();
<< " obj:" << oid << dendl;
OnodeRef o;
- o.reset(Onode::create_decode(c, oid, it->key(), it->value()));
+ o.reset(Onode::create_decode(c, oid, it->key(), it->value(), false, segment_size != 0));
o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
mempool::bluestore_fsck::set<BlobRef> blobs;
(*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size);
(*pm)["bluestore_allocation_from_file"] = stringify(fm && fm->is_null_manager());
(*pm)["bluestore_allocator"] = alloc ? alloc->get_type() : "null";
+ (*pm)["bluestore_write_mode"] = use_write_v2 ? "new" : "classic";
+ (*pm)["bluestore_onode_segmentation"] = segment_size == 0 ? "inactive" : "active";
}
int BlueStore::get_numa_node(
// bound encode
size_t bound = 0;
- denc(o->onode, bound);
+ uint64_t flag = segment_size != 0 ? 0 : bluestore_onode_t::FLAG_DEBUG_FORCE_V2;
+ denc(o->onode, bound, flag);
o->extent_map.bound_encode_spanning_blobs(bound);
if (o->onode.extent_map_shards.empty()) {
denc(o->extent_map.inline_bl, bound);
unsigned onode_part, blob_part, extent_part;
{
auto p = bl.get_contiguous_appender(bound, true);
- denc(o->onode, p);
+ denc(o->onode, p, flag);
onode_part = p.get_logical_offset();
o->extent_map.encode_spanning_blobs(p);
blob_part = p.get_logical_offset() - onode_part;
Onode dummy_on(cct);
Onode::decode_raw(&dummy_on,
it->value(),
- edecoder);
+ edecoder,
+ segment_size != 0);
++stats.onode_count;
} else {
uint32_t offset;
+ // use_onode_segmentation == false forces the legacy v2 onode encoding/decoding
+ // (bluestore_onode_t::FLAG_DEBUG_FORCE_V2); used to test v2 <-> v3 transitions.
static void decode_raw(
BlueStore::Onode* on,
const bufferlist& v,
- ExtentMap::ExtentDecoder& dencoder);
+ ExtentMap::ExtentDecoder& dencoder,
+ bool use_onode_segmentation);
+ // NOTE(review): the default value for allow_empty was removed — callers must
+ // now pass both trailing flags explicitly; confirm all call sites were updated.
static Onode* create_decode(
CollectionRef c,
const ghobject_t& oid,
const std::string& key,
const ceph::buffer::list& v,
- bool allow_empty = false);
+ bool allow_empty,
+ bool use_onode_segmentation);
void dump(ceph::Formatter* f) const;
std::atomic<uint64_t> comp_max_blob_size = {0};
std::atomic<uint64_t> max_blob_size = {0}; ///< maximum blob size
- std::atomic<uint32_t> segment_size = {0}; ///< snapshot of conf value "bluestore_onode_segment_size"
+ std::atomic<uint32_t> segment_size = {0}; ///< snapshot of conf value "bluestore_onode_segment_size"
+ /// When 0, bluestore_onode_t v2 encoding is in force; otherwise v3 is used.
+ /// The ability to disable segmentation is important for efficient testing.
uint64_t kv_ios = 0;
uint64_t kv_throttle_costs = 0;
FLAG_PERPG_OMAP);
}
- DENC(bluestore_onode_t, v, p) {
- DENC_START(3, 1, p);
+ // Shared field-by-field body used by both encode and decode paths:
+ // T is const bluestore_onode_t for encode and non-const for decode,
+ // P the matching appender/iterator; segment_size only exists in struct_v >= 3.
+ template<typename T, typename P>
+ friend std::enable_if_t<std::is_same_v<T, bluestore_onode_t> ||
+ std::is_same_v<T, const bluestore_onode_t>>
+ _denc_friend(T& v, P& p, __u8& struct_v)
+ {
denc_varint(v.nid, p);
denc_varint(v.size, p);
denc(v.attrs, p);
if (struct_v >= 3) {
denc(v.segment_size, p);
}
+ }
+
+ enum {
+ FLAG_DEBUG_FORCE_V2 = 1, // debug runtime flag to test transitions v2 <-> v3
+ };
+
+ // Creation:
+ // An object created on Tentacle+ gets the v3 version.
+ // Its segment_size field is initialized from bluestore_onode_segment_size.
+ // If pool opt `compression_max_blob_size` is set and it is larger, it will be used.
+ //
+ // Upgrade:
+ // An object created on an earlier version gets segment_size = 0 when read on Tentacle+.
+ // This disables segmentation for the object; Tentacle will operate in legacy mode.
+ // When the object is written, it will be encoded as v3 with segment_size = 0.
+ // In this mode spanning blobs are expected to be created.
+ //
+ // Downgrade:
+ // When an older BlueStore reads an object, it skips the v3-specific segment_size setting.
+ // There is no change to any other encoding, so the object will be read without trouble.
+ // An object that is only read does not lose its v3 version.
+ // When the object is written back, it is encoded as v2, losing its segment_size setting.
+
+ DENC_HELPERS
+ // Worst-case (upper bound) encoded size for this onode, honouring the
+ // FLAG_DEBUG_FORCE_V2 feature bit so the bound matches what encode() emits.
+ void bound_encode(size_t& p, uint64_t features) const {
+ __u8 struct_v_to_use = 3;
+ if ((features & FLAG_DEBUG_FORCE_V2) != 0) {
+ struct_v_to_use = 2; // force legacy (pre-segment_size) envelope for testing
+ }
+ DENC_START(struct_v_to_use, 1, p);
+ _denc_friend(*this, p, struct_v_to_use);
DENC_FINISH(p);
}
+ // Encode; FLAG_DEBUG_FORCE_V2 in `features` forces the v2 envelope
+ // (segment_size omitted), matching the struct_v choice in bound_encode.
+ void encode(::ceph::buffer::list::contiguous_appender& p, uint64_t features) const {
+ __u8 struct_v_to_use = 3;
+ if ((features & FLAG_DEBUG_FORCE_V2) != 0) {
+ struct_v_to_use = 2;
+ }
+ DENC_START(struct_v_to_use, 1, p);
+ DENC_DUMP_PRE(Type); // NOTE(review): literal 'Type' passed to DENC_DUMP_PRE — confirm this expands as intended outside the DENC() macro
+ _denc_friend(*this, p, struct_v_to_use);
+ DENC_FINISH(p);
+ }
+ // Decode whatever struct_v is on disk (v2 or v3). If FLAG_DEBUG_FORCE_V2
+ // is set, drop any decoded segment_size so the onode behaves as legacy v2.
+ void decode(::ceph::buffer::ptr::const_iterator& p, uint64_t features = 0) {
+ DENC_START(3, 1, p);
+ _denc_friend(*this, p, struct_v); // decode according to the on-disk struct_v
+ if ((features & FLAG_DEBUG_FORCE_V2) != 0) {
+ this->segment_size = 0; // emulate an object that never had segmentation
+ }
+ DENC_FINISH(p);
+ }
+
void dump(ceph::Formatter *f) const;
static void generate_test_instances(std::list<bluestore_onode_t*>& o);
};
WRITE_CLASS_DENC(bluestore_onode_t::shard_info)
-WRITE_CLASS_DENC(bluestore_onode_t)
+WRITE_CLASS_DENC_FEATURED(bluestore_onode_t)
std::ostream& operator<<(std::ostream& out, const bluestore_onode_t::shard_info& si);