From: Igor Fedotov
Date: Tue, 19 Apr 2022 18:08:16 +0000 (+0300)
Subject: os/bluestore: refactor NCB recovery procedure.
X-Git-Tag: v18.1.0~896^2~5
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bda1a20ac123d8ead82e5f2afaec7904cc59beb0;p=ceph.git

os/bluestore: refactor NCB recovery procedure.

This implements the basis for statfs recovery from persistent Onode
metadata, and redesigns the procedure to be more lightweight and
performant by avoiding a full Onode rebuild.

Signed-off-by: Igor Fedotov
---
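The heart of the change is a visitor-style decoder: the extent-map decoding
loop now lives in one place (ExtentMap::ExtentDecoder below) and reports what
it finds through virtual hooks, so the regular path can keep materializing a
full extent map while the recovery path only accumulates statistics. A
minimal, self-contained sketch of that split, using simplified illustrative
types rather than the real BlueStore ones:

    #include <cstdint>
    #include <map>
    #include <memory>

    struct Extent { uint64_t logical_offset = 0, length = 0; };

    // Hook interface: the single decoding loop calls these per extent.
    struct ExtentDecoder {
      virtual ~ExtentDecoder() = default;
      virtual Extent* get_next_extent() = 0;    // storage for the next extent
      virtual void add_extent(Extent* le) = 0;  // called once it is decoded
    };

    // Full variant: keeps every extent, as Onode decoding needs.
    struct FullDecoder : ExtentDecoder {
      std::map<uint64_t, std::unique_ptr<Extent>> extent_map;
      Extent* get_next_extent() override { return new Extent(); }
      void add_extent(Extent* le) override {
        extent_map.emplace(le->logical_offset, std::unique_ptr<Extent>(le));
      }
    };

    // Partial variant: reuses one scratch Extent and only counts bytes;
    // this is what makes the statfs/allocation scan cheap. (In the patch
    // the accounting actually happens in the consume_* hooks.)
    struct PartialDecoder : ExtentDecoder {
      Extent scratch;
      uint64_t stored_bytes = 0;
      Extent* get_next_extent() override { scratch = Extent(); return &scratch; }
      void add_extent(Extent* le) override { stored_bytes += le->length; }
    };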
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index a8e121b0767b..d6500e4cca49 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -2392,11 +2392,11 @@ void BlueStore::Blob::split(Collection *coll, uint32_t blob_offset, Blob *r)
 
 #ifndef CACHE_BLOB_BL
 void BlueStore::Blob::decode(
-  Collection *coll,
   bufferptr::const_iterator& p,
   uint64_t struct_v,
   uint64_t* sbid,
-  bool include_ref_map)
+  bool include_ref_map,
+  Collection *coll)
 {
   denc(blob, p, struct_v);
   if (blob.is_shared()) {
@@ -2409,11 +2409,13 @@ void BlueStore::Blob::decode(
     used_in_blob.clear();
     bluestore_extent_ref_map_t legacy_ref_map;
     legacy_ref_map.decode(p);
-    for (auto r : legacy_ref_map.ref_map) {
-      get_ref(
-        coll,
-        r.first,
-        r.second.refs * r.second.length);
+    if (coll) {
+      for (auto r : legacy_ref_map.ref_map) {
+        get_ref(
+          coll,
+          r.first,
+          r.second.refs * r.second.length);
+      }
     }
   }
 }
@@ -2456,10 +2458,9 @@ BlueStore::OldExtent* BlueStore::OldExtent::create(CollectionRef c,
 #undef dout_context
 #define dout_context onode->c->store->cct
 
-BlueStore::ExtentMap::ExtentMap(Onode *o)
+BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
   : onode(o),
-    inline_bl(
-      o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
+    inline_bl(inline_shard_prealloc_size) {
 }
 
 void BlueStore::ExtentMap::dump(Formatter* f) const
@@ -3130,80 +3131,144 @@ bool BlueStore::ExtentMap::encode_some(
   return false;
 }
 
-unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+/////////////////// BlueStore::ExtentMap::ExtentDecoder ///////////
+void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
+  Extent* le,
+  __u8 struct_v,
+  bptr_c_it_t& p,
+  Collection* c)
 {
-  /*
-  derr << __func__ << ":";
-  bl.hexdump(*_dout);
-  *_dout << dendl;
-  */
+  uint64_t blobid;
+  denc_varint(blobid, p);
+  if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+    uint64_t gap;
+    denc_varint_lowz(gap, p);
+    pos += gap;
+  }
+  le->logical_offset = pos;
+  if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+    denc_varint_lowz(le->blob_offset, p);
+  } else {
+    le->blob_offset = 0;
+  }
+  if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+    denc_varint_lowz(prev_len, p);
+  }
+  le->length = prev_len;
+  if (blobid & BLOBID_FLAG_SPANNING) {
+    consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
+  } else {
+    blobid >>= BLOBID_SHIFT_BITS;
+    if (blobid) {
+      consume_blobid(le, false, blobid - 1);
+    } else {
+      Blob *b = new Blob();
+      uint64_t sbid = 0;
+      b->decode(p, struct_v, &sbid, false, c);
+      consume_blob(le, extent_pos, sbid, b);
+    }
+  }
+  pos += prev_len;
+  ++extent_pos;
+}
+
+unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
+  const bufferlist& bl, Collection* c)
+{
+  __u8 struct_v;
+  uint32_t num;
 
   ceph_assert(bl.get_num_buffers() <= 1);
   auto p = bl.front().begin_deep();
-  __u8 struct_v;
   denc(struct_v, p);
   // Version 2 differs from v1 in blob's ref_map
   // serialization only. Hence there is no specific
   // handling at ExtentMap level below.
   ceph_assert(struct_v == 1 || struct_v == 2);
-
-  uint32_t num;
   denc_varint(num, p);
-  vector<BlobRef> blobs(num);
-  uint64_t pos = 0;
-  uint64_t prev_len = 0;
-  unsigned n = 0;
+  extent_pos = 0;
 
   while (!p.end()) {
-    Extent *le = new Extent();
-    uint64_t blobid;
-    denc_varint(blobid, p);
-    if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
-      uint64_t gap;
-      denc_varint_lowz(gap, p);
-      pos += gap;
-    }
-    le->logical_offset = pos;
-    if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
-      denc_varint_lowz(le->blob_offset, p);
-    } else {
-      le->blob_offset = 0;
-    }
-    if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
-      denc_varint_lowz(prev_len, p);
-    }
-    le->length = prev_len;
+    Extent* le = get_next_extent();
+    decode_extent(le, struct_v, p, c);
+    add_extent(le);
+  }
+  ceph_assert(extent_pos == num);
+  return num;
+}
 
-    if (blobid & BLOBID_FLAG_SPANNING) {
-      dout(30) << __func__ << " getting spanning blob "
-               << (blobid >> BLOBID_SHIFT_BITS) << dendl;
-      le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
-    } else {
-      blobid >>= BLOBID_SHIFT_BITS;
-      if (blobid) {
-        le->assign_blob(blobs[blobid - 1]);
-        ceph_assert(le->blob);
-      } else {
-        Blob *b = new Blob();
-        uint64_t sbid = 0;
-        b->decode(onode->c, p, struct_v, &sbid, false);
-        blobs[n] = b;
-        onode->c->open_shared_blob(sbid, b);
-        le->assign_blob(b);
-      }
-      // we build ref_map dynamically for non-spanning blobs
-      le->blob->get_ref(
-        onode->c,
-        le->blob_offset,
-        le->length);
-    }
-    pos += prev_len;
-    ++n;
-    extent_map.insert(*le);
+void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
+  bptr_c_it_t& p, Collection* c)
+{
+  __u8 struct_v;
+  denc(struct_v, p);
+  // Version 2 differs from v1 in blob's ref_map
+  // serialization only. Hence there is no specific
+  // handling at ExtentMap level.
+  ceph_assert(struct_v == 1 || struct_v == 2);
+
+  unsigned n;
+  denc_varint(n, p);
+  while (n--) {
+    BlueStore::BlobRef b(new Blob());
+    denc_varint(b->id, p);
+    uint64_t sbid = 0;
+    b->decode(p, struct_v, &sbid, true, c);
+    consume_spanning_blob(sbid, b);
   }
-
-  ceph_assert(n == num);
-  return num;
 }
 
+/////////////////// BlueStore::ExtentMap::ExtentDecoderFull ///////////
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
+  BlueStore::Extent* le, bool spanning, uint64_t blobid) {
+  ceph_assert(le);
+  if (spanning) {
+    le->assign_blob(extent_map.get_spanning_blob(blobid));
+  } else {
+    ceph_assert(blobid < blobs.size());
+    le->assign_blob(blobs[blobid]);
+    // we build ref_map dynamically for non-spanning blobs
+    le->blob->get_ref(
+      extent_map.onode->c,
+      le->blob_offset,
+      le->length);
+  }
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
+  BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
+  ceph_assert(le);
+  blobs.resize(extent_no + 1);
+  blobs[extent_no] = b;
+  extent_map.onode->c->open_shared_blob(sbid, b);
+  le->assign_blob(b);
+  le->blob->get_ref(
+    extent_map.onode->c,
+    le->blob_offset,
+    le->length);
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
+  uint64_t sbid, BlueStore::BlobRef b) {
+  extent_map.spanning_blob_map[b->id] = b;
+  extent_map.onode->c->open_shared_blob(sbid, b);
+}
+
+BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
+{
+  return new Extent();
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
+{
+  extent_map.extent_map.insert(*le);
+}
+
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+  ExtentDecoderFull edecoder(*this);
+  unsigned n = edecoder.decode_some(bl, onode->c);
+  return n;
 }
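A detail worth noting above: the Collection pointer has moved to the end of
Blob::decode() and is now allowed to be null. For legacy (v1) encodings the
blob's ref_map used to be rebuilt via get_ref() against the collection; that
rebuild is now guarded by `if (coll)`, so scan-only callers can decode blobs
without any open collection. Illustrative call shapes, mirroring the new
signature in this patch:

    // Full decode: the legacy ref_map is rebuilt against the collection.
    b->decode(p, struct_v, &sbid, false, onode->c);

    // Lightweight scan (e.g. the shard pass of the allocation recovery
    // further below): no Collection at hand, ref_map rebuild is skipped.
    b->decode(p, struct_v, &sbid, false, nullptr);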
 
 void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
@@ -3239,28 +3304,6 @@ void BlueStore::ExtentMap::encode_spanning_blobs(
   }
 }
 
-void BlueStore::ExtentMap::decode_spanning_blobs(
-  bufferptr::const_iterator& p)
-{
-  __u8 struct_v;
-  denc(struct_v, p);
-  // Version 2 differs from v1 in blob's ref_map
-  // serialization only. Hence there is no specific
-  // handling at ExtentMap level.
-  ceph_assert(struct_v == 1 || struct_v == 2);
-
-  unsigned n;
-  denc_varint(n, p);
-  while (n--) {
-    BlobRef b(new Blob());
-    denc_varint(b->id, p);
-    spanning_blob_map[b->id] = b;
-    uint64_t sbid = 0;
-    b->decode(onode->c, p, struct_v, &sbid, true);
-    onode->c->open_shared_blob(sbid, b);
-  }
-}
-
 void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
 {
   shards.resize(onode->onode.extent_map_shards.size());
@@ -3712,6 +3755,22 @@ void BlueStore::Onode::put() {
   }
 }
 
+void BlueStore::Onode::decode_raw(
+  BlueStore::Onode* on,
+  const bufferlist& v,
+  BlueStore::ExtentMap::ExtentDecoder& edecoder)
+{
+  auto p = v.front().begin_deep();
+  on->onode.decode(p);
+
+  // initialize extent_map
+  edecoder.decode_spanning_blobs(p, on->c);
+  if (on->onode.extent_map_shards.empty()) {
+    denc(on->extent_map.inline_bl, p);
+    edecoder.decode_some(on->extent_map.inline_bl, on->c);
+  }
+}
+
 BlueStore::Onode* BlueStore::Onode::decode(
   CollectionRef c,
   const ghobject_t& oid,
@@ -3720,21 +3779,19 @@ BlueStore::Onode* BlueStore::Onode::decode(
 {
   Onode* on = new Onode(c.get(), oid, key);
   on->exists = true;
-  auto p = v.front().begin_deep();
-  on->onode.decode(p);
+
+  ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
+  decode_raw(on, v, edecoder);
+
   for (auto& i : on->onode.attrs) {
     i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
   }
 
   // initialize extent_map
-  on->extent_map.decode_spanning_blobs(p);
   if (on->onode.extent_map_shards.empty()) {
-    denc(on->extent_map.inline_bl, p);
-    on->extent_map.decode_some(on->extent_map.inline_bl);
     on->extent_map.inline_bl.reassign_to_mempool(
       mempool::mempool_bluestore_cache_data);
-  }
-  else {
+  } else {
     on->extent_map.init_shards(false, false);
   }
   return on;
@@ -5888,7 +5945,6 @@ int BlueStore::_init_alloc(std::map<uint64_t, uint64_t> *zone_adjustments)
     derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
     return -ENOTSUP; // Operation not supported
   }
-
   if (restore_allocator(alloc, &num, &bytes) == 0) {
     dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
   } else {
@@ -6711,10 +6767,14 @@ void BlueStore::_close_db_leave_bluefs()
 
 void BlueStore::_close_db()
 {
-  dout(10) << __func__ << ":read_only=" << db_was_opened_read_only << " fm=" << fm << " destage_alloc_file=" << need_to_destage_allocation_file << dendl;
+  dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
+           << " fm=" << fm
+           << " destage_alloc_file=" << need_to_destage_allocation_file
+           << dendl;
+  bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
   _close_db_leave_bluefs();
-  if (!db_was_opened_read_only && fm && fm->is_null_manager() && need_to_destage_allocation_file) {
+  if (do_destage && fm && fm->is_null_manager()) {
     int ret = store_allocator(alloc);
     if (ret != 0) {
       derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
@@ -18783,228 +18843,248 @@ int BlueStore::restore_allocator(Allocator* dest_allocator, uint64_t *num, uint6
   return ret;
 }
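Onode::decode_raw() above is the new minimal entry point: it decodes the
onode body, the spanning blobs and the inline extent map, routing everything
through a caller-supplied ExtentDecoder. The two call sites differ only in
the decoder they pass; roughly (condensed from this patch, `edecoder` being
whichever decoder instance the caller owns):

    // Regular path (Onode::decode): materialize the full extent map.
    ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
    Onode::decode_raw(on, v, edecoder);

    // Recovery path (read_allocation_from_onodes, further below): a
    // throwaway Onode plus a stats-only decoder, no Collection attached.
    Onode dummy_on(cct);
    Onode::decode_raw(&dummy_on, it->value(), edecoder);

The dummy Onode relies on the new Onode(CephContext*) constructor added in
BlueStore.h, which leaves the collection pointer null. The _close_db() change
just above is also worth a second look: whether the allocation file needs
destaging is now latched into do_destage before _close_db_leave_bluefs()
runs, so the decision cannot be affected by state the close path tears down.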
-//-------------------------------------------------------------------------
-void BlueStore::ExtentMap::provide_shard_info_to_onode(bufferlist v, uint32_t shard_id)
-{
-  [[maybe_unused]] auto cct = onode->c->store->cct;
-  auto path = onode->c->store->path;
-  if (shard_id < shards.size()) {
-    auto p = &shards[shard_id];
-    if (!p->loaded) {
-      dout(30) << "opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
-      p->extents = decode_some(v);
-      p->loaded = true;
-      dout(20) << "open shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
-      ceph_assert(p->dirty == false);
-      ceph_assert(v.length() == p->shard_info->bytes);
-    }
-  } else {
-    derr << "illegal shard-id=" << shard_id << " shards.size()=" << shards.size() << dendl;
-    ceph_assert(shard_id < shards.size());
-  }
-}
-
 //-----------------------------------------------------------------------------------
 void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
 {
+  dout(30) << __func__ << " 0x" << std::hex
+           << offset << "~" << length
+           << " " << min_alloc_size_mask
+           << dendl;
   ceph_assert((offset & min_alloc_size_mask) == 0);
   ceph_assert((length & min_alloc_size_mask) == 0);
   sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
 }
 
-//---------------------------------------------------------
-// Process all physical extents from a given Onode (including all its shards)
-void BlueStore::read_allocation_from_single_onode(
-  SimpleBitmap*        sbmap,
-  BlueStore::OnodeRef& onode_ref,
-  read_alloc_stats_t&  stats)
-{
-  // create a map holding all physical-extents of this Onode to prevent duplication from being added twice and more
-  std::unordered_map<uint64_t, uint32_t> lcl_extnt_map;
-  unsigned blobs_count = 0;
-  uint64_t pos = 0;
+void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
+                                                        uint64_t extent_no,
+                                                        uint64_t sbid,
+                                                        BlobRef b)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  ceph_assert(per_pool_statfs);
+  ceph_assert(oid != ghobject_t());
 
-  stats.spanning_blob_count += onode_ref->extent_map.spanning_blob_map.size();
-  // first iterate over all logical-extents
-  for (struct Extent& l_extent : onode_ref->extent_map.extent_map) {
-    ceph_assert(l_extent.logical_offset >= pos);
+  auto &blob = b->get_blob();
+  if(spanning) {
+    dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
+    ceph_assert(b->id >= 0);
+    spanning_blobs[b->id] = b;
+    ++stats.spanning_blob_count;
+  } else {
+    dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
+    blobs[extent_no] = b;
+  }
+  bool compressed = blob.is_compressed();
+  if (!blob.is_shared()) {
+    for (auto& pe : blob.get_extents()) {
+      if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
+        ++stats.skipped_illegal_extent;
+        continue;
+      }
+      store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);
 
-    pos = l_extent.logical_offset + l_extent.length;
-    ceph_assert(l_extent.blob);
-    const bluestore_blob_t& blob = l_extent.blob->get_blob();
-    const PExtentVector& p_extent_vec = blob.get_extents();
-    blobs_count++;
-    if (blob.is_compressed()) {
-      stats.compressed_blob_count++;
+      per_pool_statfs->allocated() += pe.length;
+      if (compressed) {
+        per_pool_statfs->compressed_allocated() += pe.length;
+      }
     }
-
-    if (blob.is_shared()) {
-      stats.shared_blobs_count++;
+    if (compressed) {
+      per_pool_statfs->compressed() +=
+        blob.get_compressed_payload_length();
+      ++stats.compressed_blob_count;
     }
-
-    // process all physical extent in this blob
-    for (auto p_extent = p_extent_vec.begin(); p_extent != p_extent_vec.end(); p_extent++) {
-      auto offset = p_extent->offset;
-      auto length = p_extent->length;
-
-      // Offset of -1 means that the extent was removed (and it is only a place holder) and can be safely skipped
-      if (offset == (uint64_t)-1) {
-        stats.skipped_illegal_extent++;
-        continue;
+  } else {
+    auto it = sb_info.find(sbid);
+    if (it == sb_info.end()) {
+      derr << __func__ << " shared blob not found:" << sbid
+           << dendl;
+    }
+    auto &sbi = *it;
+    auto pool_id = oid.hobj.get_logical_pool();
+    if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
+      sbi.pool_id = pool_id;
+      size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
+      per_pool_statfs->allocated() += alloc_delta;
+      if (compressed) {
+        per_pool_statfs->compressed_allocated() += alloc_delta;
+        ++stats.compressed_blob_count;
       }
+    }
+    if (compressed) {
+      per_pool_statfs->compressed() +=
+        blob.get_compressed_payload_length();
+    }
+  }
+}
 
-      if (!blob.is_shared()) {
-        // skip repeating extents
-        auto lcl_itr = lcl_extnt_map.find(offset);
-        // extents using shared blobs might have differnt length
-        if (lcl_itr != lcl_extnt_map.end() ) {
-          // repeated extents must have the same length!
-          ceph_assert(lcl_extnt_map[offset] == length);
-          stats.skipped_repeated_extent++;
-        } else {
-          lcl_extnt_map[offset] = length;
-          set_allocation_in_simple_bmap(sbmap, offset, length);
-          stats.extent_count++;
-        }
-      } else {
-        // extents using shared blobs might have differnt length
-        set_allocation_in_simple_bmap(sbmap, offset, length);
-        stats.extent_count++;
-      }
+void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
+                                                     bool spanning,
+                                                     uint64_t blobid)
+{
+  [[maybe_unused]] auto cct = store.cct;
+  dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+  auto &map = spanning ? spanning_blobs : blobs;
+  auto it = map.find(blobid);
+  ceph_assert(it != map.end());
+  per_pool_statfs->stored() += le->length;
+  if (it->second->get_blob().is_compressed()) {
+    per_pool_statfs->compressed_original() += le->length;
+  }
+}
 
-    } // physical-extents loop
+void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
+                                                   uint64_t extent_no,
+                                                   uint64_t sbid,
+                                                   BlobRef b)
+{
+  _consume_new_blob(false, extent_no, sbid, b);
+  per_pool_statfs->stored() += le->length;
+  if (b->get_blob().is_compressed()) {
+    per_pool_statfs->compressed_original() += le->length;
+  }
+}
 
-  } // logical-extents loop
+void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
                                                            BlobRef b)
+{
+  _consume_new_blob(true, 0/*doesn't matter*/, sbid, b);
+}
 
-  if (blobs_count < MAX_BLOBS_IN_ONODE) {
-    stats.blobs_in_onode[blobs_count]++;
-  } else {
-    // store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
-    stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
-  }
+void BlueStore::ExtentDecoderPartial::reset(const ghobject_t _oid,
                                            volatile_statfs* _per_pool_statfs)
+{
+  oid = _oid;
+  per_pool_statfs = _per_pool_statfs;
+  blob_map_t empty;
+  blob_map_t empty2;
+  std::swap(blobs, empty);
+  std::swap(spanning_blobs, empty2);
 }
 
-//-------------------------------------------------------------------------
 int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
 {
-  // finally add all space take by user data
-  auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+  sb_info_space_efficient_map_t sb_info;
+  // iterate over all shared blobs
+  auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
   if (!it) {
-    // TBD - find a better error code
db->get_iterator(PREFIX_OBJ)" << dendl; - return -1; + derr << "failed getting shared blob's iterator" << dendl; + return -ENOENT; + } + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + const auto& key = it->key(); + dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl; + uint64_t sbid = 0; + if (get_key_shared_blob(key, &sbid) != 0) { + derr << __func__ << " bad shared blob key '" << pretty_binary_string(key) + << "'" << dendl; + } + bluestore_shared_blob_t shared_blob(sbid); + bufferlist bl = it->value(); + auto blp = bl.cbegin(); + try { + decode(shared_blob, blp); + } + catch (ceph::buffer::error& e) { + derr << __func__ << " failed to decode Shared Blob" + << pretty_binary_string(key) << dendl; + continue; + } + dout(20) << __func__ << " " << shared_blob << dendl; + uint64_t allocated = 0; + for (auto& r : shared_blob.ref_map.ref_map) { + ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET); + set_allocation_in_simple_bmap(sbmap, r.first, r.second.length); + allocated += r.second.length; + } + auto &sbi = sb_info.add_or_adopt(sbid); + ceph_assert(p2phase(allocated, min_alloc_size) == 0); + sbi.allocated_chunks += (allocated >> min_alloc_size_order); + ++stats.shared_blob_count; + } + } + + it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); + if (!it) { + derr << "failed getting onode's iterator" << dendl; + return -ENOENT; } - CollectionRef collection_ref; - spg_t pgid; - BlueStore::OnodeRef onode_ref; - bool has_open_onode = false; - uint32_t shard_id = 0; uint64_t kv_count = 0; uint64_t count_interval = 1'000'000; + ExtentDecoderPartial edecoder(*this, + stats, + *sbmap, + sb_info, + min_alloc_size_order); + // iterate over all ONodes stored in RocksDB for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) { // trace an even after every million processed objects (typically every 5-10 seconds) if (kv_count && (kv_count % count_interval == 0) ) { - dout(5) << "processed objects count = " << kv_count << dendl; - } - - // Shards - Code - // add the extents from the shards to the main Obj - if (is_extent_shard_key(it->key())) { - // shards must follow a valid main object - if (has_open_onode) { - // shards keys must start with the main object key - if (it->key().find(onode_ref->key) == 0) { - // shards count can't exceed declared shard-count in the main-object - if (shard_id < onode_ref->extent_map.shards.size()) { - onode_ref->extent_map.provide_shard_info_to_onode(it->value(), shard_id); - stats.shard_count++; - shard_id++; - } else { - derr << "illegal shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl; - derr << "shard->key=" << pretty_binary_string(it->key()) << dendl; - ceph_assert(shard_id < onode_ref->extent_map.shards.size()); - } - } else { - derr << "illegal shard-key::onode->key=" << pretty_binary_string(onode_ref->key) << " shard->key=" << pretty_binary_string(it->key()) << dendl; - ceph_assert(it->key().find(onode_ref->key) == 0); - } - } else { - derr << "error::shard without main objects for key=" << pretty_binary_string(it->key()) << dendl; - ceph_assert(has_open_onode); - } - - } else { - // Main Object Code + dout(5) << __func__ << " processed objects count = " << kv_count << dendl; + } - if (has_open_onode) { - // make sure we got all shards of this object - if (shard_id == onode_ref->extent_map.shards.size()) { - // We completed an Onode Object -> pass it to be processed - read_allocation_from_single_onode(sbmap, onode_ref, stats); - } else { - 
derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl; - ceph_assert(shard_id == onode_ref->extent_map.shards.size()); - } - } else { - // We opened a new Object - has_open_onode = true; + auto key = it->key(); + auto okey = key; + dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl; + ghobject_t oid; + if (!is_extent_shard_key(it->key())) { + int r = get_key_object(okey, &oid); + if (r != 0) { + derr << __func__ << " failed to decode onode key = " + << pretty_binary_string(okey) << dendl; + return -EIO; } - - // The main Obj is always first in RocksDB so we can start with shard_id set to zero - shard_id = 0; - stats.onode_count++; - ghobject_t oid; - int ret = get_key_object(it->key(), &oid); - if (ret < 0) { - derr << "bad object key " << pretty_binary_string(it->key()) << dendl; - ceph_assert(ret == 0); - continue; + edecoder.reset(oid, + &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]); + Onode dummy_on(cct); + Onode::decode_raw(&dummy_on, + it->value(), + edecoder); + ++stats.onode_count; + } else { + uint32_t offset; + int r = get_key_extent_shard(key, &okey, &offset); + if (r != 0) { + derr << __func__ << " failed to decode onode extent key = " + << pretty_binary_string(key) << dendl; + return -EIO; } - - // fill collection_ref if doesn't exist yet - // We process all the obejcts in a given collection and then move to the next collection - // This means we only search once for every given collection - if (!collection_ref || - oid.shard_id != pgid.shard || - oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || - !collection_ref->contains(oid)) { - stats.collection_search++; - collection_ref = nullptr; - - for (auto& p : coll_map) { - if (p.second->contains(oid)) { - collection_ref = p.second; - break; - } - } - - if (!collection_ref) { - derr << "stray object " << oid << " not owned by any collection" << dendl; - ceph_assert(collection_ref); - continue; - } - - collection_ref->cid.is_pg(&pgid); + r = get_key_object(okey, &oid); + if (r != 0) { + derr << __func__ + << " failed to decode onode key= " << pretty_binary_string(okey) + << " from extent key= " << pretty_binary_string(key) + << dendl; + return -EIO; } - onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value())); + ceph_assert(oid == edecoder.get_oid()); + edecoder.decode_some(it->value(), nullptr); + ++stats.shard_count; } } - // process the last object - if (has_open_onode) { - // make sure we got all shards of this object - if (shard_id == onode_ref->extent_map.shards.size()) { - // We completed an Onode Object -> pass it to be processed - read_allocation_from_single_onode(sbmap, onode_ref, stats); - } else { - derr << "Last Object is missing shards! 
shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl; - ceph_assert(shard_id == onode_ref->extent_map.shards.size()); - } + std::lock_guard l(vstatfs_lock); + store_statfs_t s; + osd_pools.clear(); + for (auto& p : stats.actual_pool_vstatfs) { + if (per_pool_stat_collection) { + osd_pools[p.first] = p.second; + } + stats.actual_store_vstatfs += p.second; + p.second.publish(&s); + dout(5) << __func__ << " recovered pool " + << std::hex + << p.first << "->" << s + << std::dec + << " per-pool:" << per_pool_stat_collection + << dendl; } - dout(5) << "onode_count=" << stats.onode_count << " ,shard_count=" << stats.shard_count << dendl; - + vstatfs = stats.actual_store_vstatfs; + vstatfs.publish(&s); + dout(5) << __func__ << " recovered " << s + << dendl; return 0; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 6c8e30b4c07e..10948107a85b 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -218,6 +218,7 @@ enum { }; #define META_POOL_ID ((uint64_t)-1ull) +using bptr_c_it_t = buffer::ptr::const_iterator; class BlueStore : public ObjectStore, public md_config_obs_t { @@ -716,9 +717,9 @@ public: } } void decode( - Collection */*coll*/, ceph::buffer::ptr::const_iterator& p, - bool include_ref_map) { + bool include_ref_map, + Collection */*coll*/) { const char *start = p.get_pos(); denc(blob, p); const char *end = p.get_pos(); @@ -756,11 +757,11 @@ public: } } void decode( - Collection *coll, ceph::buffer::ptr::const_iterator& p, uint64_t struct_v, uint64_t* sbid, - bool include_ref_map); + bool include_ref_map, + Collection *coll); #endif }; typedef boost::intrusive_ptr BlobRef; @@ -872,6 +873,7 @@ public: bool loaded = false; ///< true if shard is loaded bool dirty = false; ///< true if shard is dirty and needs reencoding }; + mempool::bluestore_cache_meta::vector shards; ///< shards ceph::buffer::list inline_bl; ///< cached encoded map, if unsharded; empty=>dirty @@ -901,7 +903,7 @@ public: void operator()(Extent *e) { delete e; } }; - ExtentMap(Onode *o); + ExtentMap(Onode *o, size_t inline_shard_prealloc_size); ~ExtentMap() { extent_map.clear_and_dispose(DeleteDisposer()); } @@ -917,12 +919,56 @@ public: bool encode_some(uint32_t offset, uint32_t length, ceph::buffer::list& bl, unsigned *pn); + + class ExtentDecoder { + uint64_t pos = 0; + uint64_t prev_len = 0; + uint64_t extent_pos = 0; + protected: + virtual void consume_blobid(Extent* le, + bool spanning, + uint64_t blobid) = 0; + virtual void consume_blob(Extent* le, + uint64_t extent_no, + uint64_t sbid, + BlobRef b) = 0; + virtual void consume_spanning_blob(uint64_t sbid, BlobRef b) = 0; + virtual Extent* get_next_extent() = 0; + virtual void add_extent(Extent*) = 0; + + void decode_extent(Extent* le, + __u8 struct_v, + bptr_c_it_t& p, + Collection* c); + public: + virtual ~ExtentDecoder() { + } + + unsigned decode_some(const ceph::buffer::list& bl, Collection* c); + void decode_spanning_blobs(bptr_c_it_t& p, Collection* c); + }; + + class ExtentDecoderFull : public ExtentDecoder { + ExtentMap& extent_map; + std::vector blobs; + protected: + void consume_blobid(Extent* le, bool spanning, uint64_t blobid) override; + void consume_blob(Extent* le, + uint64_t extent_no, + uint64_t sbid, + BlobRef b) override; + void consume_spanning_blob(uint64_t sbid, BlobRef b) override; + Extent* get_next_extent() override; + void add_extent(Extent* ) override; + public: + ExtentDecoderFull (ExtentMap& _extent_map) : extent_map(_extent_map) { + } + }; + 
     unsigned decode_some(ceph::buffer::list& bl);
 
     void bound_encode_spanning_blobs(size_t& p);
     void encode_spanning_blobs(ceph::buffer::list::contiguous_appender& p);
-    void decode_spanning_blobs(ceph::buffer::ptr::const_iterator& p);
-
     BlobRef get_spanning_blob(int id) {
       auto p = spanning_blob_map.find(id);
       ceph_assert(p != spanning_blob_map.end());
@@ -1022,8 +1068,6 @@ public:
 
     /// split a blob (and referring extents)
     BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
-
-    void provide_shard_info_to_onode(bufferlist v, uint32_t shard_id);
   };
 
   /// Compressed Blob Garbage collector
@@ -1173,31 +1217,50 @@ public:
       exists(false),
       cached(false),
       pinned(false),
-      extent_map(this) {
+      extent_map(this,
+                 c->store->cct->_conf->
+                   bluestore_extent_map_inline_shard_prealloc_size) {
     }
     Onode(Collection* c, const ghobject_t& o, const std::string& k)
       : nref(0),
-	c(c),
-	oid(o),
-	key(k),
-	exists(false),
-	cached(false),
-	pinned(false),
-	extent_map(this) {
+        c(c),
+        oid(o),
+        key(k),
+        exists(false),
+        cached(false),
+        pinned(false),
+        extent_map(this,
+                   c->store->cct->_conf->
+                     bluestore_extent_map_inline_shard_prealloc_size) {
     }
     Onode(Collection* c, const ghobject_t& o, const char* k)
       : nref(0),
-	c(c),
-	oid(o),
-	key(k),
-	exists(false),
-	cached(false),
-	pinned(false),
-	extent_map(this) {
+        c(c),
+        oid(o),
+        key(k),
+        exists(false),
+        cached(false),
+        pinned(false),
+        extent_map(this,
+                   c->store->cct->_conf->
+                     bluestore_extent_map_inline_shard_prealloc_size) {
     }
-
+    Onode(CephContext* cct)
+      : nref(0),
+        c(nullptr),
+        exists(false),
+        cached(false),
+        pinned(false),
+        extent_map(this,
+                   cct->_conf->
+                     bluestore_extent_map_inline_shard_prealloc_size) {
+    }
+    static void decode_raw(
+      BlueStore::Onode* on,
+      const bufferlist& v,
+      ExtentMap::ExtentDecoder& dencoder);
 
     static Onode* decode(
       CollectionRef c,
       const ghobject_t& oid,
@@ -2731,6 +2794,7 @@ private:
     int64_t& errors,
     int64_t &warnings,
     BlueStoreRepairer* repairer);
+
   void _fsck_repair_shared_blobs(
     BlueStoreRepairer& repairer,
     shared_blob_2hash_tracker_t& sb_ref_counts,
@@ -2808,7 +2872,7 @@ public:
 
   bool is_rotational() override;
   bool is_journal_rotational() override;
-  bool is_db_rotational() ;
+  bool is_db_rotational();
 
   std::string get_default_device_class() override {
     std::string device_class;
@@ -3672,67 +3736,82 @@ public:
   int push_allocation_to_rocksdb();
   int read_allocation_from_drive_for_bluestore_tool();
 #endif
+  void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
+
 private:
-#define MAX_BLOBS_IN_ONODE 128
   struct read_alloc_stats_t {
-    //read_alloc_stats_t() { memset(&this, 0, sizeof(read_alloc_stats_t)); }
     uint32_t onode_count = 0;
     uint32_t shard_count = 0;
 
-    uint32_t skipped_repeated_extent = 0;
     uint32_t skipped_illegal_extent = 0;
 
-    uint32_t collection_search = 0;
-    uint32_t pad_limit_count = 0;
-
-    uint64_t shared_blobs_count = 0;
+    uint64_t shared_blob_count = 0;
     uint64_t compressed_blob_count = 0;
     uint64_t spanning_blob_count = 0;
     uint64_t insert_count = 0;
     uint64_t extent_count = 0;
-
-    uint64_t saved_inplace_count = 0;
-    uint32_t merge_insert_count = 0;
-    uint32_t merge_inplace_count = 0;
-
-    std::array<uint32_t, MAX_BLOBS_IN_ONODE + 1> blobs_in_onode = {};
-    //uint32_t blobs_in_onode[MAX_BLOBS_IN_ONODE+1];
+    std::map<int64_t, volatile_statfs> actual_pool_vstatfs;
+    volatile_statfs actual_store_vstatfs;
   };
+
+  class ExtentDecoderPartial : public ExtentMap::ExtentDecoder {
+    BlueStore& store;
+    read_alloc_stats_t& stats;
+    SimpleBitmap& sbmap;
+    sb_info_space_efficient_map_t& sb_info;
+    uint8_t min_alloc_size_order;
+    Extent extent;
+    ghobject_t oid;
+    volatile_statfs* per_pool_statfs = nullptr;
+    blob_map_t blobs;
+    blob_map_t spanning_blobs;
+
+    void _consume_new_blob(bool spanning,
+                           uint64_t extent_no,
+                           uint64_t sbid,
+                           BlobRef b);
+  protected:
+    void consume_blobid(Extent*, bool spanning, uint64_t blobid) override;
+    void consume_blob(Extent* le,
+                      uint64_t extent_no,
+                      uint64_t sbid,
+                      BlobRef b) override;
+    void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+    Extent* get_next_extent() override {
+      ++stats.extent_count;
+      extent = Extent();
+      return &extent;
+    }
+    void add_extent(Extent*) override {
+    }
+  public:
+    ExtentDecoderPartial(BlueStore& _store,
+                         read_alloc_stats_t& _stats,
+                         SimpleBitmap& _sbmap,
+                         sb_info_space_efficient_map_t& _sb_info,
+                         uint8_t _min_alloc_size_order)
+      : store(_store), stats(_stats), sbmap(_sbmap), sb_info(_sb_info),
+        min_alloc_size_order(_min_alloc_size_order)
+    {}
+    const ghobject_t& get_oid() const {
+      return oid;
+    }
+    void reset(const ghobject_t _oid,
+               volatile_statfs* _per_pool_statfs);
+  };
 
   friend std::ostream& operator<<(std::ostream& out, const read_alloc_stats_t& stats) {
     out << "==========================================================" << std::endl;
     out << "NCB::onode_count             = " ;out.width(10);out << stats.onode_count << std::endl
         << "NCB::shard_count             = " ;out.width(10);out << stats.shard_count << std::endl
-        << "NCB::shared_blobs_count      = " ;out.width(10);out << stats.shared_blobs_count << std::endl
+        << "NCB::shared_blob_count       = " ;out.width(10);out << stats.shared_blob_count << std::endl
         << "NCB::compressed_blob_count   = " ;out.width(10);out << stats.compressed_blob_count << std::endl
         << "NCB::spanning_blob_count     = " ;out.width(10);out << stats.spanning_blob_count << std::endl
-        << "NCB::collection search       = " ;out.width(10);out << stats.collection_search << std::endl
-        << "NCB::skipped_repeated_extent = " ;out.width(10);out << stats.skipped_repeated_extent << std::endl
         << "NCB::skipped_illegal_extent  = " ;out.width(10);out << stats.skipped_illegal_extent << std::endl
         << "NCB::extent_count            = " ;out.width(10);out << stats.extent_count << std::endl
         << "NCB::insert_count            = " ;out.width(10);out << stats.insert_count << std::endl;
-    if (stats.merge_insert_count) {
-      out << "NCB::merge_insert_count      = " ;out.width(10);out << stats.merge_insert_count << std::endl;
-    }
-    if (stats.merge_inplace_count ) {
-      out << "NCB::merge_inplace_count     = " ;out.width(10);out << stats.merge_inplace_count << std::endl;
-      out << "NCB::saved_inplace_count     = " ;out.width(10);out << stats.saved_inplace_count << std::endl;
-      out << "NCB::saved inplace per call  = " ;out.width(10);out << stats.saved_inplace_count/stats.merge_inplace_count << std::endl;
-    }
     out << "==========================================================" << std::endl;
-    for (unsigned i = 0; i < MAX_BLOBS_IN_ONODE; i++ ) {
-      if (stats.blobs_in_onode[i]) {
-        out << "NCB::We had " ;out.width(9); out << stats.blobs_in_onode[i]
-            << " ONodes with "; out.width(3); out << i << " blobs" << std::endl;
-      }
-    }
-
-    if (stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]) {
-      out << "NCB::We had " ;out.width(9);out << stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]
-          << " ONodes with more than " << MAX_BLOBS_IN_ONODE << " blobs" << std::endl;
-    }
     return out;
   }
@@ -3750,8 +3829,6 @@ private:
   int  read_allocation_from_drive_on_startup();
   int  reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats);
   int  read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats);
-  void read_allocation_from_single_onode(SimpleBitmap *smbmp, BlueStore::OnodeRef& onode_ref, read_alloc_stats_t& stats);
-  void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
   int  commit_freelist_type();
   int  commit_to_null_manager();
   int  commit_to_real_manager();
 
diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc
index bf69df0b9d4e..2d3fbd97c49e 100644
--- a/src/test/objectstore/test_bluestore_types.cc
+++ b/src/test/objectstore/test_bluestore_types.cc
@@ -1097,17 +1097,17 @@ TEST(Blob, legacy_decode)
   uint64_t sbid, sbid2;
 
   Bres.decode(
-    coll.get(),
     p,
     1, /*struct_v*/
     &sbid,
-    true);
+    true,
+    coll.get());
   Bres2.decode(
-    coll.get(),
     p2,
     2, /*struct_v*/
     &sbid2,
-    true);
+    true,
+    coll.get());
 
   ASSERT_EQ(0xff0u + 1u, Bres.get_blob_use_tracker().get_referenced_bytes());
   ASSERT_EQ(0xff0u + 1u, Bres2.get_blob_use_tracker().get_referenced_bytes());
@@ -1125,7 +1125,8 @@ TEST(ExtentMap, seek_lextent)
   auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
   BlueStore::Onode onode(coll.get(), ghobject_t(), "");
-  BlueStore::ExtentMap em(&onode);
+  BlueStore::ExtentMap em(&onode,
+    g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
   BlueStore::BlobRef br(new BlueStore::Blob);
   br->shared_blob = new BlueStore::SharedBlob(coll.get());
@@ -1177,7 +1178,8 @@ TEST(ExtentMap, has_any_lextents)
     g_ceph_context, "lru", NULL);
   auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
   BlueStore::Onode onode(coll.get(), ghobject_t(), "");
-  BlueStore::ExtentMap em(&onode);
+  BlueStore::ExtentMap em(&onode,
+    g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
   BlueStore::BlobRef b(new BlueStore::Blob);
   b->shared_blob = new BlueStore::SharedBlob(coll.get());
@@ -1236,7 +1238,8 @@ TEST(ExtentMap, compress_extent_map)
   auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
   BlueStore::Onode onode(coll.get(), ghobject_t(), "");
-  BlueStore::ExtentMap em(&onode);
+  BlueStore::ExtentMap em(&onode,
+    g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
   BlueStore::BlobRef b1(new BlueStore::Blob);
   BlueStore::BlobRef b2(new BlueStore::Blob);
   BlueStore::BlobRef b3(new BlueStore::Blob);
@@ -1303,7 +1306,8 @@ TEST(GarbageCollector, BasicTest)
     BlueStore store(g_ceph_context, "", 4096);
     auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
     BlueStore::Onode onode(coll.get(), ghobject_t(), "");
-    BlueStore::ExtentMap em(&onode);
+    BlueStore::ExtentMap em(&onode,
+      g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
 
     BlueStore::old_extent_map_t old_extents;
@@ -1392,7 +1396,8 @@ TEST(GarbageCollector, BasicTest)
     BlueStore store(g_ceph_context, "", 0x10000);
     auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
     BlueStore::Onode onode(coll.get(), ghobject_t(), "");
-    BlueStore::ExtentMap em(&onode);
+    BlueStore::ExtentMap em(&onode,
+      g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
 
     BlueStore::old_extent_map_t old_extents;
     BlueStore::GarbageCollector gc(g_ceph_context);
@@ -1519,7 +1524,8 @@ TEST(GarbageCollector, BasicTest)
     BlueStore store(g_ceph_context, "", 0x10000);
     auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
     BlueStore::Onode onode(coll.get(), ghobject_t(), "");
-    BlueStore::ExtentMap em(&onode);
+    BlueStore::ExtentMap em(&onode,
+      g_ceph_context->_conf->bluestore_extent_map_inline_shard_prealloc_size);
 
     BlueStore::old_extent_map_t old_extents;
     BlueStore::GarbageCollector gc(g_ceph_context);
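Taken together, the recovery procedure becomes a single streaming pass over
RocksDB with no Onode or Collection materialization. A condensed sketch of
read_allocation_from_onodes() as introduced above (key parsing and error
handling omitted):

    // 1) PREFIX_SHARED_BLOB scan: mark every ref_map extent in the
    //    SimpleBitmap; remember per-sbid allocated chunks in sb_info.
    // 2) PREFIX_OBJ scan with one reusable ExtentDecoderPartial:
    //      onode key -> edecoder.reset(oid, &per_pool_statfs);
    //                   Onode::decode_raw(&dummy_on, it->value(), edecoder);
    //      shard key -> edecoder.decode_some(it->value(), nullptr);
    // 3) publish the accumulated per-pool volatile_statfs into osd_pools
    //    and the store-wide vstatfs.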