#ifndef CACHE_BLOB_BL
void BlueStore::Blob::decode(
- Collection *coll,
bufferptr::const_iterator& p,
uint64_t struct_v,
uint64_t* sbid,
- bool include_ref_map)
+ bool include_ref_map,
+ Collection *coll)
{
denc(blob, p, struct_v);
if (blob.is_shared()) {
used_in_blob.clear();
bluestore_extent_ref_map_t legacy_ref_map;
legacy_ref_map.decode(p);
- for (auto r : legacy_ref_map.ref_map) {
- get_ref(
- coll,
- r.first,
- r.second.refs * r.second.length);
+ if (coll) {
+ for (auto r : legacy_ref_map.ref_map) {
+ get_ref(
+ coll,
+ r.first,
+ r.second.refs * r.second.length);
+ }
}
}
}
#undef dout_context
#define dout_context onode->c->store->cct
-BlueStore::ExtentMap::ExtentMap(Onode *o)
+BlueStore::ExtentMap::ExtentMap(Onode *o, size_t inline_shard_prealloc_size)
: onode(o),
- inline_bl(
- o->c->store->cct->_conf->bluestore_extent_map_inline_shard_prealloc_size) {
+ inline_bl(inline_shard_prealloc_size) {
}
void BlueStore::ExtentMap::dump(Formatter* f) const
return false;
}
-unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+/////////////////// BlueStore::ExtentMap::ExtentDecoder ///////////
+void BlueStore::ExtentMap::ExtentDecoder::decode_extent(
+ Extent* le,
+ __u8 struct_v,
+ bptr_c_it_t& p,
+ Collection* c)
{
- /*
- derr << __func__ << ":";
- bl.hexdump(*_dout);
- *_dout << dendl;
- */
+ uint64_t blobid;
+ denc_varint(blobid, p);
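+ // low bits of blobid are flags (contiguous / zero-offset / same-length /
+ // spanning); the blob id proper is stored above BLOBID_SHIFT_BITS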
+ if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
+ uint64_t gap;
+ denc_varint_lowz(gap, p);
+ pos += gap;
+ }
+ le->logical_offset = pos;
+ if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
+ denc_varint_lowz(le->blob_offset, p);
+ } else {
+ le->blob_offset = 0;
+ }
+ if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
+ denc_varint_lowz(prev_len, p);
+ }
+ le->length = prev_len;
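+ // dispatch on the blob reference: a spanning blob id, a back-reference to
+ // a blob already decoded in this shard (encoded as index + 1), or an
+ // inline blob definition when the id field is zero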
+ if (blobid & BLOBID_FLAG_SPANNING) {
+ consume_blobid(le, true, blobid >> BLOBID_SHIFT_BITS);
+ } else {
+ blobid >>= BLOBID_SHIFT_BITS;
+ if (blobid) {
+ consume_blobid(le, false, blobid - 1);
+ } else {
+ Blob *b = new Blob();
+ uint64_t sbid = 0;
+ b->decode(p, struct_v, &sbid, false, c);
+ consume_blob(le, extent_pos, sbid, b);
+ }
+ }
+ pos += prev_len;
+ ++extent_pos;
+}
+
+unsigned BlueStore::ExtentMap::ExtentDecoder::decode_some(
+ const bufferlist& bl, Collection* c)
+{
+ __u8 struct_v;
+ uint32_t num;
ceph_assert(bl.get_num_buffers() <= 1);
auto p = bl.front().begin_deep();
- __u8 struct_v;
denc(struct_v, p);
// Version 2 differs from v1 in blob's ref_map
// serialization only. Hence there is no specific
// handling at ExtentMap level below.
ceph_assert(struct_v == 1 || struct_v == 2);
-
- uint32_t num;
denc_varint(num, p);
- vector<BlobRef> blobs(num);
- uint64_t pos = 0;
- uint64_t prev_len = 0;
- unsigned n = 0;
+ extent_pos = 0;
while (!p.end()) {
- Extent *le = new Extent();
- uint64_t blobid;
- denc_varint(blobid, p);
- if ((blobid & BLOBID_FLAG_CONTIGUOUS) == 0) {
- uint64_t gap;
- denc_varint_lowz(gap, p);
- pos += gap;
- }
- le->logical_offset = pos;
- if ((blobid & BLOBID_FLAG_ZEROOFFSET) == 0) {
- denc_varint_lowz(le->blob_offset, p);
- } else {
- le->blob_offset = 0;
- }
- if ((blobid & BLOBID_FLAG_SAMELENGTH) == 0) {
- denc_varint_lowz(prev_len, p);
- }
- le->length = prev_len;
+ Extent* le = get_next_extent();
+ decode_extent(le, struct_v, p, c);
+ add_extent(le);
+ }
+ ceph_assert(extent_pos == num);
+ return num;
+}
- if (blobid & BLOBID_FLAG_SPANNING) {
- dout(30) << __func__ << " getting spanning blob "
- << (blobid >> BLOBID_SHIFT_BITS) << dendl;
- le->assign_blob(get_spanning_blob(blobid >> BLOBID_SHIFT_BITS));
- } else {
- blobid >>= BLOBID_SHIFT_BITS;
- if (blobid) {
- le->assign_blob(blobs[blobid - 1]);
- ceph_assert(le->blob);
- } else {
- Blob *b = new Blob();
- uint64_t sbid = 0;
- b->decode(onode->c, p, struct_v, &sbid, false);
- blobs[n] = b;
- onode->c->open_shared_blob(sbid, b);
- le->assign_blob(b);
- }
- // we build ref_map dynamically for non-spanning blobs
- le->blob->get_ref(
- onode->c,
- le->blob_offset,
- le->length);
- }
- pos += prev_len;
- ++n;
- extent_map.insert(*le);
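+// Spanning blobs are referenced from more than one shard, so they are stored
+// with the onode itself and decoded once up front.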
+void BlueStore::ExtentMap::ExtentDecoder::decode_spanning_blobs(
+ bptr_c_it_t& p, Collection* c)
+{
+ __u8 struct_v;
+ denc(struct_v, p);
+ // Version 2 differs from v1 in blob's ref_map
+ // serialization only. Hence there is no specific
+ // handling at ExtentMap level.
+ ceph_assert(struct_v == 1 || struct_v == 2);
+
+ unsigned n;
+ denc_varint(n, p);
+ while (n--) {
+ BlueStore::BlobRef b(new Blob());
+ denc_varint(b->id, p);
+ uint64_t sbid = 0;
+ b->decode(p, struct_v, &sbid, true, c);
+ consume_spanning_blob(sbid, b);
}
+}
- ceph_assert(n == num);
- return num;
+/////////////////// BlueStore::ExtentMap::ExtentDecoderFull ///////////
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blobid(
+ BlueStore::Extent* le, bool spanning, uint64_t blobid) {
+ ceph_assert(le);
+ if (spanning) {
+ le->assign_blob(extent_map.get_spanning_blob(blobid));
+ } else {
+ ceph_assert(blobid < blobs.size());
+ le->assign_blob(blobs[blobid]);
+ // we build ref_map dynamically for non-spanning blobs
+ le->blob->get_ref(
+ extent_map.onode->c,
+ le->blob_offset,
+ le->length);
+ }
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_blob(
+ BlueStore::Extent* le, uint64_t extent_no, uint64_t sbid, BlobRef b) {
+ ceph_assert(le);
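+ // back-references to non-spanning blobs are indexed by the ordinal of the
+ // extent that defined them, so size the vector by extent ordinal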
+ blobs.resize(extent_no + 1);
+ blobs[extent_no] = b;
+ extent_map.onode->c->open_shared_blob(sbid, b);
+ le->assign_blob(b);
+ le->blob->get_ref(
+ extent_map.onode->c,
+ le->blob_offset,
+ le->length);
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::consume_spanning_blob(
+ uint64_t sbid, BlueStore::BlobRef b) {
+ extent_map.spanning_blob_map[b->id] = b;
+ extent_map.onode->c->open_shared_blob(sbid, b);
+}
+
+BlueStore::Extent* BlueStore::ExtentMap::ExtentDecoderFull::get_next_extent()
+{
+ return new Extent();
+}
+
+void BlueStore::ExtentMap::ExtentDecoderFull::add_extent(BlueStore::Extent* le)
+{
+ extent_map.extent_map.insert(*le);
+}
+
+unsigned BlueStore::ExtentMap::decode_some(bufferlist& bl)
+{
+ ExtentDecoderFull edecoder(*this);
+ unsigned n = edecoder.decode_some(bl, onode->c);
+ return n;
}
void BlueStore::ExtentMap::bound_encode_spanning_blobs(size_t& p)
}
}
-void BlueStore::ExtentMap::decode_spanning_blobs(
- bufferptr::const_iterator& p)
-{
- __u8 struct_v;
- denc(struct_v, p);
- // Version 2 differs from v1 in blob's ref_map
- // serialization only. Hence there is no specific
- // handling at ExtentMap level.
- ceph_assert(struct_v == 1 || struct_v == 2);
-
- unsigned n;
- denc_varint(n, p);
- while (n--) {
- BlobRef b(new Blob());
- denc_varint(b->id, p);
- spanning_blob_map[b->id] = b;
- uint64_t sbid = 0;
- b->decode(onode->c, p, struct_v, &sbid, true);
- onode->c->open_shared_blob(sbid, b);
- }
-}
-
void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
{
shards.resize(onode->onode.extent_map_shards.size());
}
}
+void BlueStore::Onode::decode_raw(
+ BlueStore::Onode* on,
+ const bufferlist& v,
+ BlueStore::ExtentMap::ExtentDecoder& edecoder)
+{
+ auto p = v.front().begin_deep();
+ on->onode.decode(p);
+
+ // initialize extent_map
+ edecoder.decode_spanning_blobs(p, on->c);
+ if (on->onode.extent_map_shards.empty()) {
+ denc(on->extent_map.inline_bl, p);
+ edecoder.decode_some(on->extent_map.inline_bl, on->c);
+ }
+}
+
BlueStore::Onode* BlueStore::Onode::decode(
CollectionRef c,
const ghobject_t& oid,
{
Onode* on = new Onode(c.get(), oid, key);
on->exists = true;
- auto p = v.front().begin_deep();
- on->onode.decode(p);
+
+ ExtentMap::ExtentDecoderFull edecoder(on->extent_map);
+ decode_raw(on, v, edecoder);
+
for (auto& i : on->onode.attrs) {
i.second.reassign_to_mempool(mempool::mempool_bluestore_cache_meta);
}
// initialize extent_map
- on->extent_map.decode_spanning_blobs(p);
if (on->onode.extent_map_shards.empty()) {
- denc(on->extent_map.inline_bl, p);
- on->extent_map.decode_some(on->extent_map.inline_bl);
on->extent_map.inline_bl.reassign_to_mempool(
mempool::mempool_bluestore_cache_data);
- }
- else {
+ } else {
on->extent_map.init_shards(false, false);
}
return on;
derr << __func__ << "::NCB::Please change the value of bluestore_allocation_from_file to TRUE in your ceph.conf file" << dendl;
return -ENOTSUP; // Operation not supported
}
-
if (restore_allocator(alloc, &num, &bytes) == 0) {
dout(5) << __func__ << "::NCB::restore_allocator() completed successfully alloc=" << alloc << dendl;
} else {
void BlueStore::_close_db()
{
- dout(10) << __func__ << ":read_only=" << db_was_opened_read_only << " fm=" << fm << " destage_alloc_file=" << need_to_destage_allocation_file << dendl;
+ dout(10) << __func__ << ":read_only=" << db_was_opened_read_only
+ << " fm=" << fm
+ << " destage_alloc_file=" << need_to_destage_allocation_file
+ << dendl;
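+ // evaluate the destage condition before the db teardown below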
+ bool do_destage = !db_was_opened_read_only && need_to_destage_allocation_file;
_close_db_leave_bluefs();
- if (!db_was_opened_read_only && fm && fm->is_null_manager() && need_to_destage_allocation_file) {
+ if (do_destage && fm && fm->is_null_manager()) {
int ret = store_allocator(alloc);
if (ret != 0) {
derr << __func__ << "::NCB::store_allocator() failed (continue with bitmapFreelistManager)" << dendl;
return ret;
}
-//-------------------------------------------------------------------------
-void BlueStore::ExtentMap::provide_shard_info_to_onode(bufferlist v, uint32_t shard_id)
-{
- [[maybe_unused]] auto cct = onode->c->store->cct;
- auto path = onode->c->store->path;
- if (shard_id < shards.size()) {
- auto p = &shards[shard_id];
- if (!p->loaded) {
- dout(30) << "opening shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
- p->extents = decode_some(v);
- p->loaded = true;
- dout(20) << "open shard 0x" << std::hex << p->shard_info->offset << std::dec << dendl;
- ceph_assert(p->dirty == false);
- ceph_assert(v.length() == p->shard_info->bytes);
- }
- } else {
- derr << "illegal shard-id=" << shard_id << " shards.size()=" << shards.size() << dendl;
- ceph_assert(shard_id < shards.size());
- }
-}
-
//-----------------------------------------------------------------------------------
void BlueStore::set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length)
{
+ dout(30) << __func__ << " 0x" << std::hex
+ << offset << "~" << length
+ << " " << min_alloc_size_mask
+ << dendl;
ceph_assert((offset & min_alloc_size_mask) == 0);
ceph_assert((length & min_alloc_size_mask) == 0);
sbmap->set(offset >> min_alloc_size_order, length >> min_alloc_size_order);
}
-//---------------------------------------------------------
-// Process all physical extents from a given Onode (including all its shards)
-void BlueStore::read_allocation_from_single_onode(
- SimpleBitmap* sbmap,
- BlueStore::OnodeRef& onode_ref,
- read_alloc_stats_t& stats)
-{
- // create a map holding all physical-extents of this Onode to prevent duplication from being added twice and more
- std::unordered_map<uint64_t, uint32_t> lcl_extnt_map;
- unsigned blobs_count = 0;
- uint64_t pos = 0;
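+// Account a newly decoded blob: mark its physical extents in the simple
+// bitmap and update per-pool statfs. Shared blobs were already marked while
+// scanning PREFIX_SHARED_BLOB, so only their statfs attribution happens here.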
+void BlueStore::ExtentDecoderPartial::_consume_new_blob(bool spanning,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b)
+{
+ [[maybe_unused]] auto cct = store.cct;
+ ceph_assert(per_pool_statfs);
+ ceph_assert(oid != ghobject_t());
- stats.spanning_blob_count += onode_ref->extent_map.spanning_blob_map.size();
- // first iterate over all logical-extents
- for (struct Extent& l_extent : onode_ref->extent_map.extent_map) {
- ceph_assert(l_extent.logical_offset >= pos);
+ auto &blob = b->get_blob();
+ if (spanning) {
+ dout(20) << __func__ << " " << spanning << " " << b->id << dendl;
+ ceph_assert(b->id >= 0);
+ spanning_blobs[b->id] = b;
+ ++stats.spanning_blob_count;
+ } else {
+ dout(20) << __func__ << " " << spanning << " " << extent_no << dendl;
+ blobs[extent_no] = b;
+ }
+ bool compressed = blob.is_compressed();
+ if (!blob.is_shared()) {
+ for (auto& pe : blob.get_extents()) {
+ if (pe.offset == bluestore_pextent_t::INVALID_OFFSET) {
+ ++stats.skipped_illegal_extent;
+ continue;
+ }
+ store.set_allocation_in_simple_bmap(&sbmap, pe.offset, pe.length);
- pos = l_extent.logical_offset + l_extent.length;
- ceph_assert(l_extent.blob);
- const bluestore_blob_t& blob = l_extent.blob->get_blob();
- const PExtentVector& p_extent_vec = blob.get_extents();
- blobs_count++;
- if (blob.is_compressed()) {
- stats.compressed_blob_count++;
+ per_pool_statfs->allocated() += pe.length;
+ if (compressed) {
+ per_pool_statfs->compressed_allocated() += pe.length;
+ }
}
-
- if (blob.is_shared()) {
- stats.shared_blobs_count++;
+ if (compressed) {
+ per_pool_statfs->compressed() +=
+ blob.get_compressed_payload_length();
+ ++stats.compressed_blob_count;
}
-
- // process all physical extent in this blob
- for (auto p_extent = p_extent_vec.begin(); p_extent != p_extent_vec.end(); p_extent++) {
- auto offset = p_extent->offset;
- auto length = p_extent->length;
-
- // Offset of -1 means that the extent was removed (and it is only a place holder) and can be safely skipped
- if (offset == (uint64_t)-1) {
- stats.skipped_illegal_extent++;
- continue;
+ } else {
+ auto it = sb_info.find(sbid);
+ if (it == sb_info.end()) {
+ derr << __func__ << " shared blob not found:" << sbid
+ << dendl;
+ }
+ auto &sbi = *it;
+ auto pool_id = oid.hobj.get_logical_pool();
+ if (sbi.pool_id == sb_info_t::INVALID_POOL_ID) {
+ sbi.pool_id = pool_id;
+ size_t alloc_delta = sbi.allocated_chunks << min_alloc_size_order;
+ per_pool_statfs->allocated() += alloc_delta;
+ if (compressed) {
+ per_pool_statfs->compressed_allocated() += alloc_delta;
+ ++stats.compressed_blob_count;
}
+ }
+ if (compressed) {
+ per_pool_statfs->compressed() +=
+ blob.get_compressed_payload_length();
+ }
+ }
+}
- if (!blob.is_shared()) {
- // skip repeating extents
- auto lcl_itr = lcl_extnt_map.find(offset);
- // extents using shared blobs might have differnt length
- if (lcl_itr != lcl_extnt_map.end() ) {
- // repeated extents must have the same length!
- ceph_assert(lcl_extnt_map[offset] == length);
- stats.skipped_repeated_extent++;
- } else {
- lcl_extnt_map[offset] = length;
- set_allocation_in_simple_bmap(sbmap, offset, length);
- stats.extent_count++;
- }
- } else {
- // extents using shared blobs might have differnt length
- set_allocation_in_simple_bmap(sbmap, offset, length);
- stats.extent_count++;
- }
+void BlueStore::ExtentDecoderPartial::consume_blobid(Extent* le,
+ bool spanning,
+ uint64_t blobid)
+{
+ [[maybe_unused]] auto cct = store.cct;
+ dout(20) << __func__ << " " << spanning << " " << blobid << dendl;
+ auto &map = spanning ? spanning_blobs : blobs;
+ auto it = map.find(blobid);
+ ceph_assert(it != map.end());
+ per_pool_statfs->stored() += le->length;
+ if (it->second->get_blob().is_compressed()) {
+ per_pool_statfs->compressed_original() += le->length;
+ }
+}
- } // physical-extents loop
+void BlueStore::ExtentDecoderPartial::consume_blob(Extent* le,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b)
+{
+ _consume_new_blob(false, extent_no, sbid, b);
+ per_pool_statfs->stored() += le->length;
+ if (b->get_blob().is_compressed()) {
+ per_pool_statfs->compressed_original() += le->length;
+ }
+}
- } // logical-extents loop
+void BlueStore::ExtentDecoderPartial::consume_spanning_blob(uint64_t sbid,
+ BlobRef b)
+{
+ _consume_new_blob(true, 0/*doesn't matter*/, sbid, b);
+}
- if (blobs_count < MAX_BLOBS_IN_ONODE) {
- stats.blobs_in_onode[blobs_count]++;
- } else {
- // store all counts higher than MAX_BLOBS_IN_ONODE in a single bucket at offset zero
- stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]++;
- }
+void BlueStore::ExtentDecoderPartial::reset(const ghobject_t& _oid,
+ volatile_statfs* _per_pool_statfs)
+{
+ oid = _oid;
+ per_pool_statfs = _per_pool_statfs;
+ blob_map_t empty;
+ blob_map_t empty2;
+ std::swap(blobs, empty);
+ std::swap(spanning_blobs, empty2);
}
-//-------------------------------------------------------------------------
int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats_t& stats)
{
- // finally add all space take by user data
- auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ sb_info_space_efficient_map_t sb_info;
+ // iterate over all shared blobs
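+ // pass 1: mark shared-blob allocations in the bitmap and remember per-sbid
+ // chunk counts so the onode scan below can attribute them to pools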
+ auto it = db->get_iterator(PREFIX_SHARED_BLOB, KeyValueDB::ITERATOR_NOCACHE);
if (!it) {
- // TBD - find a better error code
- derr << "failed db->get_iterator(PREFIX_OBJ)" << dendl;
- return -1;
+ derr << "failed getting shared blob's iterator" << dendl;
+ return -ENOENT;
+ }
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ const auto& key = it->key();
+ dout(20) << __func__ << " decode sb " << pretty_binary_string(key) << dendl;
+ uint64_t sbid = 0;
+ if (get_key_shared_blob(key, &sbid) != 0) {
+ derr << __func__ << " bad shared blob key '" << pretty_binary_string(key)
+ << "'" << dendl;
+ }
+ bluestore_shared_blob_t shared_blob(sbid);
+ bufferlist bl = it->value();
+ auto blp = bl.cbegin();
+ try {
+ decode(shared_blob, blp);
+ }
+ catch (ceph::buffer::error& e) {
+ derr << __func__ << " failed to decode Shared Blob"
+ << pretty_binary_string(key) << dendl;
+ continue;
+ }
+ dout(20) << __func__ << " " << shared_blob << dendl;
+ uint64_t allocated = 0;
+ for (auto& r : shared_blob.ref_map.ref_map) {
+ ceph_assert(r.first != bluestore_pextent_t::INVALID_OFFSET);
+ set_allocation_in_simple_bmap(sbmap, r.first, r.second.length);
+ allocated += r.second.length;
+ }
+ auto &sbi = sb_info.add_or_adopt(sbid);
+ ceph_assert(p2phase(allocated, min_alloc_size) == 0);
+ sbi.allocated_chunks += (allocated >> min_alloc_size_order);
+ ++stats.shared_blob_count;
+ }
+
+ it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (!it) {
+ derr << "failed getting onode's iterator" << dendl;
+ return -ENOENT;
}
- CollectionRef collection_ref;
- spg_t pgid;
- BlueStore::OnodeRef onode_ref;
- bool has_open_onode = false;
- uint32_t shard_id = 0;
uint64_t kv_count = 0;
uint64_t count_interval = 1'000'000;
+ ExtentDecoderPartial edecoder(*this,
+ stats,
+ *sbmap,
+ sb_info,
+ min_alloc_size_order);
+
// iterate over all ONodes stored in RocksDB
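+ // extent-shard keys sort immediately after their owning onode key, so the
+ // decoder state installed by reset() remains valid when shards are reached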
for (it->lower_bound(string()); it->valid(); it->next(), kv_count++) {
// trace an event after every million processed objects (typically every 5-10 seconds)
if (kv_count && (kv_count % count_interval == 0) ) {
- dout(5) << "processed objects count = " << kv_count << dendl;
- }
-
- // Shards - Code
- // add the extents from the shards to the main Obj
- if (is_extent_shard_key(it->key())) {
- // shards must follow a valid main object
- if (has_open_onode) {
- // shards keys must start with the main object key
- if (it->key().find(onode_ref->key) == 0) {
- // shards count can't exceed declared shard-count in the main-object
- if (shard_id < onode_ref->extent_map.shards.size()) {
- onode_ref->extent_map.provide_shard_info_to_onode(it->value(), shard_id);
- stats.shard_count++;
- shard_id++;
- } else {
- derr << "illegal shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
- derr << "shard->key=" << pretty_binary_string(it->key()) << dendl;
- ceph_assert(shard_id < onode_ref->extent_map.shards.size());
- }
- } else {
- derr << "illegal shard-key::onode->key=" << pretty_binary_string(onode_ref->key) << " shard->key=" << pretty_binary_string(it->key()) << dendl;
- ceph_assert(it->key().find(onode_ref->key) == 0);
- }
- } else {
- derr << "error::shard without main objects for key=" << pretty_binary_string(it->key()) << dendl;
- ceph_assert(has_open_onode);
- }
-
- } else {
- // Main Object Code
+ dout(5) << __func__ << " processed objects count = " << kv_count << dendl;
+ }
- if (has_open_onode) {
- // make sure we got all shards of this object
- if (shard_id == onode_ref->extent_map.shards.size()) {
- // We completed an Onode Object -> pass it to be processed
- read_allocation_from_single_onode(sbmap, onode_ref, stats);
- } else {
- derr << "Missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
- ceph_assert(shard_id == onode_ref->extent_map.shards.size());
- }
- } else {
- // We opened a new Object
- has_open_onode = true;
+ auto key = it->key();
+ auto okey = key;
+ dout(20) << __func__ << " decode onode " << pretty_binary_string(key) << dendl;
+ ghobject_t oid;
+ if (!is_extent_shard_key(it->key())) {
+ int r = get_key_object(okey, &oid);
+ if (r != 0) {
+ derr << __func__ << " failed to decode onode key = "
+ << pretty_binary_string(okey) << dendl;
+ return -EIO;
}
-
- // The main Obj is always first in RocksDB so we can start with shard_id set to zero
- shard_id = 0;
- stats.onode_count++;
- ghobject_t oid;
- int ret = get_key_object(it->key(), &oid);
- if (ret < 0) {
- derr << "bad object key " << pretty_binary_string(it->key()) << dendl;
- ceph_assert(ret == 0);
- continue;
+ edecoder.reset(oid,
+ &stats.actual_pool_vstatfs[oid.hobj.get_logical_pool()]);
+ Onode dummy_on(cct);
+ Onode::decode_raw(&dummy_on,
+ it->value(),
+ edecoder);
+ ++stats.onode_count;
+ } else {
+ uint32_t offset;
+ int r = get_key_extent_shard(key, &okey, &offset);
+ if (r != 0) {
+ derr << __func__ << " failed to decode onode extent key = "
+ << pretty_binary_string(key) << dendl;
+ return -EIO;
}
-
- // fill collection_ref if doesn't exist yet
- // We process all the obejcts in a given collection and then move to the next collection
- // This means we only search once for every given collection
- if (!collection_ref ||
- oid.shard_id != pgid.shard ||
- oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
- !collection_ref->contains(oid)) {
- stats.collection_search++;
- collection_ref = nullptr;
-
- for (auto& p : coll_map) {
- if (p.second->contains(oid)) {
- collection_ref = p.second;
- break;
- }
- }
-
- if (!collection_ref) {
- derr << "stray object " << oid << " not owned by any collection" << dendl;
- ceph_assert(collection_ref);
- continue;
- }
-
- collection_ref->cid.is_pg(&pgid);
+ r = get_key_object(okey, &oid);
+ if (r != 0) {
+ derr << __func__
+ << " failed to decode onode key = " << pretty_binary_string(okey)
+ << " from extent key = " << pretty_binary_string(key)
+ << dendl;
+ return -EIO;
}
- onode_ref.reset(BlueStore::Onode::decode(collection_ref, oid, it->key(), it->value()));
+ ceph_assert(oid == edecoder.get_oid());
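+ // Collection is nullptr: the partial decoder only gathers statfs and
+ // allocation data and never builds blob ref maps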
+ edecoder.decode_some(it->value(), nullptr);
+ ++stats.shard_count;
}
}
- // process the last object
- if (has_open_onode) {
- // make sure we got all shards of this object
- if (shard_id == onode_ref->extent_map.shards.size()) {
- // We completed an Onode Object -> pass it to be processed
- read_allocation_from_single_onode(sbmap, onode_ref, stats);
- } else {
- derr << "Last Object is missing shards! shard_id=" << shard_id << ", shards.size()=" << onode_ref->extent_map.shards.size() << dendl;
- ceph_assert(shard_id == onode_ref->extent_map.shards.size());
- }
+ std::lock_guard l(vstatfs_lock);
+ store_statfs_t s;
+ osd_pools.clear();
+ for (auto& p : stats.actual_pool_vstatfs) {
+ if (per_pool_stat_collection) {
+ osd_pools[p.first] = p.second;
+ }
+ stats.actual_store_vstatfs += p.second;
+ p.second.publish(&s);
+ dout(5) << __func__ << " recovered pool "
+ << std::hex
+ << p.first << "->" << s
+ << std::dec
+ << " per-pool:" << per_pool_stat_collection
+ << dendl;
}
- dout(5) << "onode_count=" << stats.onode_count << " ,shard_count=" << stats.shard_count << dendl;
-
+ vstatfs = stats.actual_store_vstatfs;
+ vstatfs.publish(&s);
+ dout(5) << __func__ << " recovered " << s
+ << dendl;
return 0;
}
};
#define META_POOL_ID ((uint64_t)-1ull)
+using bptr_c_it_t = buffer::ptr::const_iterator;
class BlueStore : public ObjectStore,
public md_config_obs_t {
}
}
void decode(
- Collection */*coll*/,
ceph::buffer::ptr::const_iterator& p,
- bool include_ref_map) {
+ bool include_ref_map,
+ Collection */*coll*/) {
const char *start = p.get_pos();
denc(blob, p);
const char *end = p.get_pos();
}
}
void decode(
- Collection *coll,
ceph::buffer::ptr::const_iterator& p,
uint64_t struct_v,
uint64_t* sbid,
- bool include_ref_map);
+ bool include_ref_map,
+ Collection *coll);
#endif
};
typedef boost::intrusive_ptr<Blob> BlobRef;
bool loaded = false; ///< true if shard is loaded
bool dirty = false; ///< true if shard is dirty and needs reencoding
};
+
mempool::bluestore_cache_meta::vector<Shard> shards; ///< shards
ceph::buffer::list inline_bl; ///< cached encoded map, if unsharded; empty=>dirty
void operator()(Extent *e) { delete e; }
};
- ExtentMap(Onode *o);
+ ExtentMap(Onode *o, size_t inline_shard_prealloc_size);
~ExtentMap() {
extent_map.clear_and_dispose(DeleteDisposer());
}
bool encode_some(uint32_t offset, uint32_t length, ceph::buffer::list& bl,
unsigned *pn);
+
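+ // Template-method decoder for the extent map wire format: decode_some()
+ // and decode_spanning_blobs() drive the parsing and hand every decoded
+ // piece to the consume_*() hooks, so a subclass can either materialize a
+ // full ExtentMap (ExtentDecoderFull) or merely gather statistics.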
+ class ExtentDecoder {
+ uint64_t pos = 0;
+ uint64_t prev_len = 0;
+ uint64_t extent_pos = 0;
+ protected:
+ virtual void consume_blobid(Extent* le,
+ bool spanning,
+ uint64_t blobid) = 0;
+ virtual void consume_blob(Extent* le,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b) = 0;
+ virtual void consume_spanning_blob(uint64_t sbid, BlobRef b) = 0;
+ virtual Extent* get_next_extent() = 0;
+ virtual void add_extent(Extent*) = 0;
+
+ void decode_extent(Extent* le,
+ __u8 struct_v,
+ bptr_c_it_t& p,
+ Collection* c);
+ public:
+ virtual ~ExtentDecoder() {
+ }
+
+ unsigned decode_some(const ceph::buffer::list& bl, Collection* c);
+ void decode_spanning_blobs(bptr_c_it_t& p, Collection* c);
+ };
+
+ class ExtentDecoderFull : public ExtentDecoder {
+ ExtentMap& extent_map;
+ std::vector<BlobRef> blobs;
+ protected:
+ void consume_blobid(Extent* le, bool spanning, uint64_t blobid) override;
+ void consume_blob(Extent* le,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b) override;
+ void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+ Extent* get_next_extent() override;
+ void add_extent(Extent*) override;
+ public:
+ ExtentDecoderFull(ExtentMap& _extent_map) : extent_map(_extent_map) {
+ }
+ };
+
unsigned decode_some(ceph::buffer::list& bl);
void bound_encode_spanning_blobs(size_t& p);
void encode_spanning_blobs(ceph::buffer::list::contiguous_appender& p);
- void decode_spanning_blobs(ceph::buffer::ptr::const_iterator& p);
-
BlobRef get_spanning_blob(int id) {
auto p = spanning_blob_map.find(id);
ceph_assert(p != spanning_blob_map.end());
/// split a blob (and referring extents)
BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos);
-
- void provide_shard_info_to_onode(bufferlist v, uint32_t shard_id);
};
/// Compressed Blob Garbage collector
exists(false),
cached(false),
pinned(false),
- extent_map(this) {
+ extent_map(this,
+ c->store->cct->_conf->
+ bluestore_extent_map_inline_shard_prealloc_size) {
}
Onode(Collection* c, const ghobject_t& o,
const std::string& k)
: nref(0),
- c(c),
- oid(o),
- key(k),
- exists(false),
- cached(false),
- pinned(false),
- extent_map(this) {
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ cached(false),
+ pinned(false),
+ extent_map(this,
+ c->store->cct->_conf->
+ bluestore_extent_map_inline_shard_prealloc_size) {
}
Onode(Collection* c, const ghobject_t& o,
const char* k)
: nref(0),
- c(c),
- oid(o),
- key(k),
- exists(false),
- cached(false),
- pinned(false),
- extent_map(this) {
+ c(c),
+ oid(o),
+ key(k),
+ exists(false),
+ cached(false),
+ pinned(false),
+ extent_map(this,
+ c->store->cct->_conf->
+ bluestore_extent_map_inline_shard_prealloc_size) {
}
-
+ Onode(CephContext* cct)
+ : nref(0),
+ c(nullptr),
+ exists(false),
+ cached(false),
+ pinned(false),
+ extent_map(this,
+ cct->_conf->
+ bluestore_extent_map_inline_shard_prealloc_size) {
+ }
+ static void decode_raw(
+ BlueStore::Onode* on,
+ const bufferlist& v,
+ ExtentMap::ExtentDecoder& edecoder);
static Onode* decode(
CollectionRef c,
const ghobject_t& oid,
int64_t& errors,
int64_t &warnings,
BlueStoreRepairer* repairer);
+
void _fsck_repair_shared_blobs(
BlueStoreRepairer& repairer,
shared_blob_2hash_tracker_t& sb_ref_counts,
bool is_rotational() override;
bool is_journal_rotational() override;
- bool is_db_rotational() ;
+ bool is_db_rotational();
std::string get_default_device_class() override {
std::string device_class;
int push_allocation_to_rocksdb();
int read_allocation_from_drive_for_bluestore_tool();
#endif
+ void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
+
private:
-#define MAX_BLOBS_IN_ONODE 128
struct read_alloc_stats_t {
- //read_alloc_stats_t() { memset(&this, 0, sizeof(read_alloc_stats_t)); }
uint32_t onode_count = 0;
uint32_t shard_count = 0;
- uint32_t skipped_repeated_extent = 0;
uint32_t skipped_illegal_extent = 0;
- uint32_t collection_search = 0;
- uint32_t pad_limit_count = 0;
-
- uint64_t shared_blobs_count = 0;
+ uint64_t shared_blob_count = 0;
uint64_t compressed_blob_count = 0;
uint64_t spanning_blob_count = 0;
uint64_t insert_count = 0;
uint64_t extent_count = 0;
-
- uint64_t saved_inplace_count = 0;
- uint32_t merge_insert_count = 0;
- uint32_t merge_inplace_count = 0;
-
- std::array<uint32_t, MAX_BLOBS_IN_ONODE+1>blobs_in_onode = {};
- //uint32_t blobs_in_onode[MAX_BLOBS_IN_ONODE+1];
+ std::map<uint64_t, volatile_statfs> actual_pool_vstatfs;
+ volatile_statfs actual_store_vstatfs;
+ };
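+ // ExtentDecoder specialization for allocation recovery: decodes onode
+ // extent maps without materializing Onode/ExtentMap structures, reusing a
+ // single scratch Extent and feeding allocations/statfs info directly.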
+ class ExtentDecoderPartial : public ExtentMap::ExtentDecoder {
+ BlueStore& store;
+ read_alloc_stats_t& stats;
+ SimpleBitmap& sbmap;
+ sb_info_space_efficient_map_t& sb_info;
+ uint8_t min_alloc_size_order;
+ Extent extent;
+ ghobject_t oid;
+ volatile_statfs* per_pool_statfs = nullptr;
+ blob_map_t blobs;
+ blob_map_t spanning_blobs;
+
+ void _consume_new_blob(bool spanning,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b);
+ protected:
+ void consume_blobid(Extent*, bool spanning, uint64_t blobid) override;
+ void consume_blob(Extent* le,
+ uint64_t extent_no,
+ uint64_t sbid,
+ BlobRef b) override;
+ void consume_spanning_blob(uint64_t sbid, BlobRef b) override;
+ Extent* get_next_extent() override {
+ ++stats.extent_count;
+ extent = Extent();
+ return &extent;
+ }
+ void add_extent(Extent*) override {
+ }
+ public:
+ ExtentDecoderPartial(BlueStore& _store,
+ read_alloc_stats_t& _stats,
+ SimpleBitmap& _sbmap,
+ sb_info_space_efficient_map_t& _sb_info,
+ uint8_t _min_alloc_size_order)
+ : store(_store), stats(_stats), sbmap(_sbmap), sb_info(_sb_info),
+ min_alloc_size_order(_min_alloc_size_order)
+ {}
+ const ghobject_t& get_oid() const {
+ return oid;
+ }
+ void reset(const ghobject_t& _oid,
+ volatile_statfs* _per_pool_statfs);
};
friend std::ostream& operator<<(std::ostream& out, const read_alloc_stats_t& stats) {
out << "==========================================================" << std::endl;
out << "NCB::onode_count = " ;out.width(10);out << stats.onode_count << std::endl
<< "NCB::shard_count = " ;out.width(10);out << stats.shard_count << std::endl
- << "NCB::shared_blobs_count = " ;out.width(10);out << stats.shared_blobs_count << std::endl
+ << "NCB::shared_blob_count = " ;out.width(10);out << stats.shared_blob_count << std::endl
<< "NCB::compressed_blob_count = " ;out.width(10);out << stats.compressed_blob_count << std::endl
<< "NCB::spanning_blob_count = " ;out.width(10);out << stats.spanning_blob_count << std::endl
- << "NCB::collection search = " ;out.width(10);out << stats.collection_search << std::endl
- << "NCB::skipped_repeated_extent = " ;out.width(10);out << stats.skipped_repeated_extent << std::endl
<< "NCB::skipped_illegal_extent = " ;out.width(10);out << stats.skipped_illegal_extent << std::endl
<< "NCB::extent_count = " ;out.width(10);out << stats.extent_count << std::endl
<< "NCB::insert_count = " ;out.width(10);out << stats.insert_count << std::endl;
- if (stats.merge_insert_count) {
- out << "NCB::merge_insert_count = " ;out.width(10);out << stats.merge_insert_count << std::endl;
- }
- if (stats.merge_inplace_count ) {
- out << "NCB::merge_inplace_count = " ;out.width(10);out << stats.merge_inplace_count << std::endl;
- out << "NCB::saved_inplace_count = " ;out.width(10);out << stats.saved_inplace_count << std::endl;
- out << "NCB::saved inplace per call = " ;out.width(10);out << stats.saved_inplace_count/stats.merge_inplace_count << std::endl;
- }
out << "==========================================================" << std::endl;
- for (unsigned i = 0; i < MAX_BLOBS_IN_ONODE; i++ ) {
- if (stats.blobs_in_onode[i]) {
- out << "NCB::We had " ;out.width(9); out << stats.blobs_in_onode[i]
- << " ONodes with "; out.width(3); out << i << " blobs" << std::endl;
- }
- }
-
- if (stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]) {
- out << "NCB::We had " ;out.width(9);out << stats.blobs_in_onode[MAX_BLOBS_IN_ONODE]
- << " ONodes with more than " << MAX_BLOBS_IN_ONODE << " blobs" << std::endl;
- }
return out;
}
int read_allocation_from_drive_on_startup();
int reconstruct_allocations(SimpleBitmap *smbmp, read_alloc_stats_t &stats);
int read_allocation_from_onodes(SimpleBitmap *smbmp, read_alloc_stats_t& stats);
- void read_allocation_from_single_onode(SimpleBitmap *smbmp, BlueStore::OnodeRef& onode_ref, read_alloc_stats_t& stats);
- void set_allocation_in_simple_bmap(SimpleBitmap* sbmap, uint64_t offset, uint64_t length);
int commit_freelist_type();
int commit_to_null_manager();
int commit_to_real_manager();