From: Igor Fedotov Date: Fri, 7 Jan 2022 21:57:26 +0000 (+0300) Subject: os/bluestore: Adam's PR comments resolution X-Git-Tag: v17.1.0~46^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a902d22b6c785099c704a229db1dc1e6fefee3e2;p=ceph.git os/bluestore: Adam's PR comments resolution to be squashed before merge! Signed-off-by: Igor Fedotov --- diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index d9129e37ebb..bcc992ce33b 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -4849,13 +4849,14 @@ options: default: 2 with_legacy: true - name: bluestore_fsck_shared_blob_tracker_size - type: size + type: float level: dev - desc: Size in bytes for a hash table to track shared blobs ref counts. Higher the size, more precise is the tracker -> less overhead during the repair. - default: 128_M + desc: Size(a fraction of osd_memory_target, defaults to 128MB) of a hash table to track shared blobs ref counts. Higher the size, more precise is the tracker -> less overhead during the repair. + default: 0.03125 + see_also: + - osd_memory_target flags: - runtime - with_legacy: true - name: bluestore_throttle_bytes type: size level: advanced diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 876005de32b..a8140f29096 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7830,94 +7830,121 @@ void BlueStore::_fsck_repair_shared_blobs( if (!sb_ref_mismatches) // not expected to succeed, just in case return; - mempool::bluestore_fsck::map ref_maps; - auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); - if (it) { - CollectionRef c; - spg_t pgid; - for (it->lower_bound(string()); it->valid(); it->next()) { - dout(30) << __func__ << " key " - << pretty_binary_string(it->key()) << dendl; - if (is_extent_shard_key(it->key())) { - continue; - } - - ghobject_t oid; - int r = get_key_object(it->key(), &oid); - if (r < 0) { - continue; - } - - if (!c || - oid.shard_id != pgid.shard || - oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || - !c->contains(oid)) { - c = nullptr; - for (auto& p : coll_map) { - if (p.second->contains(oid)) { - c = p.second; - break; - } - } - if (!c) { - continue; - } - } - dout(20) << __func__ << " inspecting shared blob refs for col:" << c->cid - << " obj:" << oid << dendl; - OnodeRef o; - o.reset(Onode::decode(c, oid, it->key(), it->value())); - o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + auto foreach_shared_blob = [&](std::function< + void (coll_t, + ghobject_t, + uint64_t, + const bluestore_blob_t&)> cb) { + auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE); + if (it) { + CollectionRef c; + spg_t pgid; + for (it->lower_bound(string()); it->valid(); it->next()) { + dout(30) << __func__ << " key " + << pretty_binary_string(it->key()) + << dendl; + if (is_extent_shard_key(it->key())) { + continue; + } - _dump_onode<30>(cct, *o); + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0) { + continue; + } - mempool::bluestore_fsck::set passed_sbs; - for (auto& e : o->extent_map.extent_map) { - auto& b = e.blob->get_blob(); - if (b.is_shared() && passed_sbs.count(e.blob) == 0) { - auto sbid = e.blob->shared_blob->get_sbid(); - passed_sbs.emplace(e.blob); - bluestore_extent_ref_map_t ref_map_candidate; - bool to_be_updated = false; - auto it = ref_maps.lower_bound(sbid); - bool add_existing = it != ref_maps.end() && it->first == sbid; - for (auto& p : b.get_extents()) { - if (p.is_valid()) { - if (add_existing) { - it->second.get(p.offset, p.length); - } else { - ref_map_candidate.get(p.offset, p.length); - if (!to_be_updated) { - to_be_updated = - !sb_ref_counts.test_all_zero_range(sbid, p.offset, p.length); - } + if (!c || + oid.shard_id != pgid.shard || + oid.hobj.get_logical_pool() != (int64_t)pgid.pool() || + !c->contains(oid)) { + c = nullptr; + for (auto& p : coll_map) { + if (p.second->contains(oid)) { + c = p.second; + break; } } - } - if (to_be_updated) { - ceph_assert(!ref_map_candidate.empty()); - ceph_assert(!add_existing); - ref_maps.emplace_hint(it, sbid, std::move(ref_map_candidate)); - } - dout(20) << __func__ << " inspected shared blob refs for col:" << c->cid - << " obj:" << oid << " sbid 0x " << std::hex << sbid << std::dec - << " existed " << add_existing - << " to be updated " << to_be_updated - << dendl; - } // if b.shared .... - } // for ... extent_map - } // for ... it->valid - } //if (it(PREFIX_OBJ)) + if (!c) { + continue; + } + } + dout(20) << __func__ + << " inspecting shared blob refs for col:" << c->cid + << " obj:" << oid + << dendl; + + OnodeRef o; + o.reset(Onode::decode(c, oid, it->key(), it->value())); + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + + _dump_onode<30>(cct, *o); + + mempool::bluestore_fsck::set passed_sbs; + for (auto& e : o->extent_map.extent_map) { + auto& b = e.blob->get_blob(); + if (b.is_shared() && passed_sbs.count(e.blob) == 0) { + auto sbid = e.blob->shared_blob->get_sbid(); + cb(c->cid, oid, sbid, b); + passed_sbs.emplace(e.blob); + } + } // for ... extent_map + } // for ... it->valid + } //if (it(PREFIX_OBJ)) + }; //foreach_shared_blob fn declaration + + mempool::bluestore_fsck::map refs_map; + + // first iteration over objects to identify all the broken sbids + foreach_shared_blob( [&](coll_t cid, + ghobject_t oid, + uint64_t sbid, + const bluestore_blob_t& b) { + auto it = refs_map.lower_bound(sbid); + if(it != refs_map.end() && it->first == sbid) { + return; + } + for (auto& p : b.get_extents()) { + if (p.is_valid() && + !sb_ref_counts.test_all_zero_range(sbid, + p.offset, + p.length)) { + refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t()); + dout(20) << __func__ + << " broken shared blob found for col:" << cid + << " obj:" << oid + << " sbid 0x " << std::hex << sbid << std::dec + << dendl; + break; + } + } + }); + + // second iteration over objects to build new ref map for the broken sbids + foreach_shared_blob( [&](coll_t cid, + ghobject_t oid, + uint64_t sbid, + const bluestore_blob_t& b) { + auto it = refs_map.find(sbid); + if(it == refs_map.end()) { + return; + } + for (auto& p : b.get_extents()) { + if (p.is_valid()) { + it->second.get(p.offset, p.length); + break; + } + } + }); // update shared blob records - auto ref_it = ref_maps.begin(); - while (ref_it != ref_maps.end()) { + auto ref_it = refs_map.begin(); + while (ref_it != refs_map.end()) { size_t cnt = 0; const size_t max_transactions = 4096; KeyValueDB::Transaction txn = db->get_transaction(); for (cnt = 0; - cnt < max_transactions && ref_it != ref_maps.end(); + cnt < max_transactions && ref_it != refs_map.end(); ref_it++) { auto sbid = ref_it->first; dout(20) << __func__ << " repaired shared_blob 0x" @@ -8976,13 +9003,18 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair) int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) { - dout(5) << __func__ << "::NCB::entered" << dendl; + uint64_t sb_hash_size = uint64_t( + cct->_conf.get_val("osd_memory_target") * + cct->_conf.get_val( + "bluestore_fsck_shared_blob_tracker_size")); + dout(1) << __func__ << " <<>>" << (repair ? " repair" : " check") << (depth == FSCK_DEEP ? " (deep)" : depth == FSCK_SHALLOW ? " (shallow)" : " (regular)") - << " start" << dendl; + << " start sb_tracker_hash_size:" << sb_hash_size + << dendl; int64_t errors = 0; int64_t warnings = 0; unsigned repaired = 0; @@ -8997,7 +9029,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) sb_info_space_efficient_map_t sb_info; shared_blob_2hash_tracker_t sb_ref_counts( - cct->_conf->bluestore_fsck_shared_blob_tracker_size, + sb_hash_size, min_alloc_size); size_t sb_ref_mismatches = 0; @@ -9168,8 +9200,9 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) for (it->lower_bound(string()); it->valid(); it->next()) { string key = it->key(); uint64_t sbid; - if (get_key_shared_blob(key, &sbid)) { - // this gonna to be handled at the second stage + if (get_key_shared_blob(key, &sbid) < 0) { + // Failed to parse the key. + // This gonna to be handled at the second stage continue; } bluestore_shared_blob_t shared_blob(sbid); @@ -9246,7 +9279,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) errors += sb_ref_mismatches; } - if (depth != FSCK_SHALLOW && repair && sb_ref_mismatches) { + if (depth != FSCK_SHALLOW && repair) { _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info); } dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 98d9a91fafe..918eb10d5ca 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -3786,9 +3786,6 @@ public: public: void fix_per_pool_omap(KeyValueDB *db, int); bool remove_key(KeyValueDB *db, const std::string& prefix, const std::string& key); - //bool fix_shared_blob(KeyValueDB *db, - // uint64_t sbid, - // const ceph::buffer::list* bl); bool fix_shared_blob(KeyValueDB::Transaction txn, uint64_t sbid, bluestore_extent_ref_map_t* ref_map, diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 78f9d634757..88b6e9efad7 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -1200,18 +1200,22 @@ class shared_blob_2hash_tracker_t : public ref_counter_2hash_tracker_t { static const size_t hash_input_len = 3; + typedef std::array hash_input_t; - inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const; + static size_t get_hash_input_size() { return hash_input_len * sizeof(hash_input_t::value_type); } + inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const; + size_t au_void_bits = 0; + public: shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit) : ref_counter_2hash_tracker_t(mem_cap) { - ceph_assert(!!alloc_unit); + ceph_assert(alloc_unit); ceph_assert(isp2(alloc_unit)); au_void_bits = ctz(alloc_unit); } @@ -1302,7 +1306,7 @@ struct sb_info_space_efficient_map_t { if (aux_items.size() != 0) { auto it = std::lower_bound( aux_items.begin(), - aux_items.end() - 1, + aux_items.end(), id, [](const sb_info_t& a, const uint64_t& b) { return a < b; @@ -1342,21 +1346,22 @@ struct sb_info_space_efficient_map_t { } private: sb_info_t& _add(int64_t id) { - if (items.size() == 0 || uint64_t(std::abs(id)) > items.back().get_sbid()) { + uint64_t n_id = uint64_t(std::abs(id)); + if (items.size() == 0 || n_id > items.back().get_sbid()) { return items.emplace_back(id); } - auto it = find(uint64_t(std::abs(id))); + auto it = find(n_id); if (it != items.end()) { return *it; } - if (aux_items.size() == 0 || uint64_t(std::abs(id)) > aux_items.back().get_sbid()) { + if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) { return aux_items.emplace_back(id); } // do sorted insertion, may be expensive! it = std::upper_bound( aux_items.begin(), - aux_items.end() - 1, - uint64_t(std::abs(id)), + aux_items.end(), + n_id, [](const uint64_t& a, const sb_info_t& b) { return a < b.get_sbid(); }); diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 029bdc14917..0ada1ae37c0 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -40,10 +40,11 @@ #include "common/pretty_binary.h" #include "include/stringify.h" #include "include/coredumpctl.h" - #include "include/unordered_map.h" +#include "os/kv.h" #include "store_test_fixture.h" + using namespace std; using namespace std::placeholders; @@ -9212,6 +9213,82 @@ TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) { bstore->mount(); } +TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) { + if (string(GetParam()) != "bluestore") + return; + if (smr) { + cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl; + return; + } + + SetVal(g_conf(), "bluestore_fsck_on_mount", "false"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "false"); + + const size_t block_size = 0x1000; + StartDeferred(block_size); + + BlueStore* bstore = dynamic_cast (store.get()); + + // fill the store with some data + const uint64_t pool = 555; + coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD)); + auto ch = store->create_new_collection(cid); + + ghobject_t hoid = make_object("Object 1", pool); + ghobject_t hoid_cloned = hoid; + hoid_cloned.hobj.snap = 1; + ghobject_t hoid2 = make_object("Object 2", pool); + + string s(block_size, 1); + bufferlist bl; + bl.append(s); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + // check the scenario when shared blob contains + // references to extents from two objects which don't overlapp + // o1 -> 0x2000~1K + // o2 -> 0x4000~1k + cerr << "introduce 2 non-overlapped extents in a shared blob" + << std::endl; + { + ObjectStore::Transaction t; + t.write(cid, hoid, 0, bl.length(), bl); + t.write(cid, hoid2, 0, bl.length(), bl); // to make a gap in allocations + t.write(cid, hoid, block_size * 2 , bl.length(), bl); + t.clone(cid, hoid, hoid_cloned); + t.zero(cid, hoid, 0, bl.length()); + t.zero(cid, hoid_cloned, block_size * 2, bl.length()); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bstore->umount(); + bstore->mount(); + { + string key; + _key_encode_u64(1, &key); + bluestore_shared_blob_t sb(1); + sb.ref_map.get(0x2000, block_size); + sb.ref_map.get(0x4000, block_size); + sb.ref_map.get(0x4000, block_size); + bufferlist bl; + encode(sb, bl); + bstore->inject_broken_shared_blob_key(key, bl); + } + bstore->umount(); + ASSERT_EQ(bstore->fsck(false), 2); + ASSERT_EQ(bstore->repair(false), 0); + ASSERT_EQ(bstore->fsck(false), 0); + + cerr << "Completing" << std::endl; + bstore->mount(); +} + TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) { if (string(GetParam()) != "bluestore") return;