if (!sb_ref_mismatches) // not expected to succeed, just in case
return;
- mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> ref_maps;
- auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
- if (it) {
- CollectionRef c;
- spg_t pgid;
- for (it->lower_bound(string()); it->valid(); it->next()) {
- dout(30) << __func__ << " key "
- << pretty_binary_string(it->key()) << dendl;
- if (is_extent_shard_key(it->key())) {
- continue;
- }
-
- ghobject_t oid;
- int r = get_key_object(it->key(), &oid);
- if (r < 0) {
- continue;
- }
-
- if (!c ||
- oid.shard_id != pgid.shard ||
- oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
- !c->contains(oid)) {
- c = nullptr;
- for (auto& p : coll_map) {
- if (p.second->contains(oid)) {
- c = p.second;
- break;
- }
- }
- if (!c) {
- continue;
- }
- }
- dout(20) << __func__ << " inspecting shared blob refs for col:" << c->cid
- << " obj:" << oid << dendl;
- OnodeRef o;
- o.reset(Onode::decode(c, oid, it->key(), it->value()));
- o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ auto foreach_shared_blob = [&](std::function<
+ void (coll_t,
+ ghobject_t,
+ uint64_t,
+ const bluestore_blob_t&)> cb) {
+ auto it = db->get_iterator(PREFIX_OBJ, KeyValueDB::ITERATOR_NOCACHE);
+ if (it) {
+ CollectionRef c;
+ spg_t pgid;
+ for (it->lower_bound(string()); it->valid(); it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key())
+ << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
- _dump_onode<30>(cct, *o);
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0) {
+ continue;
+ }
- mempool::bluestore_fsck::set<BlobRef> passed_sbs;
- for (auto& e : o->extent_map.extent_map) {
- auto& b = e.blob->get_blob();
- if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
- auto sbid = e.blob->shared_blob->get_sbid();
- passed_sbs.emplace(e.blob);
- bluestore_extent_ref_map_t ref_map_candidate;
- bool to_be_updated = false;
- auto it = ref_maps.lower_bound(sbid);
- bool add_existing = it != ref_maps.end() && it->first == sbid;
- for (auto& p : b.get_extents()) {
- if (p.is_valid()) {
- if (add_existing) {
- it->second.get(p.offset, p.length);
- } else {
- ref_map_candidate.get(p.offset, p.length);
- if (!to_be_updated) {
- to_be_updated =
- !sb_ref_counts.test_all_zero_range(sbid, p.offset, p.length);
- }
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
}
}
- }
- if (to_be_updated) {
- ceph_assert(!ref_map_candidate.empty());
- ceph_assert(!add_existing);
- ref_maps.emplace_hint(it, sbid, std::move(ref_map_candidate));
- }
- dout(20) << __func__ << " inspected shared blob refs for col:" << c->cid
- << " obj:" << oid << " sbid 0x " << std::hex << sbid << std::dec
- << " existed " << add_existing
- << " to be updated " << to_be_updated
- << dendl;
- } // if b.shared ....
- } // for ... extent_map
- } // for ... it->valid
- } //if (it(PREFIX_OBJ))
+ if (!c) {
+ continue;
+ }
+ }
+ dout(20) << __func__
+ << " inspecting shared blob refs for col:" << c->cid
+ << " obj:" << oid
+ << dendl;
+
+ OnodeRef o;
+ o.reset(Onode::decode(c, oid, it->key(), it->value()));
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+ _dump_onode<30>(cct, *o);
+
+ mempool::bluestore_fsck::set<BlobRef> passed_sbs;
+ for (auto& e : o->extent_map.extent_map) {
+ auto& b = e.blob->get_blob();
+ if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
+ auto sbid = e.blob->shared_blob->get_sbid();
+ cb(c->cid, oid, sbid, b);
+ passed_sbs.emplace(e.blob);
+ }
+ } // for ... extent_map
+ } // for ... it->valid
+ } //if (it(PREFIX_OBJ))
+ }; //foreach_shared_blob fn declaration
+
+ mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
+
+ // first iteration over objects to identify all the broken sbids
+ foreach_shared_blob( [&](coll_t cid,
+ ghobject_t oid,
+ uint64_t sbid,
+ const bluestore_blob_t& b) {
+ auto it = refs_map.lower_bound(sbid);
+ if(it != refs_map.end() && it->first == sbid) {
+ return;
+ }
+ for (auto& p : b.get_extents()) {
+ if (p.is_valid() &&
+ !sb_ref_counts.test_all_zero_range(sbid,
+ p.offset,
+ p.length)) {
+ refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
+ dout(20) << __func__
+ << " broken shared blob found for col:" << cid
+ << " obj:" << oid
+ << " sbid 0x " << std::hex << sbid << std::dec
+ << dendl;
+ break;
+ }
+ }
+ });
+
+ // second iteration over objects to build new ref map for the broken sbids
+ foreach_shared_blob( [&](coll_t cid,
+ ghobject_t oid,
+ uint64_t sbid,
+ const bluestore_blob_t& b) {
+ auto it = refs_map.find(sbid);
+ if(it == refs_map.end()) {
+ return;
+ }
+ for (auto& p : b.get_extents()) {
+ if (p.is_valid()) {
+ it->second.get(p.offset, p.length);
+ break;
+ }
+ }
+ });
// update shared blob records
- auto ref_it = ref_maps.begin();
- while (ref_it != ref_maps.end()) {
+ auto ref_it = refs_map.begin();
+ while (ref_it != refs_map.end()) {
size_t cnt = 0;
const size_t max_transactions = 4096;
KeyValueDB::Transaction txn = db->get_transaction();
for (cnt = 0;
- cnt < max_transactions && ref_it != ref_maps.end();
+ cnt < max_transactions && ref_it != refs_map.end();
ref_it++) {
auto sbid = ref_it->first;
dout(20) << __func__ << " repaired shared_blob 0x"
int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
{
- dout(5) << __func__ << "::NCB::entered" << dendl;
+ uint64_t sb_hash_size = uint64_t(
+ cct->_conf.get_val<Option::size_t>("osd_memory_target") *
+ cct->_conf.get_val<double>(
+ "bluestore_fsck_shared_blob_tracker_size"));
+
dout(1) << __func__
<< " <<<START>>>"
<< (repair ? " repair" : " check")
<< (depth == FSCK_DEEP ? " (deep)" :
depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
- << " start" << dendl;
+ << " start sb_tracker_hash_size:" << sb_hash_size
+ << dendl;
int64_t errors = 0;
int64_t warnings = 0;
unsigned repaired = 0;
sb_info_space_efficient_map_t sb_info;
shared_blob_2hash_tracker_t sb_ref_counts(
- cct->_conf->bluestore_fsck_shared_blob_tracker_size,
+ sb_hash_size,
min_alloc_size);
size_t sb_ref_mismatches = 0;
for (it->lower_bound(string()); it->valid(); it->next()) {
string key = it->key();
uint64_t sbid;
- if (get_key_shared_blob(key, &sbid)) {
- // this gonna to be handled at the second stage
+ if (get_key_shared_blob(key, &sbid) < 0) {
+ // Failed to parse the key.
+ // This gonna to be handled at the second stage
continue;
}
bluestore_shared_blob_t shared_blob(sbid);
errors += sb_ref_mismatches;
}
- if (depth != FSCK_SHALLOW && repair && sb_ref_mismatches) {
+ if (depth != FSCK_SHALLOW && repair) {
_fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
}
dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
: public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {
static const size_t hash_input_len = 3;
+
typedef std::array<uint64_t, hash_input_len> hash_input_t;
- inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
+
static size_t get_hash_input_size() {
return hash_input_len * sizeof(hash_input_t::value_type);
}
+ inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
+
size_t au_void_bits = 0;
+
public:
shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
: ref_counter_2hash_tracker_t(mem_cap) {
- ceph_assert(!!alloc_unit);
+ ceph_assert(alloc_unit);
ceph_assert(isp2(alloc_unit));
au_void_bits = ctz(alloc_unit);
}
if (aux_items.size() != 0) {
auto it = std::lower_bound(
aux_items.begin(),
- aux_items.end() - 1,
+ aux_items.end(),
id,
[](const sb_info_t& a, const uint64_t& b) {
return a < b;
}
private:
sb_info_t& _add(int64_t id) {
- if (items.size() == 0 || uint64_t(std::abs(id)) > items.back().get_sbid()) {
+ uint64_t n_id = uint64_t(std::abs(id));
+ if (items.size() == 0 || n_id > items.back().get_sbid()) {
return items.emplace_back(id);
}
- auto it = find(uint64_t(std::abs(id)));
+ auto it = find(n_id);
if (it != items.end()) {
return *it;
}
- if (aux_items.size() == 0 || uint64_t(std::abs(id)) > aux_items.back().get_sbid()) {
+ if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
return aux_items.emplace_back(id);
}
// do sorted insertion, may be expensive!
it = std::upper_bound(
aux_items.begin(),
- aux_items.end() - 1,
- uint64_t(std::abs(id)),
+ aux_items.end(),
+ n_id,
[](const uint64_t& a, const sb_info_t& b) {
return a < b.get_sbid();
});
#include "common/pretty_binary.h"
#include "include/stringify.h"
#include "include/coredumpctl.h"
-
#include "include/unordered_map.h"
+#include "os/kv.h"
#include "store_test_fixture.h"
+
using namespace std;
using namespace std::placeholders;
bstore->mount();
}
+TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ if (smr) {
+ cout << "TODO: repair mismatched write pointer (+ dead bytes mismatch)" << std::endl;
+ return;
+ }
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+
+ const size_t block_size = 0x1000;
+ StartDeferred(block_size);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ // fill the store with some data
+ const uint64_t pool = 555;
+ coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid = make_object("Object 1", pool);
+ ghobject_t hoid_cloned = hoid;
+ hoid_cloned.hobj.snap = 1;
+ ghobject_t hoid2 = make_object("Object 2", pool);
+
+ string s(block_size, 1);
+ bufferlist bl;
+ bl.append(s);
+ int r;
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ // check the scenario when shared blob contains
+ // references to extents from two objects which don't overlapp
+ // o1 -> 0x2000~1K
+ // o2 -> 0x4000~1k
+ cerr << "introduce 2 non-overlapped extents in a shared blob"
+ << std::endl;
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, bl.length(), bl);
+ t.write(cid, hoid2, 0, bl.length(), bl); // to make a gap in allocations
+ t.write(cid, hoid, block_size * 2 , bl.length(), bl);
+ t.clone(cid, hoid, hoid_cloned);
+ t.zero(cid, hoid, 0, bl.length());
+ t.zero(cid, hoid_cloned, block_size * 2, bl.length());
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bstore->umount();
+ bstore->mount();
+ {
+ string key;
+ _key_encode_u64(1, &key);
+ bluestore_shared_blob_t sb(1);
+ sb.ref_map.get(0x2000, block_size);
+ sb.ref_map.get(0x4000, block_size);
+ sb.ref_map.get(0x4000, block_size);
+ bufferlist bl;
+ encode(sb, bl);
+ bstore->inject_broken_shared_blob_key(key, bl);
+ }
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 2);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ cerr << "Completing" << std::endl;
+ bstore->mount();
+}
+
TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) {
if (string(GetParam()) != "bluestore")
return;