}
int BlueStore::_fsck_check_extents(
+ const coll_t& cid,
const ghobject_t& oid,
const PExtentVector& extents,
bool compressed,
mempool_dynamic_bitset &used_blocks,
uint64_t granularity,
+ BlueStoreRepairer* repairer,
store_statfs_t& expected_statfs)
{
dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
e.offset, e.length, granularity, used_blocks,
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
assert(pos < bs.size());
- if (bs.test(pos))
- already = true;
+ if (bs.test(pos)) {
+ if (repairer) {
+ repairer->note_misreference(
+ pos * min_alloc_size, min_alloc_size, !already);
+ }
+ if (!already) {
+ derr << "fsck error: " << oid << " extent " << e
+ << " or a subset is already allocated (misreferenced)" << dendl;
+ ++errors;
+ already = true;
+ }
+ }
else
bs.set(pos);
});
- if (already) {
- derr << " " << oid << " extent " << e
- << " or a subset is already allocated" << dendl;
- ++errors;
- }
+ if (repairer) {
+    repairer->get_space_usage_tracker().set_used(e.offset, e.length, cid, oid);
+ }
+
if (e.end() > bdev->get_size()) {
- derr << " " << oid << " extent " << e
+ derr << "fsck error: " << oid << " extent " << e
<< " past end of block device" << dendl;
++errors;
}
return errors;
}
+/**
+An overview of the currently implemented repair logic.
+fsck performs it in two stages: detection (+ preparation) and commit.
+Detection stage (in processing order):
+  (Issue -> Repair action to schedule)
+  - Detect undecodable keys for Shared Blobs -> Remove
+  - Detect undecodable records for Shared Blobs -> Remove
+    (might trigger missed Shared Blob detection below)
+  - Detect stray records for Shared Blobs -> Remove
+  - Detect misreferenced pextents -> Fix
+    Prepare a Bloom-like filter to track cid/oid -> pextent
+    Prepare a list of extents that are improperly referenced
+    Enumerate Onode records that might use 'misreferenced' pextents
+    (the Bloom-like filter is applied to reduce computation)
+    For each questionable Onode enumerate all blobs and identify broken ones
+    (i.e. blobs having 'misreferences')
+    Rewrite each broken blob's data by allocating new extents and
+    copying the data there
+    If the blob is shared - unshare it and mark the corresponding Shared Blob
+    for removal
+    Release the previously allocated space
+    Update the Extent Map
+  - Detect missed Shared Blobs -> Recreate
+  - Detect undecodable deferred transactions -> Remove
+  - Detect Freelist Manager's 'false free' entries -> Mark as used
+  - Detect Freelist Manager's leaked entries -> Mark as free
+  - Detect statfs inconsistency -> Update
+Commit stage (a separate DB commit per step):
+  - Apply leaked FM entries fix
+  - Apply 'false free' FM entries fix
+  - Apply 'Remove' actions
+  - Apply fix for misreferenced pextents
+  - Apply Shared Blob recreate
+    (can be merged with the step above if misreferences were detected)
+  - Apply StatFS update
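+
+A minimal sketch of how the two stages tie together, for orientation only
+(simplified; the actual _fsck() below interleaves detection with the metadata
+scan, and the names used here refer to variables from that function):
+
+  BlueStoreRepairer repairer;
+  // detection stage: schedule fixes while scanning
+  if (repair) {
+    repairer.remove_key(db, PREFIX_SHARED_BLOB, key);  // e.g. a stray record
+    repairer.fix_statfs(db, expected_statfs);          // e.g. statfs mismatch
+  }
+  ...
+  // commit stage: one synchronous DB submit per fix category
+  repaired = repairer.apply(db);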
+*/
int BlueStore::_fsck(bool deep, bool repair)
{
dout(1) << __func__
- << (repair ? " fsck" : " repair")
+ << " <<<START>>>"
+ << (repair ? " repair" : " check")
<< (deep ? " (deep)" : " (shallow)") << " start" << dendl;
int errors = 0;
- int repaired = 0;
+ unsigned repaired = 0;
typedef btree::btree_set<
uint64_t,std::less<uint64_t>,
KeyValueDB::Iterator it;
store_statfs_t expected_statfs, actual_statfs;
struct sb_info_t {
+ coll_t cid;
list<ghobject_t> oids;
SharedBlobRef sb;
bluestore_extent_ref_map_t ref_map;
- bool compressed;
+ bool compressed = false;
+ bool passed = false;
+ bool updated = false;
};
mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
uint64_t num_shared_blobs = 0;
uint64_t num_sharded_objects = 0;
uint64_t num_object_shards = 0;
+ BlueStoreRepairer repairer;
utime_t start = ceph_clock_now();
bs.set(pos);
}
);
+ if (repair) {
+ repairer.get_space_usage_tracker().init(
+ bdev->get_size(),
+ min_alloc_size);
+ }
if (bluefs) {
for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
oid.hobj.pool != (int64_t)pgid.pool() ||
!c->contains(oid)) {
c = nullptr;
- for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
- coll_map.begin();
- p != coll_map.end();
- ++p) {
- if (p->second->contains(oid)) {
- c = p->second;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
break;
}
}
++errors;
}
sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
+ assert(sbi.cid == coll_t() || sbi.cid == c->cid);
+ sbi.cid = c->cid;
sbi.sb = i.first->shared_blob;
sbi.oids.push_back(oid);
sbi.compressed = blob.is_compressed();
}
}
} else {
- errors += _fsck_check_extents(oid, blob.get_extents(),
+ errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
blob.is_compressed(),
used_blocks,
fm->get_alloc_size(),
+ repair ? &repairer : nullptr,
expected_statfs);
}
}
}
}
}
+
dout(1) << __func__ << " checking shared_blobs" << dendl;
it = db->get_iterator(PREFIX_SHARED_BLOB);
if (it) {
if (get_key_shared_blob(key, &sbid)) {
derr << "fsck error: bad key '" << key
<< "' in shared blob namespace" << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
++errors;
continue;
}
if (p == sb_info.end()) {
derr << "fsck error: found stray shared blob data for sbid 0x"
<< std::hex << sbid << std::dec << dendl;
+ if (repair) {
+ repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+ }
++errors;
} else {
++num_shared_blobs;
bluestore_shared_blob_t shared_blob(sbid);
bufferlist bl = it->value();
bufferlist::iterator blp = bl.begin();
- decode(shared_blob, blp);
+ try {
+ decode(shared_blob, blp);
+ } catch (buffer::error& e) {
+ ++errors;
+ // Force update and don't report as missing
+ sbi.updated = sbi.passed = true;
+
+ derr << "fsck error: failed to decode Shared Blob"
+ << pretty_binary_string(it->key()) << dendl;
+ if (repair) {
+ dout(20) << __func__ << " undecodable Shared Blob, key:'"
+ << pretty_binary_string(it->key())
+ << "', removing" << dendl;
+          repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
+ }
+ continue;
+ }
dout(20) << __func__ << " " << *sbi.sb << " " << shared_blob << dendl;
if (shared_blob.ref_map != sbi.ref_map) {
derr << "fsck error: shared blob 0x" << std::hex << sbid
- << std::dec << " ref_map " << shared_blob.ref_map
- << " != expected " << sbi.ref_map << dendl;
+ << std::dec << " ref_map " << shared_blob.ref_map
+ << " != expected " << sbi.ref_map << dendl;
+ sbi.updated = true; // will update later in repair mode only!
++errors;
}
PExtentVector extents;
for (auto &r : shared_blob.ref_map.ref_map) {
extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
}
- errors += _fsck_check_extents(p->second.oids.front(),
+ errors += _fsck_check_extents(sbi.cid,
+ p->second.oids.front(),
extents,
p->second.compressed,
used_blocks,
fm->get_alloc_size(),
+ repair ? &repairer : nullptr,
expected_statfs);
- sb_info.erase(p);
+ sbi.passed = true;
+ }
+ }
+ } // if (it)
+
+ if (repair && repairer.preprocess_misreference(db)) {
+ dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
+ auto& space_tracker = repairer.get_space_usage_tracker();
+ auto& misref_extents = repairer.get_misreferences();
+ interval_set<uint64_t> to_release;
+ it = db->get_iterator(PREFIX_OBJ);
+ if (it) {
+ CollectionRef c;
+ spg_t pgid;
+ KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
+ bool bypass_rest = false;
+ for (it->lower_bound(string()); it->valid() && !bypass_rest;
+ it->next()) {
+ dout(30) << __func__ << " key "
+ << pretty_binary_string(it->key()) << dendl;
+ if (is_extent_shard_key(it->key())) {
+ continue;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(it->key(), &oid);
+ if (r < 0 || !space_tracker.is_used(oid)) {
+ continue;
+ }
+
+ if (!c ||
+ oid.shard_id != pgid.shard ||
+ oid.hobj.pool != (int64_t)pgid.pool() ||
+ !c->contains(oid)) {
+ c = nullptr;
+ for (auto& p : coll_map) {
+ if (p.second->contains(oid)) {
+ c = p.second;
+ break;
+ }
+ }
+ if (!c) {
+ continue;
+ }
+ c->cid.is_pg(&pgid);
+ }
+ if (!space_tracker.is_used(c->cid)) {
+ continue;
+ }
+ dout(20) << __func__ << " check misreference for col:" << c->cid
+ << " obj:" << oid << dendl;
+
+ RWLock::RLocker l(c->lock);
+ OnodeRef o = c->get_onode(oid, false);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ mempool::bluestore_fsck::set<BlobRef> blobs;
+
+ for (auto& e : o->extent_map.extent_map) {
+ blobs.insert(e.blob);
+ }
+ bool need_onode_update = false;
+ bool first_dump = true;
+ for(auto b : blobs) {
+ bool broken_blob = false;
+ auto& pextents = b->dirty_blob().dirty_extents();
+ for (auto& e : pextents) {
+ if (!e.is_valid()) {
+ continue;
+ }
+ // for the sake of simplicity and proper shared blob handling
+ // always rewrite the whole blob even when it's partially
+ // misreferenced.
+ if (misref_extents.intersects(e.offset, e.length)) {
+ if (first_dump) {
+ first_dump = false;
+ _dump_onode(o, 10);
+ }
+ broken_blob = true;
+ break;
+ }
+ }
+ if (!broken_blob)
+ continue;
+ bool compressed = b->get_blob().is_compressed();
+ need_onode_update = true;
+ dout(10) << __func__
+ << " fix misreferences in oid:" << oid
+ << " " << *b << dendl;
+ uint64_t b_off = 0;
+ PExtentVector pext_to_release;
+ pext_to_release.reserve(pextents.size());
+ // rewriting all valid pextents
+ for (auto e = pextents.begin(); e != pextents.end();
+ b_off += e->length, e++) {
+ if (!e->is_valid()) {
+ continue;
+ }
+ int r = alloc->reserve(e->length);
+ if (r != 0) {
+ derr << __func__ << " failed to reserve 0x" << std::hex << e->length
+ << " bytes, misreferences repair's incomplete" << std::dec << dendl;
+ bypass_rest = true;
+ break;
+ }
+ PExtentVector exts;
+ int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
+ 0, 0, &exts);
+ if (alloc_len < (int64_t)e->length) {
+ derr << __func__ << " allocate failed on 0x" << std::hex << e->length
+ << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+ assert(0 == "allocate failed, wtf");
+ }
+ expected_statfs.allocated += e->length;
+ if (compressed) {
+ expected_statfs.compressed_allocated += e->length;
+ }
+ bufferlist bl;
+ IOContext ioc(cct, NULL, true); // allow EIO
+ r = bdev->read(e->offset, e->length, &bl, &ioc, false);
+ if (r < 0) {
+ derr << __func__ << " failed to read from 0x" << std::hex << e->offset
+ <<"~" << e->length << std::dec << dendl;
+ assert(0 == "read failed, wtf");
+ }
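+          // replace the misreferenced pextent in-place with the newly
+          // allocated extents and step the iterator past them; the old
+          // pextent is queued for release once the whole blob is processed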
+ pext_to_release.push_back(*e);
+ e = pextents.erase(e);
+ e = pextents.insert(e, exts.begin(), exts.end());
+ b->get_blob().map_bl(
+ b_off, bl,
+ [&](uint64_t offset, bufferlist& t) {
+ int r = bdev->write(offset, t, false);
+ assert(r == 0);
+ });
+ e += exts.size() - 1;
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+        } // for (auto e = pextents.begin(); e != pextents.end(); ...)
+
+ if (b->get_blob().is_shared()) {
+ b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
+
+ auto sb_it = sb_info.find(b->shared_blob->get_sbid());
+ assert(sb_it != sb_info.end());
+ sb_info_t& sbi = sb_it->second;
+
+ for (auto& r : sbi.ref_map.ref_map) {
+ expected_statfs.allocated -= r.second.length;
+ if (sbi.compressed) {
+ // NB: it's crucial to use compressed flag from sb_info_t
+ // as we originally used that value while accumulating
+ // expected_statfs
+ expected_statfs.compressed_allocated -= r.second.length;
+ }
+ }
+ sbi.updated = sbi.passed = true;
+ sbi.ref_map.clear();
+
+ // relying on blob's pextents to decide what to release.
+ for (auto& p : pext_to_release) {
+ to_release.insert(p.offset, p.length);
+ }
+ } else {
+ for (auto& p : pext_to_release) {
+ expected_statfs.allocated -= p.length;
+ if (compressed) {
+ expected_statfs.compressed_allocated -= p.length;
+ }
+ to_release.insert(p.offset, p.length);
+ }
+ }
+ if (bypass_rest) {
+ break;
+ }
+ } // for(auto b : blobs)
+ if (need_onode_update) {
+ o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
+ _record_onode(o, txn);
+ }
+ } // for (it->lower_bound(string()); it->valid(); it->next())
+
+ for (auto it = to_release.begin(); it != to_release.end(); ++it) {
+ dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
+ << "~" << it.get_len() << std::dec << dendl;
+ fm->release(it.get_start(), it.get_len(), txn);
}
- }
- }
+ alloc->release(to_release);
+ to_release.clear();
+    } // if (it)
+  } // if (repair && repairer.preprocess_misreference(db))
+
for (auto &p : sb_info) {
- derr << "fsck error: shared_blob 0x" << p.first
- << " key is missing (" << *p.second.sb << ")" << dendl;
- ++errors;
+ sb_info_t& sbi = p.second;
+ if (!sbi.passed) {
+ derr << "fsck error: missing " << *sbi.sb << dendl;
+ ++errors;
+ }
+ if (repair && (!sbi.passed || sbi.updated)) {
+ auto sbid = p.first;
+ if (sbi.ref_map.empty()) {
+ assert(sbi.passed);
+ dout(20) << __func__ << " " << *sbi.sb
+ << " is empty, removing" << dendl;
+ repairer.fix_shared_blob(db, sbid, nullptr);
+ } else {
+ bufferlist bl;
+ bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
+ encode(persistent, bl);
+ dout(20) << __func__ << " " << *sbi.sb
+ << " is " << bl.length() << " bytes, updating" << dendl;
+
+ repairer.fix_shared_blob(db, sbid, &bl);
+ }
+ }
}
+ sb_info.clear();
+
if (!(actual_statfs == expected_statfs)) {
derr << "fsck error: actual " << actual_statfs
<< " != expected " << expected_statfs << dendl;
+ if (repair) {
+ repairer.fix_statfs(db, expected_statfs);
+ }
++errors;
}
} catch (buffer::error& e) {
derr << "fsck error: failed to decode deferred txn "
<< pretty_binary_string(it->key()) << dendl;
- r = -EIO;
- goto out_scan;
+ if (repair) {
+ dout(20) << __func__ << " undecodable deferred TXN record, key: '"
+ << pretty_binary_string(it->key())
+ << "', removing" << dendl;
+ repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+ }
+ continue;
}
dout(20) << __func__ << " deferred " << wt.seq
<< " ops " << wt.ops.size()
[&](uint64_t pos, mempool_dynamic_bitset &bs) {
assert(pos < bs.size());
if (bs.test(pos)) {
- intersects = true;
+ if (offset == SUPER_RESERVED &&
+ length == min_alloc_size - SUPER_RESERVED) {
+ // this is due to the change just after luminous to min_alloc_size
+ // granularity allocations, and our baked in assumption at the top
+ // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+ // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
+ // since we will never allocate this region below min_alloc_size.
+ dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+ << " and min_alloc_size, 0x" << std::hex << offset << "~"
+ << length << dendl;
+ } else {
+ intersects = true;
+ if (repair) {
+ repairer.fix_false_free(db, fm,
+ pos * min_alloc_size,
+ min_alloc_size);
+ }
+ }
} else {
bs.set(pos);
}
}
);
if (intersects) {
- if (offset == SUPER_RESERVED &&
- length == min_alloc_size - SUPER_RESERVED) {
- // this is due to the change just after luminous to min_alloc_size
- // granularity allocations, and our baked in assumption at the top
- // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
- // (vs luminous's round_up_to(SUPER_RESERVED,block_size)). harmless,
- // since we will never allocate this region below min_alloc_size.
- dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
- << " and min_alloc_size, 0x" << std::hex << offset << "~"
- << length << dendl;
- } else {
- derr << "fsck error: free extent 0x" << std::hex << offset
- << "~" << length << std::dec
- << " intersects allocated blocks" << dendl;
- ++errors;
- }
+ derr << "fsck error: free extent 0x" << std::hex << offset
+ << "~" << length << std::dec
+ << " intersects allocated blocks" << dendl;
+ ++errors;
}
}
fm->enumerate_reset();
size_t count = used_blocks.count();
if (used_blocks.size() != count) {
assert(used_blocks.size() > count);
- ++errors;
used_blocks.flip();
size_t start = used_blocks.find_first();
while (start != decltype(used_blocks)::npos) {
while (true) {
size_t next = used_blocks.find_next(cur);
if (next != cur + 1) {
+ ++errors;
derr << "fsck error: leaked extent 0x" << std::hex
<< ((uint64_t)start * fm->get_alloc_size()) << "~"
<< ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
<< dendl;
+ if (repair) {
+ repairer.fix_leaked(db,
+ fm,
+ start * min_alloc_size,
+ (cur + 1 - start) * min_alloc_size);
+ }
start = next;
break;
}
used_blocks.flip();
}
}
-
+ if (repair) {
+ dout(5) << __func__ << " applying repair results" << dendl;
+ repaired = repairer.apply(db);
+ dout(5) << __func__ << " repair applied" << dendl;
+ }
out_scan:
mempool_thread.shutdown();
_flush_cache();
<< dendl;
utime_t duration = ceph_clock_now() - start;
- dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
- << " repaired, " << (errors - repaired) << " remaining in "
+ dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired
+ << " repaired, " << (errors - (int)repaired) << " remaining in "
<< duration << " seconds" << dendl;
- return errors - repaired;
+ return errors - (int)repaired;
+}
+
+/// methods to inject various errors fsck can repair
+void BlueStore::inject_broken_shared_blob_key(const string& key,
+ const bufferlist& bl)
+{
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+ txn->set(PREFIX_SHARED_BLOB, key, bl);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_leaked(uint64_t len)
+{
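+  // mark some space as allocated in the FreelistManager without referencing
+  // it from any onode, so a subsequent fsck reports it as leaked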
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+ PExtentVector exts;
+ int r = alloc->reserve(len);
+ assert(r == 0);
+ int64_t alloc_len = alloc->allocate(len, min_alloc_size,
+ min_alloc_size * 256, 0, &exts);
+ assert(alloc_len >= (int64_t)len);
+ for (auto& p : exts) {
+ fm->allocate(p.offset, p.length, txn);
+ }
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
+{
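+  // mark a pextent that is still referenced by the given object as free in
+  // the FreelistManager, producing a 'false free' entry for fsck to detect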
+ KeyValueDB::Transaction txn;
+ OnodeRef o;
+ CollectionRef c = _get_collection(cid);
+ assert(c);
+ {
+ RWLock::WLocker l(c->lock); // just to avoid internal asserts
+ o = c->get_onode(oid, false);
+ assert(o);
+ o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+ }
+
+ bool injected = false;
+ txn = db->get_transaction();
+ auto& em = o->extent_map.extent_map;
+ std::vector<const PExtentVector*> v;
+ if (em.size()) {
+ v.push_back(&em.begin()->blob->get_blob().get_extents());
+ }
+ if (em.size() > 1) {
+ auto it = em.end();
+ --it;
+ v.push_back(&(it->blob->get_blob().get_extents()));
+ }
+ for (auto pext : v) {
+ if (pext->size()) {
+ auto p = pext->begin();
+ while (p != pext->end()) {
+ if (p->is_valid()) {
+ dout(20) << __func__ << " release 0x" << std::hex << p->offset
+ << "~" << p->length << std::dec << dendl;
+ fm->release(p->offset, p->length, txn);
+ injected = true;
+ break;
+ }
+ ++p;
+ }
+ }
+ }
+ assert(injected);
+ db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_statfs(const store_statfs_t& new_statfs)
+{
+ BlueStoreRepairer repairer;
+ repairer.fix_statfs(db, new_statfs);
+ repairer.apply(db);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset)
+{
+ OnodeRef o1;
+ CollectionRef c1 = _get_collection(cid1);
+ assert(c1);
+ {
+ RWLock::WLocker l(c1->lock); // just to avoid internal asserts
+ o1 = c1->get_onode(oid1, false);
+ assert(o1);
+ o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ OnodeRef o2;
+ CollectionRef c2 = _get_collection(cid2);
+ assert(c2);
+ {
+ RWLock::WLocker l(c2->lock); // just to avoid internal asserts
+ o2 = c2->get_onode(oid2, false);
+ assert(o2);
+ o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+ }
+ Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+ Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+ // require onode/extent layout to be the same (and simple)
+ // to make things easier
+ assert(o1->onode.extent_map_shards.empty());
+ assert(o2->onode.extent_map_shards.empty());
+ assert(o1->extent_map.spanning_blob_map.size() == 0);
+ assert(o2->extent_map.spanning_blob_map.size() == 0);
+ assert(e1.logical_offset == e2.logical_offset);
+ assert(e1.length == e2.length);
+ assert(e1.blob_offset == e2.blob_offset);
+
+ KeyValueDB::Transaction txn;
+ txn = db->get_transaction();
+
+  // along with the misreference error this will also create space leak errors
+ e2.blob->dirty_blob() = e1.blob->get_blob();
+ o2->extent_map.dirty_range(offset, e2.length);
+ o2->extent_map.update(txn, false);
+
+ _record_onode(o2, txn);
+ db->submit_transaction_sync(txn);
}
void BlueStore::collect_metadata(map<string,string> *pm)
// finalize onodes
for (auto o : txc->onodes) {
- // finalize extent_map shards
- o->extent_map.update(t, false);
- if (o->extent_map.needs_reshard()) {
- o->extent_map.reshard(db, t);
- o->extent_map.update(t, true);
- if (o->extent_map.needs_reshard()) {
- dout(20) << __func__ << " warning: still wants reshard, check options?"
- << dendl;
- o->extent_map.clear_needs_reshard();
- }
- logger->inc(l_bluestore_onode_reshard);
- }
-
- // bound encode
- size_t bound = 0;
- denc(o->onode, bound);
- o->extent_map.bound_encode_spanning_blobs(bound);
- if (o->onode.extent_map_shards.empty()) {
- denc(o->extent_map.inline_bl, bound);
- }
-
- // encode
- bufferlist bl;
- unsigned onode_part, blob_part, extent_part;
- {
- auto p = bl.get_contiguous_appender(bound, true);
- denc(o->onode, p);
- onode_part = p.get_logical_offset();
- o->extent_map.encode_spanning_blobs(p);
- blob_part = p.get_logical_offset() - onode_part;
- if (o->onode.extent_map_shards.empty()) {
- denc(o->extent_map.inline_bl, p);
- }
- extent_part = p.get_logical_offset() - onode_part - blob_part;
- }
-
- dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
- << " (" << onode_part << " bytes onode + "
- << blob_part << " bytes spanning blobs + "
- << extent_part << " bytes inline extents)"
- << dendl;
- t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+ _record_onode(o, t);
o->flushing_count++;
}
}
}
+void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
+{
+ // finalize extent_map shards
+ o->extent_map.update(txn, false);
+ if (o->extent_map.needs_reshard()) {
+ o->extent_map.reshard(db, txn);
+ o->extent_map.update(txn, true);
+ if (o->extent_map.needs_reshard()) {
+ dout(20) << __func__ << " warning: still wants reshard, check options?"
+ << dendl;
+ o->extent_map.clear_needs_reshard();
+ }
+ logger->inc(l_bluestore_onode_reshard);
+ }
+
+ // bound encode
+ size_t bound = 0;
+ denc(o->onode, bound);
+ o->extent_map.bound_encode_spanning_blobs(bound);
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, bound);
+ }
+
+ // encode
+ bufferlist bl;
+ unsigned onode_part, blob_part, extent_part;
+ {
+ auto p = bl.get_contiguous_appender(bound, true);
+ denc(o->onode, p);
+ onode_part = p.get_logical_offset();
+ o->extent_map.encode_spanning_blobs(p);
+ blob_part = p.get_logical_offset() - onode_part;
+ if (o->onode.extent_map_shards.empty()) {
+ denc(o->extent_map.inline_bl, p);
+ }
+ extent_part = p.get_logical_offset() - onode_part - blob_part;
+ }
+
+ dout(20) << __func__ << " onode " << o->oid << " is " << bl.length()
+ << " (" << onode_part << " bytes onode + "
+ << blob_part << " bytes spanning blobs + "
+ << extent_part << " bytes inline extents)"
+ << dendl;
+
+
+ txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+}
+
// ===========================================
+// BlueStoreRepairer
+
+size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
+ const interval_set<uint64_t>& extents)
+{
+ assert(granularity); // initialized
+ // can't call for the second time
+ assert(!was_filtered_out);
+ assert(collections_bfs.size() == objects_bfs.size());
+
+ size_t prev_pos = 0;
+ size_t npos = collections_bfs.size();
+
+ bloom_vector collections_reduced;
+ bloom_vector objects_reduced;
+
+ for (auto e : extents) {
+ if (e.second == 0) {
+ continue;
+ }
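+    // move every non-empty filter pair overlapping a broken extent into the
+    // reduced vectors; 'prev_pos' prevents revisiting filters already taken
+    // by a previous overlapping extent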
+ size_t pos = max(e.first / granularity, prev_pos);
+ size_t end_pos = 1 + (e.first + e.second - 1) / granularity;
+ while (pos != npos && pos < end_pos) {
+ assert( collections_bfs[pos].element_count() ==
+ objects_bfs[pos].element_count());
+ if (collections_bfs[pos].element_count()) {
+ collections_reduced.push_back(std::move(collections_bfs[pos]));
+ objects_reduced.push_back(std::move(objects_bfs[pos]));
+ }
+ ++pos;
+ }
+ prev_pos = end_pos;
+ }
+ collections_reduced.swap(collections_bfs);
+ objects_reduced.swap(objects_bfs);
+ was_filtered_out = true;
+ return collections_bfs.size();
+}
+
+bool BlueStoreRepairer::remove_key(KeyValueDB *db,
+ const string& prefix,
+ const string& key)
+{
+ if (!remove_key_txn) {
+ remove_key_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ remove_key_txn->rmkey(prefix, key);
+
+ return true;
+}
+
+bool BlueStoreRepairer::fix_shared_blob(
+ KeyValueDB *db,
+ uint64_t sbid,
+ const bufferlist* bl)
+{
+ KeyValueDB::Transaction txn;
+ if (fix_misreferences_txn) { // reuse this txn
+ txn = fix_misreferences_txn;
+ } else {
+ if (!fix_shared_blob_txn) {
+ fix_shared_blob_txn = db->get_transaction();
+ }
+ txn = fix_shared_blob_txn;
+ }
+ string key;
+ get_shared_blob_key(sbid, &key);
+
+ ++to_repair_cnt;
+ if (bl) {
+ txn->set(PREFIX_SHARED_BLOB, key, *bl);
+ } else {
+ txn->rmkey(PREFIX_SHARED_BLOB, key);
+ }
+ return true;
+}
+
+bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
+ const store_statfs_t& new_statfs)
+{
+ if (!fix_statfs_txn) {
+ fix_statfs_txn = db->get_transaction();
+ }
+ BlueStore::volatile_statfs vstatfs;
+ vstatfs = new_statfs;
+ bufferlist bl;
+ vstatfs.encode(bl);
+ ++to_repair_cnt;
+ fix_statfs_txn->set(PREFIX_STAT, "bluestore_statfs", bl);
+ return true;
+}
+
+bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ if (!fix_fm_leaked_txn) {
+ fix_fm_leaked_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->release(offset, len, fix_fm_leaked_txn);
+ return true;
+}
+bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len)
+{
+ if (!fix_fm_false_free_txn) {
+ fix_fm_false_free_txn = db->get_transaction();
+ }
+ ++to_repair_cnt;
+ fm->allocate(offset, len, fix_fm_false_free_txn);
+ return true;
+}
+
+bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
+{
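+  // called between the detection and commit stages: if any misreferenced
+  // extents were noted, shrink the space usage tracker to the affected
+  // regions and open the transaction the misreference repair pass will fill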
+ if (misreferenced_extents.size()) {
+ size_t n = space_usage_tracker.filter_out(misreferenced_extents);
+ assert(n > 0);
+ if (!fix_misreferences_txn) {
+ fix_misreferences_txn = db->get_transaction();
+ }
+ return true;
+ }
+ return false;
+}
+
+unsigned BlueStoreRepairer::apply(KeyValueDB* db)
+{
+ if (fix_fm_leaked_txn) {
+ db->submit_transaction_sync(fix_fm_leaked_txn);
+ fix_fm_leaked_txn = nullptr;
+ }
+ if (fix_fm_false_free_txn) {
+ db->submit_transaction_sync(fix_fm_false_free_txn);
+ fix_fm_false_free_txn = nullptr;
+ }
+ if (remove_key_txn) {
+ db->submit_transaction_sync(remove_key_txn);
+ remove_key_txn = nullptr;
+ }
+ if (fix_misreferences_txn) {
+ db->submit_transaction_sync(fix_misreferences_txn);
+ fix_misreferences_txn = nullptr;
+ }
+ if (fix_shared_blob_txn) {
+ db->submit_transaction_sync(fix_shared_blob_txn);
+ fix_shared_blob_txn = nullptr;
+ }
+
+ if (fix_statfs_txn) {
+ db->submit_transaction_sync(fix_statfs_txn);
+ fix_statfs_txn = nullptr;
+ }
+ unsigned repaired = to_repair_cnt;
+ to_repair_cnt = 0;
+ return repaired;
+}
+
+// =======================================================
+
#include "include/unordered_map.h"
#include "include/memory.h"
#include "include/mempool.h"
+#include "common/bloom_filter.hpp"
#include "common/Finisher.h"
#include "common/perf_counters.h"
#include "compressor/Compressor.h"
class Allocator;
class FreelistManager;
class BlueFS;
+class BlueStoreRepairer;
//#define DEBUG_CACHE
//#define DEBUG_DEFERRED
int64_t& compressed_allocated() {
return values[STATFS_COMPRESSED_ALLOCATED];
}
+ volatile_statfs& operator=(const store_statfs_t& st) {
+ values[STATFS_ALLOCATED] = st.allocated;
+ values[STATFS_STORED] = st.stored;
+ values[STATFS_COMPRESSED_ORIGINAL] = st.compressed_original;
+ values[STATFS_COMPRESSED] = st.compressed;
+ values[STATFS_COMPRESSED_ALLOCATED] = st.compressed_allocated;
+ return *this;
+ }
bool is_empty() {
return values[STATFS_ALLOCATED] == 0 &&
values[STATFS_STORED] == 0 &&
private:
int _fsck_check_extents(
+ const coll_t& cid,
const ghobject_t& oid,
const PExtentVector& extents,
bool compressed,
mempool_dynamic_bitset &used_blocks,
uint64_t granularity,
+ BlueStoreRepairer* repairer,
store_statfs_t& expected_statfs);
void _buffer_cache_write(
uint64_t tail_pad,
bufferlist& padded);
+ void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
+
// -- ondisk version ---
public:
const int32_t latest_ondisk_format = 2; ///< our version
RWLock::WLocker l(debug_read_error_lock);
debug_mdata_error_objects.insert(o);
}
+
+ /// methods to inject various errors fsck can repair
+ void inject_broken_shared_blob_key(const string& key,
+ const bufferlist& bl);
+ void inject_leaked(uint64_t len);
+ void inject_false_free(coll_t cid, ghobject_t oid);
+ void inject_statfs(const store_statfs_t& new_statfs);
+ void inject_misreference(coll_t cid1, ghobject_t oid1,
+ coll_t cid2, ghobject_t oid2,
+ uint64_t offset);
+
void compact() override {
assert(db);
db->compact();
o->put();
}
+class BlueStoreRepairer
+{
+public:
+ // to simplify future potential migration to mempools
+ using fsck_interval = interval_set<uint64_t>;
+
+  // Structure to track which pextents are used by a specific cid/oid.
+  // As with a Bloom filter, only positive and false-positive matches
+  // are possible.
+  // Maintains two lists of Bloom filters, one for cids and one for oids,
+  // where each list entry is a filter covering a specific disk pextent range.
+  // The extent length covered by a single filter is determined at init().
+  // Allows filtering out 'uninteresting' pextents to speed up subsequent
+  // 'is_used' checks.
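+  //
+  // A minimal usage sketch (names and values are illustrative; this mirrors
+  // how _fsck() and preprocess_misreference() use the tracker):
+  //
+  //   StoreSpaceTracker tracker;
+  //   tracker.init(bdev_size, min_alloc_size);     // one filter pair per 'granularity' bytes
+  //   tracker.set_used(offset, length, cid, oid);  // record which cid/oid touch that pextent
+  //   tracker.filter_out(misreferenced_extents);   // keep filters for broken regions only
+  //   bool maybe_used = tracker.is_used(oid);      // false positives possible, false negatives not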
+ struct StoreSpaceTracker {
+ const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
+ const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
+    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
+ static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
+
+ typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
+ bloom_vector collections_bfs;
+ bloom_vector objects_bfs;
+
+ bool was_filtered_out = false;
+ uint64_t granularity = 0; // extent length for a single filter
+
+ StoreSpaceTracker() {
+ }
+ StoreSpaceTracker(const StoreSpaceTracker& from) :
+ collections_bfs(from.collections_bfs),
+ objects_bfs(from.objects_bfs),
+ granularity(from.granularity) {
+ }
+
+ void init(uint64_t total,
+ uint64_t min_alloc_size,
+ uint64_t mem_cap = DEF_MEM_CAP) {
+ assert(!granularity); // not initialized yet
+ assert(min_alloc_size && isp2(min_alloc_size));
+ assert(mem_cap);
+
+ total = ROUND_UP_TO(total, min_alloc_size);
+ granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
+
+ if (!granularity) {
+ granularity = min_alloc_size;
+ } else {
+ granularity = ROUND_UP_TO(granularity, min_alloc_size);
+ }
+
+ uint64_t entries = P2ROUNDUP(total, granularity) / granularity;
+ collections_bfs.resize(entries,
+ bloom_filter(BLOOM_FILTER_SALT_COUNT,
+ BLOOM_FILTER_TABLE_SIZE,
+ 0,
+ BLOOM_FILTER_EXPECTED_COUNT));
+ objects_bfs.resize(entries,
+ bloom_filter(BLOOM_FILTER_SALT_COUNT,
+ BLOOM_FILTER_TABLE_SIZE,
+ 0,
+ BLOOM_FILTER_EXPECTED_COUNT));
+ }
+ inline uint32_t get_hash(const coll_t& cid) const {
+ return cid.hash_to_shard(1);
+ }
+ inline void set_used(uint64_t offset, uint64_t len,
+ const coll_t& cid, const ghobject_t& oid) {
+ assert(granularity); // initialized
+
+      // can't call this func after filter_out has been applied
+ assert(!was_filtered_out);
+ if (!len) {
+ return;
+ }
+ auto pos = offset / granularity;
+ auto end_pos = (offset + len - 1) / granularity;
+ while (pos <= end_pos) {
+ collections_bfs[pos].insert(get_hash(cid));
+ objects_bfs[pos].insert(oid.hobj.get_hash());
+ ++pos;
+ }
+ }
+    // filter out entries unrelated to the specified (broken) extents;
+    // only the single-argument 'is_used' calls are permitted after that
+ size_t filter_out(const fsck_interval& extents);
+
+ // determines if collection's present after filtering-out
+ inline bool is_used(const coll_t& cid) const {
+ assert(was_filtered_out);
+ for(auto& bf : collections_bfs) {
+ if (bf.contains(get_hash(cid))) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // determines if object's present after filtering-out
+ inline bool is_used(const ghobject_t& oid) const {
+ assert(was_filtered_out);
+ for(auto& bf : objects_bfs) {
+ if (bf.contains(oid.hobj.get_hash())) {
+ return true;
+ }
+ }
+ return false;
+ }
+ // determines if collection's present before filtering-out
+ inline bool is_used(const coll_t& cid, uint64_t offs) const {
+ assert(granularity); // initialized
+ assert(!was_filtered_out);
+ auto &bf = collections_bfs[offs / granularity];
+ if (bf.contains(get_hash(cid))) {
+ return true;
+ }
+ return false;
+ }
+ // determines if object's present before filtering-out
+ inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
+ assert(granularity); // initialized
+ assert(!was_filtered_out);
+ auto &bf = objects_bfs[offs / granularity];
+ if (bf.contains(oid.hobj.get_hash())) {
+ return true;
+ }
+ return false;
+ }
+ };
+public:
+
+ bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
+ bool fix_shared_blob(KeyValueDB *db,
+ uint64_t sbid,
+ const bufferlist* bl);
+ bool fix_statfs(KeyValueDB *db, const store_statfs_t& new_statfs);
+
+ bool fix_leaked(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len);
+ bool fix_false_free(KeyValueDB *db,
+ FreelistManager* fm,
+ uint64_t offset, uint64_t len);
+
+ void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
+
+ bool preprocess_misreference(KeyValueDB *db);
+
+ unsigned apply(KeyValueDB* db);
+
+ void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
+ misreferenced_extents.insert(offs, len);
+ if (inc_error) {
+ ++to_repair_cnt;
+ }
+ }
+
+ StoreSpaceTracker& get_space_usage_tracker() {
+ return space_usage_tracker;
+ }
+ const fsck_interval& get_misreferences() const {
+ return misreferenced_extents;
+ }
+ KeyValueDB::Transaction get_fix_misreferences_txn() {
+ return fix_misreferences_txn;
+ }
+
+private:
+ unsigned to_repair_cnt = 0;
+ KeyValueDB::Transaction fix_fm_leaked_txn;
+ KeyValueDB::Transaction fix_fm_false_free_txn;
+ KeyValueDB::Transaction remove_key_txn;
+ KeyValueDB::Transaction fix_statfs_txn;
+ KeyValueDB::Transaction fix_shared_blob_txn;
+
+ KeyValueDB::Transaction fix_misreferences_txn;
+
+ StoreSpaceTracker space_usage_tracker;
+
+ // non-shared extents with multiple references
+ fsck_interval misreferenced_extents;
+
+};
+
#endif
g_conf->apply_changes(NULL);
}
+TEST_P(StoreTest, BluestoreRepairTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+ const size_t offs_base = 65536 / 2;
+
+ g_ceph_context->_conf->set_val("bluestore_fsck_on_mount", "false");
+ g_ceph_context->_conf->set_val("bluestore_fsck_on_umount", "false");
+ g_ceph_context->_conf->set_val("bluestore_max_blob_size", stringify(2 * offs_base));
+ g_ceph_context->_conf->set_val("bluestore_extent_map_shard_max_size", "12000");
+ g_ceph_context->_conf->apply_changes(NULL);
+
+ BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+ // fill the store with some data
+ coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+
+ ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+ ghobject_t hoid_dup(hobject_t(sobject_t("Object 1(dup)", CEPH_NOSNAP)));
+ ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+ ghobject_t hoid_cloned = hoid2;
+ hoid_cloned.hobj.snap = 1;
+ ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP)));
+ ghobject_t hoid3_cloned = hoid3;
+ hoid3_cloned.hobj.snap = 1;
+ bufferlist bl;
+ bl.append("1234512345");
+ int r;
+ const size_t repeats = 16;
+ {
+ cerr << "create collection + write" << std::endl;
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid, i * offs_base, bl.length(), bl);
+ t.write(cid, hoid_dup, i * offs_base, bl.length(), bl);
+ }
+ for( auto i = 0ul; i < repeats; ++i ) {
+ t.write(cid, hoid2, i * offs_base, bl.length(), bl);
+ }
+ t.clone(cid, hoid2, hoid_cloned);
+
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ bstore->umount();
+ //////////// leaked pextent fix ////////////
+ cerr << "fix leaked pextents" << std::endl;
+ ASSERT_EQ(bstore->fsck(false), 0);
+ ASSERT_EQ(bstore->repair(false), 0);
+ bstore->mount();
+ bstore->inject_leaked(0x30000);
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 1);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ //////////// false free fix ////////////
+ cerr << "fix false free pextents" << std::endl;
+ bstore->mount();
+ bstore->inject_false_free(cid, hoid);
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 2);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ //////////// verify invalid statfs ///////////
+ cerr << "fix invalid statfs" << std::endl;
+ store_statfs_t statfs0, statfs;
+ bstore->mount();
+ ASSERT_EQ(bstore->statfs(&statfs0), 0);
+ statfs = statfs0;
+ statfs.allocated += 0x10000;
+ statfs.stored += 0x10000;
+ ASSERT_FALSE(statfs0 == statfs);
+ bstore->inject_statfs(statfs);
+ bstore->umount();
+
+ ASSERT_EQ(bstore->fsck(false), 1);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+ ASSERT_EQ(bstore->mount(), 0);
+ ASSERT_EQ(bstore->statfs(&statfs), 0);
+ // adjust free space to success in comparison
+ statfs0.available = statfs.available;
+ ASSERT_EQ(statfs0, statfs);
+
+ ///////// undecodable shared blob key / stray shared blob records ///////
+ cerr << "undecodable shared blob key" << std::endl;
+ bstore->inject_broken_shared_blob_key("undec1",
+ bufferlist());
+ bstore->inject_broken_shared_blob_key("undecodable key 2",
+ bufferlist());
+ bstore->inject_broken_shared_blob_key("undecodable key 3",
+ bufferlist());
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 3);
+ ASSERT_EQ(bstore->repair(false), 0);
+ ASSERT_EQ(bstore->fsck(false), 0);
+
+ cerr << "misreferencing" << std::endl;
+ bstore->mount();
+ bstore->inject_misreference(cid, hoid, cid, hoid_dup, 0);
+ bstore->inject_misreference(cid, hoid, cid, hoid_dup, (offs_base * repeats) / 2);
+  bstore->inject_misreference(cid, hoid, cid, hoid_dup, offs_base * (repeats - 1));
+
+ bstore->umount();
+ ASSERT_EQ(bstore->fsck(false), 6);
+ ASSERT_EQ(bstore->repair(false), 0);
+
+ ASSERT_EQ(bstore->fsck(true), 0);
+
+ cerr << "Completing" << std::endl;
+ bstore->mount();
+ g_ceph_context->_conf->set_val("bluestore_fsck_on_mount", "true");
+ g_ceph_context->_conf->set_val("bluestore_fsck_on_umount", "true");
+ g_ceph_context->_conf->set_val("bluestore_max_alloc_size", "0");
+ g_ceph_context->_conf->set_val("bluestore_extent_map_shard_max_size", "1200");
+
+ g_ceph_context->_conf->apply_changes(NULL);
+}
+
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);