From ec5ba4e8cf90c70b3468bb7c3c67568ccb08b2e2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Dec 2016 17:04:26 -0500 Subject: [PATCH] os/bluestore: preserve source collection cache during split OSD split transactions look something like: mkcoll new; split old; ...; omap_rmkey_range old; omap_setkeys old; omap_setkeys new. The last part splits the log into two pieces. The problem is that the rmkey_range needs to wait on old omap transactions to flush, and those are linked to the old onode, and split clears the cache. The result is that we don't wait, rmkey_range leaves some recent pg log keys behind, and on OSD restart we get an error because the object doesn't belong to the (old) collection. Fix this by preserving objects in the old collection and only clearing out objects that are moving to the newly split collections. The preserved objects include the pgmeta object that we care about. (Note that we are one step closer to preserving the cache contents across the split, but not quite there yet: at this point we don't have all of the destination collections. A change in the ObjectStore interface is probably needed to make that not be extremely awkward.) 
Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 43 +++++++++++++++++++++++++++-------- src/os/bluestore/BlueStore.h | 1 + 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 2bcd2163e25..f04545c593f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1218,6 +1218,34 @@ void BlueStore::OnodeSpace::clear() onode_map.clear(); } +void BlueStore::OnodeSpace::clear_pre_split(SharedBlobSet& sbset, + uint32_t ps, int bits) +{ + std::lock_guard l(cache->lock); + dout(10) << __func__ << dendl; + + auto p = onode_map.begin(); + while (p != onode_map.end()) { + if (p->second->oid.match(bits, ps)) { + // this onode stays in the collection post-split + ++p; + } else { + // We have an awkward race here: previous pipelined transactions may + // still reference blobs and their shared_blobs. They will be flushed + // shortly by _osr_reap_done, but it's awkward to block for that (and + // a waste of time). Instead, explicitly remove them from the shared blob + // map. + for (auto& e : p->second->extent_map.extent_map) { + if (e.blob->get_blob().is_shared()) { + sbset.remove(e.blob->shared_blob.get()); + } + } + cache->_rm_onode(p->second); + p = onode_map.erase(p); + } + } +} + bool BlueStore::OnodeSpace::empty() { std::lock_guard l(cache->lock); @@ -8943,15 +8971,12 @@ int BlueStore::_split_collection(TransContext *txc, RWLock::WLocker l2(d->lock); int r; - // blow away src cache - c->onode_map.clear(); - - // We have an awkward race here: previous pipelinex transactions may - // still reference blobs and their shared_blobs. They will be flushed - // shortly by _osr_reap_done, but it's awkward to block for that (and - // a waste of time). Instead, explicitly remove them from the shared blob - // map. - c->shared_blob_set.violently_clear(); + // drop any cached items (onodes and referenced shared blobs) that will + // not belong to this collection post-split. 
+ spg_t pgid; + bool is_pg = c->cid.is_pg(&pgid); + assert(is_pg); + c->onode_map.clear_pre_split(c->shared_blob_set, pgid.ps(), bits); // the destination should be empty. assert(d->onode_map.empty()); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index b92777f5cd2..baf061b9a6a 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1077,6 +1077,7 @@ public: const ghobject_t& new_oid, const string& new_okey); void clear(); + void clear_pre_split(SharedBlobSet& sbset, uint32_t ps, int bits); bool empty(); /// return true if f true for any item -- 2.39.5