From: Sage Weil Date: Fri, 17 Mar 2017 14:13:22 +0000 (-0400) Subject: os/bluestore: move cached items around on collection split X-Git-Tag: v12.0.1~12^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9732b6c8e90965bd6a9d3a0eed899ba0c68603e1;p=ceph.git os/bluestore: move cached items around on collection split We've been avoiding doing this for a while and it has finally caught up with us: the SharedBlob may outlive the split due to deferred IO, and a read on the child collection may load a competing Blob and SharedBlob and read from the on-disk blocks that haven't been written yet. Fix by preserving the one-SharedBlob-instance invariant by moving cache items to the new Collection and cache shard like we should have from the beginning. Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index bd3e215cf5e8..50deb1c6a175 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1001,6 +1001,34 @@ void BlueStore::TwoQCache::_rm_buffer(Buffer *b) } } +void BlueStore::TwoQCache::_move_buffer(Cache *srcc, Buffer *b) +{ + TwoQCache *src = static_cast(srcc); + src->_rm_buffer(b); + + // preserve which list we're on (even if we can't preserve the order!) + switch (b->cache_private) { + case BUFFER_WARM_IN: + assert(!b->is_empty()); + buffer_warm_in.push_back(*b); + break; + case BUFFER_WARM_OUT: + assert(b->is_empty()); + buffer_warm_out.push_back(*b); + break; + case BUFFER_HOT: + assert(!b->is_empty()); + buffer_hot.push_back(*b); + break; + default: + assert(0 == "bad cache_private"); + } + if (!b->is_empty()) { + buffer_bytes += b->length; + buffer_list_bytes[b->cache_private] += b->length; + } +} + void BlueStore::TwoQCache::_adjust_buffer_size(Buffer *b, int64_t delta) { dout(20) << __func__ << " delta " << delta << " on " << *b << dendl; @@ -1089,8 +1117,8 @@ void BlueStore::TwoQCache::_trim(uint64_t onode_max, uint64_t buffer_max) } Buffer *b = &*p; - assert(b->is_clean()); dout(20) << __func__ << " buffer_hot rm " << *b << dendl; + assert(b->is_clean()); // adjust evict size before buffer goes invalid to_evict_bytes -= b->length; evicted += b->length; @@ -1356,16 +1384,17 @@ void BlueStore::BufferSpace::finish_write(Cache* cache, uint64_t seq) } Buffer *b = &*i; - ldout(cache->cct, 20) << __func__ << " " << *b << dendl; assert(b->is_writing()); if (b->flags & Buffer::FLAG_NOCACHE) { writing.erase(i++); + ldout(cache->cct, 20) << __func__ << " discard " << *b << dendl; buffer_map.erase(b->offset); } else { b->state = Buffer::STATE_CLEAN; writing.erase(i++); cache->_add_buffer(b, 1, nullptr); + ldout(cache->cct, 20) << __func__ << " added " << *b << dendl; } } @@ -1470,34 +1499,6 @@ void BlueStore::OnodeSpace::clear() onode_map.clear(); } -void BlueStore::OnodeSpace::clear_pre_split(SharedBlobSet& sbset, - uint32_t ps, int bits) -{ - std::lock_guard l(cache->lock); - ldout(cache->cct, 10) << __func__ << dendl; - - auto p = onode_map.begin(); - while (p != onode_map.end()) { - if (p->second->oid.match(bits, ps)) { - // this onode stays in the collection post-split - ++p; - } else { - // We have an awkward race here: previous pipelined transactions may - // still reference blobs and their shared_blobs. They will be flushed - // shortly by _osr_reap_done, but it's awkward to block for that (and - // a waste of time). Instead, explicitly remove them from the shared blob - // map. - for (auto& e : p->second->extent_map.extent_map) { - if (e.blob->get_blob().is_shared()) { - sbset.remove(e.blob->shared_blob.get()); - } - } - cache->_rm_onode(p->second); - p = onode_map.erase(p); - } - } -} - bool BlueStore::OnodeSpace::empty() { std::lock_guard l(cache->lock); @@ -3059,6 +3060,78 @@ BlueStore::OnodeRef BlueStore::Collection::get_onode( return onode_map.add(oid, o); } +void BlueStore::Collection::split_cache( + Collection *dest) +{ + ldout(store->cct, 10) << __func__ << " to " << dest << dendl; + + // lock (one or both) cache shards + std::lock(cache->lock, dest->cache->lock); + std::lock_guard l(cache->lock, std::adopt_lock); + std::lock_guard l2(dest->cache->lock, std::adopt_lock); + + int destbits = dest->cnode.bits; + spg_t destpg; + bool is_pg = dest->cid.is_pg(&destpg); + assert(is_pg); + + auto p = onode_map.onode_map.begin(); + while (p != onode_map.onode_map.end()) { + if (!p->second->oid.match(destbits, destpg.pgid.ps())) { + // onode does not belong to this child + ++p; + } else { + OnodeRef o = p->second; + ldout(store->cct, 20) << __func__ << " moving " << o << " " << o->oid + << dendl; + + cache->_rm_onode(p->second); + p = onode_map.onode_map.erase(p); + + o->c = dest; + dest->cache->_add_onode(o, 1); + dest->onode_map.onode_map[o->oid] = o; + dest->onode_map.cache = dest->cache; + + // move over shared blobs and buffers. cover shared blobs from + // both extent map and spanning blob map (the full extent map + // may not be faulted in) + vector sbvec; + for (auto& e : o->extent_map.extent_map) { + sbvec.push_back(e.blob->shared_blob.get()); + } + for (auto& b : o->extent_map.spanning_blob_map) { + sbvec.push_back(b.second->shared_blob.get()); + } + for (auto sb : sbvec) { + if (sb->coll == dest) { + ldout(store->cct, 20) << __func__ << " already moved " << *sb + << dendl; + continue; + } + ldout(store->cct, 20) << __func__ << " moving " << *sb << dendl; + sb->coll = dest; + if (dest->cache != cache) { + if (sb->get_sbid()) { + ldout(store->cct, 20) << __func__ << " moving registration " << *sb << dendl; + shared_blob_set.remove(sb); + dest->shared_blob_set.add(dest, sb); + } + for (auto& i : sb->bc.buffer_map) { + if (!i.second->is_writing()) { + ldout(store->cct, 20) << __func__ << " moving " << *i.second + << dendl; + dest->cache->_move_buffer(cache, i.second.get()); + } + } + } + } + + + } + } +} + void BlueStore::Collection::trim_cache() { // see if mempool stats have updated @@ -10063,18 +10136,27 @@ int BlueStore::_split_collection(TransContext *txc, RWLock::WLocker l2(d->lock); int r; - // drop any cached items (onodes and referenced shared blobs) that will - // not belong to this collection post-split. - spg_t pgid; + // move any cached items (onodes and referenced shared blobs) that will + // belong to the child collection post-split. leave everything else behind. + // this may include things that don't strictly belong to the now-smaller + // parent split, but the OSD will always send us a split for every new + // child. + + spg_t pgid, dest_pgid; bool is_pg = c->cid.is_pg(&pgid); assert(is_pg); - c->onode_map.clear_pre_split(c->shared_blob_set, pgid.ps(), bits); + is_pg = d->cid.is_pg(&dest_pgid); + assert(is_pg); - // the destination should be empty. + // the destination should initially be empty. assert(d->onode_map.empty()); assert(d->shared_blob_set.empty()); + assert(d->cnode.bits == bits); + + c->split_cache(d.get()); - // adjust bits + // adjust bits. note that this will be redundant for all but the first + // split call for this parent (first child). c->cnode.bits = bits; assert(d->cnode.bits == bits); r = 0; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 87d436a18f66..11780d87eaa7 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -993,6 +993,7 @@ public: virtual void _add_buffer(Buffer *b, int level, Buffer *near) = 0; virtual void _rm_buffer(Buffer *b) = 0; + virtual void _move_buffer(Cache *src, Buffer *b) = 0; virtual void _adjust_buffer_size(Buffer *b, int64_t delta) = 0; virtual void _touch_buffer(Buffer *b) = 0; @@ -1092,6 +1093,10 @@ public: auto q = buffer_lru.iterator_to(*b); buffer_lru.erase(q); } + void _move_buffer(Cache *src, Buffer *b) override { + src->_rm_buffer(b); + _add_buffer(b, 0, nullptr); + } void _adjust_buffer_size(Buffer *b, int64_t delta) override { assert((int64_t)buffer_size + delta >= 0); buffer_size += delta; @@ -1178,6 +1183,7 @@ public: } void _add_buffer(Buffer *b, int level, Buffer *near) override; void _rm_buffer(Buffer *b) override; + void _move_buffer(Cache *src, Buffer *b) override; void _adjust_buffer_size(Buffer *b, int64_t delta) override; void _touch_buffer(Buffer *b) override { switch (b->cache_private) { @@ -1223,6 +1229,8 @@ public: /// forward lookups mempool::bluestore_meta_other::unordered_map onode_map; + friend class Collection; // for split_cache() + public: OnodeSpace(Cache *c) : cache(c) {} ~OnodeSpace() { @@ -1238,7 +1246,6 @@ public: const ghobject_t& new_oid, const mempool::bluestore_meta_other::string& new_okey); void clear(); - void clear_pre_split(SharedBlobSet& sbset, uint32_t ps, int bits); bool empty(); /// return true if f true for any item @@ -1301,6 +1308,7 @@ public: return false; } + void split_cache(Collection *dest); void trim_cache(); Collection(BlueStore *ns, Cache *ca, coll_t c); @@ -1886,7 +1894,8 @@ private: uint64_t offset, bufferlist& bl, unsigned flags) { - b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl, flags); + b->shared_blob->bc.write(b->shared_blob->get_cache(), txc->seq, offset, bl, + flags); txc->shared_blobs_written.insert(b->shared_blob); } diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 34ab2949d2fc..abecf91bc92e 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -4974,7 +4974,8 @@ void colsplittest( CEPH_NOSNAP, i<