From: Sage Weil <sage@redhat.com> Date: Mon, 15 May 2017 14:07:49 +0000 (-0400) Subject: os/bluestore: try to unshare blobs from gen objects X-Git-Tag: ses5-milestone6~8^2~8^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F14239%2Fhead;p=ceph.git os/bluestore: try to unshare blobs from gen objects OSD EC write pattern clones a to-be-written range to a gen object and then deletes it shortly after. Try to unshare the original blob in the nogen object if it is in the cache to reduce the shared blob tracking overhead and copy-on-write behavior. This doesn't make our write pattern perfect, by any means, since a small overwrite will generally - clone the range to the gen object - write the same range, creating a new blob on the nogen object - remove the clone, and unshare the original blob This doesn't fix that the overwrite was written somewhere else and we probably have two competing blobs. However, future small writes will find the alternate blob in _do_small_write and reuse the same allocation, which is something. Signed-off-by: Sage Weil <sage@redhat.com> --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 1983dbb1a112..e613dd0b402a 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1638,10 +1638,15 @@ void BlueStore::SharedBlob::get_ref(uint64_t offset, uint32_t length) } void BlueStore::SharedBlob::put_ref(uint64_t offset, uint32_t length, - PExtentVector *r) + PExtentVector *r, + set<SharedBlob*> *maybe_unshared) { assert(persistent); - persistent->ref_map.put(offset, length, r); + bool maybe = false; + persistent->ref_map.put(offset, length, r, maybe_unshared ? 
&maybe : nullptr); + if (maybe_unshared && maybe) { + maybe_unshared->insert(this); + } } // Blob @@ -3074,12 +3079,11 @@ void BlueStore::Collection::load_shared_blob(SharedBlobRef sb) void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b) { - assert(!b->shared_blob->is_loaded()); - ldout(store->cct, 10) << __func__ << " " << *b << dendl; - bluestore_blob_t& blob = b->dirty_blob(); + assert(!b->shared_blob->is_loaded()); // update blob + bluestore_blob_t& blob = b->dirty_blob(); blob.set_flag(bluestore_blob_t::FLAG_SHARED); // update shared blob @@ -3096,6 +3100,20 @@ void BlueStore::Collection::make_blob_shared(uint64_t sbid, BlobRef b) ldout(store->cct, 20) << __func__ << " now " << *b << dendl; } +uint64_t BlueStore::Collection::make_blob_unshared(SharedBlob *sb) +{ + ldout(store->cct, 10) << __func__ << " " << *sb << dendl; + assert(sb->is_loaded()); + + uint64_t sbid = sb->get_sbid(); + shared_blob_set.remove(sb); + sb->loaded = false; + delete sb->persistent; + sb->sbid_unloaded = 0; + ldout(store->cct, 20) << __func__ << " now " << *sb << dendl; + return sbid; +} + BlueStore::OnodeRef BlueStore::Collection::get_onode( const ghobject_t& oid, bool create) @@ -9742,7 +9760,8 @@ void BlueStore::_wctx_finish( TransContext *txc, CollectionRef& c, OnodeRef o, - WriteContext *wctx) + WriteContext *wctx, + set<SharedBlob*> *maybe_unshared_blobs) { auto oep = wctx->old_extents.begin(); while (oep != wctx->old_extents.end()) { @@ -9765,7 +9784,9 @@ PExtentVector final; c->load_shared_blob(b->shared_blob); for (auto e : r) { - b->shared_blob->put_ref(e.offset, e.length, &final); + b->shared_blob->put_ref( + e.offset, e.length, &final, + b->is_referenced() ? 
nullptr : maybe_unshared_blobs); } dout(20) << __func__ << " shared_blob release " << final << " from " << *b->shared_blob << dendl; @@ -10102,7 +10123,8 @@ int BlueStore::_do_zero(TransContext *txc, } void BlueStore::_do_truncate( - TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset) + TransContext *txc, CollectionRef& c, OnodeRef o, uint64_t offset, + set<SharedBlob*> *maybe_unshared_blobs) { dout(15) << __func__ << " " << c->cid << " " << o->oid << " 0x" << std::hex << offset << std::dec << dendl; @@ -10110,15 +10132,15 @@ void BlueStore::_do_truncate( _dump_onode(o, 30); if (offset == o->onode.size) - return ; + return; if (offset < o->onode.size) { WriteContext wctx; uint64_t length = o->onode.size - offset; o->extent_map.fault_range(db, offset, length); o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); - o->extent_map.dirty_range(txc->t, offset, length); - _wctx_finish(txc, c, o, &wctx); + o->extent_map.dirty_range(offset, length); + _wctx_finish(txc, c, o, &wctx, maybe_unshared_blobs); // if we have shards past EOF, ask for a reshard if (!o->onode.extent_map_shards.empty() && @@ -10153,7 +10175,8 @@ int BlueStore::_do_remove( CollectionRef& c, OnodeRef o) { - _do_truncate(txc, c, o, 0); + set<SharedBlob*> maybe_unshared_blobs; + _do_truncate(txc, c, o, 0, &maybe_unshared_blobs); if (o->onode.has_omap()) { o->flush(); _do_omap_clear(txc, o->onode.nid); @@ -10174,6 +10197,72 @@ int BlueStore::_do_remove( o->extent_map.clear(); o->onode = bluestore_onode_t(); _debug_obj_on_delete(o->oid); + + if (!o->oid.is_no_gen() && + !maybe_unshared_blobs.empty()) { + // see if we can unshare blobs still referenced by the head + dout(10) << __func__ << " gen and maybe_unshared_blobs " + << maybe_unshared_blobs << dendl; + ghobject_t nogen = o->oid; + nogen.generation = ghobject_t::NO_GEN; + OnodeRef h = c->onode_map.lookup(nogen); + if (h && h->exists) { + dout(20) << __func__ << " checking for unshareable blobs on " << h + << " " << h->oid << dendl; + map<SharedBlob*,bluestore_extent_ref_map_t> expect; + for 
(auto& e : h->extent_map.extent_map) { + const bluestore_blob_t& b = e.blob->get_blob(); + SharedBlob *sb = e.blob->shared_blob.get(); + if (b.is_shared() && + sb->loaded && + maybe_unshared_blobs.count(sb)) { + b.map(e.blob_offset, e.length, [&](uint64_t off, uint64_t len) { + expect[sb].get(off, len); + return 0; + }); + } + } + vector<SharedBlob*> unshared_blobs; + unshared_blobs.reserve(maybe_unshared_blobs.size()); + for (auto& p : expect) { + dout(20) << " ? " << *p.first << " vs " << p.second << dendl; + if (p.first->persistent->ref_map == p.second) { + SharedBlob *sb = p.first; + dout(20) << __func__ << " unsharing " << *sb << dendl; + unshared_blobs.push_back(sb); + txc->unshare_blob(sb); + uint64_t sbid = c->make_blob_unshared(sb); + string key; + get_shared_blob_key(sbid, &key); + txc->t->rmkey(PREFIX_SHARED_BLOB, key); + } + } + + uint32_t b_start = OBJECT_MAX_SIZE; + uint32_t b_end = 0; + for (auto& e : h->extent_map.extent_map) { + const bluestore_blob_t& b = e.blob->get_blob(); + SharedBlob *sb = e.blob->shared_blob.get(); + if (b.is_shared() && + std::find(unshared_blobs.begin(), unshared_blobs.end(), + sb) != unshared_blobs.end()) { + dout(20) << __func__ << " unsharing " << *e.blob << dendl; + bluestore_blob_t& blob = e.blob->dirty_blob(); + blob.clear_flag(bluestore_blob_t::FLAG_SHARED); + if (e.logical_offset < b_start) { + b_start = e.logical_offset; + } + if (e.logical_end() > b_end) { + b_end = e.logical_end(); + } + } + } + if (!unshared_blobs.empty()) { + h->extent_map.dirty_range(b_start, b_end); + txc->write_onode(h); + } + } + } return 0; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 97e3fccd0564..d73ae8945e64 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -388,7 +388,7 @@ public: /// put logical references, and get back any released extents void put_ref(uint64_t offset, uint32_t length, - PExtentVector *r); + PExtentVector *r, set<SharedBlob*> *maybe_unshared_blobs); friend bool operator==(const 
SharedBlob &l, const SharedBlob &r) { return l.get_sbid() == r.get_sbid(); @@ -1339,6 +1339,7 @@ public: void open_shared_blob(uint64_t sbid, BlobRef b); void load_shared_blob(SharedBlobRef sb); void make_blob_shared(uint64_t sbid, BlobRef b); + uint64_t make_blob_unshared(SharedBlob *sb); BlobRef new_blob() { BlobRef b = new Blob(); @@ -1557,6 +1558,10 @@ public: void write_shared_blob(SharedBlobRef &sb) { shared_blobs.insert(sb); } + void unshare_blob(SharedBlob *sb) { + shared_blobs.erase(sb); + } + /// note we logically modified object (when onode itself is unmodified) void note_modified_object(OnodeRef &o) { // onode itself isn't written, though @@ -2495,7 +2500,8 @@ private: TransContext *txc, CollectionRef& c, OnodeRef o, - WriteContext *wctx); + WriteContext *wctx, + set<SharedBlob*> *maybe_unshared_blobs=0); int _do_transaction(Transaction *t, TransContext *txc, @@ -2538,7 +2544,8 @@ private: void _do_truncate(TransContext *txc, CollectionRef& c, OnodeRef o, - uint64_t offset); + uint64_t offset, + set<SharedBlob*> *maybe_unshared_blobs=0); void _truncate(TransContext *txc, CollectionRef& c, OnodeRef& o, diff --git a/src/os/bluestore/bluestore_types.cc b/src/os/bluestore/bluestore_types.cc index 561605792b10..fdb14ebd86cb 100644 --- a/src/os/bluestore/bluestore_types.cc +++ b/src/os/bluestore/bluestore_types.cc @@ -196,10 +196,11 @@ void bluestore_extent_ref_map_t::get(uint64_t offset, uint32_t length) void bluestore_extent_ref_map_t::put( uint64_t offset, uint32_t length, - PExtentVector *release) + PExtentVector *release, + bool *maybe_unshared) { //NB: existing entries in 'release' container must be preserved! 
- + bool unshared = true; auto p = ref_map.lower_bound(offset); if (p == ref_map.end() || p->first > offset) { if (p == ref_map.begin()) { @@ -213,30 +214,42 @@ void bluestore_extent_ref_map_t::put( if (p->first < offset) { uint64_t left = p->first + p->second.length - offset; p->second.length = offset - p->first; + if (p->second.refs != 1) { + unshared = false; + } p = ref_map.insert(map<uint64_t,record_t>::value_type( offset, record_t(left, p->second.refs))).first; } while (length > 0) { assert(p->first == offset); if (length < p->second.length) { + if (p->second.refs != 1) { + unshared = false; + } ref_map.insert(make_pair(offset + length, record_t(p->second.length - length, p->second.refs))); if (p->second.refs > 1) { p->second.length = length; --p->second.refs; + if (p->second.refs != 1) { + unshared = false; + } _maybe_merge_left(p); } else { if (release) release->push_back(bluestore_pextent_t(p->first, length)); ref_map.erase(p); } - return; + goto out; } offset += p->second.length; length -= p->second.length; if (p->second.refs > 1) { --p->second.refs; + if (p->second.refs != 1) { + unshared = false; + } _maybe_merge_left(p); ++p; } else { @@ -248,6 +261,19 @@ if (p != ref_map.end()) _maybe_merge_left(p); //_check(); +out: + if (maybe_unshared) { + if (unshared) { + // we haven't seen a ref != 1 yet; check the whole map. 
+ for (auto& p : ref_map) { + if (p.second.refs != 1) { + unshared = false; + break; + } + } + } + *maybe_unshared = unshared; + } } bool bluestore_extent_ref_map_t::contains(uint64_t offset, uint32_t length) const diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 81c051b7ca13..654246fa6b91 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -223,7 +223,8 @@ struct bluestore_extent_ref_map_t { } void get(uint64_t offset, uint32_t len); - void put(uint64_t offset, uint32_t len, PExtentVector *release); + void put(uint64_t offset, uint32_t len, PExtentVector *release, + bool *maybe_unshared); bool contains(uint64_t offset, uint32_t len) const; bool intersects(uint64_t offset, uint32_t len) const; diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc index c6f6eeec0285..39c3918043ad 100644 --- a/src/test/objectstore/test_bluestore_types.cc +++ b/src/test/objectstore/test_bluestore_types.cc @@ -120,18 +120,22 @@ TEST(bluestore_extent_ref_map_t, put) { bluestore_extent_ref_map_t m; PExtentVector r; + bool maybe_unshared = false; m.get(10, 30); - m.put(10, 30, &r); - cout << m << " " << r << std::endl; + maybe_unshared = true; + m.put(10, 30, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; ASSERT_EQ(0u, m.ref_map.size()); ASSERT_EQ(1u, r.size()); ASSERT_EQ(10u, r[0].offset); ASSERT_EQ(30u, r[0].length); + ASSERT_TRUE(maybe_unshared); r.clear(); m.get(10, 30); m.get(20, 10); - m.put(10, 30, &r); - cout << m << " " << r << std::endl; + maybe_unshared = true; + m.put(10, 30, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; ASSERT_EQ(1u, m.ref_map.size()); ASSERT_EQ(10u, m.ref_map[20].length); ASSERT_EQ(1u, m.ref_map[20].refs); @@ -140,11 +144,13 @@ TEST(bluestore_extent_ref_map_t, put) ASSERT_EQ(10u, r[0].length); ASSERT_EQ(30u, r[1].offset); ASSERT_EQ(10u, 
r[1].length); + ASSERT_TRUE(maybe_unshared); r.clear(); m.get(30, 10); m.get(30, 10); - m.put(20, 15, &r); - cout << m << " " << r << std::endl; + maybe_unshared = true; + m.put(20, 15, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; ASSERT_EQ(2u, m.ref_map.size()); ASSERT_EQ(5u, m.ref_map[30].length); ASSERT_EQ(1u, m.ref_map[30].refs); @@ -153,9 +159,11 @@ TEST(bluestore_extent_ref_map_t, put) ASSERT_EQ(1u, r.size()); ASSERT_EQ(20u, r[0].offset); ASSERT_EQ(10u, r[0].length); + ASSERT_FALSE(maybe_unshared); r.clear(); - m.put(33, 5, &r); - cout << m << " " << r << std::endl; + maybe_unshared = true; + m.put(33, 5, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; ASSERT_EQ(3u, m.ref_map.size()); ASSERT_EQ(3u, m.ref_map[30].length); ASSERT_EQ(1u, m.ref_map[30].refs); @@ -166,6 +174,12 @@ TEST(bluestore_extent_ref_map_t, put) ASSERT_EQ(1u, r.size()); ASSERT_EQ(33u, r[0].offset); ASSERT_EQ(2u, r[0].length); + ASSERT_FALSE(maybe_unshared); + r.clear(); + maybe_unshared = true; + m.put(38, 2, &r, &maybe_unshared); + cout << m << " " << r << " " << (int)maybe_unshared << std::endl; + ASSERT_TRUE(maybe_unshared); } TEST(bluestore_extent_ref_map_t, contains)