From 1a39872b9e64801c8878703dc5d7b6799cceb74a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Sep 2016 17:54:46 -0400 Subject: [PATCH] os/bluestore: try to split blobs instead of spanning them If a blob crosses a shard boundary, try to split it into pieces instead of making it spanning. This is only possible under certain conditions (uncompressed, no in-flight writes, no unused space, shard boundary aligned with chunk boundary). Generally speaking this will reduce the number of blobs that end up spanning and reduce the onode key size. Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 188 ++++++++++++++++++- src/os/bluestore/BlueStore.h | 16 ++ src/os/bluestore/bluestore_types.h | 13 ++ src/test/objectstore/test_bluestore_types.cc | 74 ++++++++ 4 files changed, 284 insertions(+), 7 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 92ce9f5d905..b166305e9c3 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1071,6 +1071,50 @@ void BlueStore::BufferSpace::finish_write(uint64_t seq) cache->_audit("finish_write end"); } +void BlueStore::BufferSpace::split(size_t pos, BlueStore::BufferSpace &r) +{ + std::lock_guard lk(cache->lock); + assert(r.cache == cache); + auto p = buffer_map.begin(); + while (p != buffer_map.end() && + p->second->end() <= pos) { + dout(30) << __func__ << " skip " << *p->second << dendl; + ++p; + } + if (p != buffer_map.end()) { + if (p->second->offset < pos) { + dout(30) << __func__ << " cut " << *p->second << dendl; + size_t left = pos - p->second->offset; + size_t right = p->second->length - left; + if (p->second->data.length()) { + bufferlist bl; + bl.substr_of(p->second->data, left, right); + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, 0, bl), + 0, p->second.get()); + } else { + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, 0, right), + 0, p->second.get()); + } + p->second->truncate(left); + ++p; + } + while (p != buffer_map.end()) { + dout(30) << __func__ << " move " << *p->second << dendl; + if (p->second->data.length()) { + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->data), + 0, p->second.get()); + } else { + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->length), + 0, p->second.get()); + } + _rm_buffer(p++); + } + } + assert(writing_map.empty()); +} + // OnodeSpace #undef dout_prefix @@ -1382,6 +1426,62 @@ bool BlueStore::Blob::put_ref( return false; } +void BlueStore::Blob::split(size_t blob_offset, Blob *r) +{ + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " start " << *this << dendl; + assert(blob.can_split()); + bluestore_blob_t &lb = dirty_blob(); + bluestore_blob_t &rb = r->dirty_blob(); + + unsigned i = 0; + size_t left = blob_offset; + for (auto p = lb.extents.begin(); p != lb.extents.end(); ++p, ++i) { + if (p->length <= left) { + left -= p->length; + continue; + } + if (left) { + if (p->is_valid()) { + rb.extents.emplace_back(bluestore_pextent_t(p->offset + left, + p->length - left)); + } else { + rb.extents.emplace_back(bluestore_pextent_t( + bluestore_pextent_t::INVALID_OFFSET, + p->length - left)); + } + p->length = left; + ++i; + ++p; + } + while (p != lb.extents.end()) { + rb.extents.push_back(*p++); + } + lb.extents.resize(i); + break; + } + rb.flags = lb.flags; + + if (lb.has_csum()) { + rb.csum_type = lb.csum_type; + rb.csum_chunk_order = lb.csum_chunk_order; + size_t csum_order = lb.get_csum_chunk_size(); + assert(blob_offset % csum_order == 0); + size_t pos = (blob_offset / csum_order) * lb.get_csum_value_size(); + // deep copy csum data + bufferptr old; + old.swap(lb.csum_data); + rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos); + lb.csum_data = bufferptr(old.c_str(), pos); + } + + shared_blob->bc.split(blob_offset, r->shared_blob->bc); + + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " finish " << *this << dendl; + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " and " << *r << dendl; +} // Extent @@ -1583,10 +1683,43 @@ void BlueStore::ExtentMap::reshard(Onode *o) } if (ep->blob->id < 0 && ep->blob_escapes_range(shard_start, shard_end - shard_start)) { - ep->blob->id = bid++; - spanning_blob_map.insert(*ep->blob); - ep->blob->get(); - dout(20) << __func__ << " adding spanning " << *ep << dendl; + // We have two options: (1) split the blob into pieces at the + // shard boundaries (and adjust extents accordingly), or (2) + // mark it spanning. We prefer to cut the blob if we can. Note that + // we may have to split it multiple times--potentially at every + // shard boundary. + bool must_span = false; + BlobRef b = ep->blob; + if (b->can_split()) { + uint32_t bstart = ep->logical_offset - ep->blob_offset; + uint32_t bend = bstart + b->get_blob().get_logical_length(); + for (const auto& sh : shards) { + if (bstart < sh.offset && bend > sh.offset) { + uint32_t blob_offset = sh.offset - bstart; + if (b->get_blob().can_split_at(blob_offset)) { + dout(20) << __func__ << " splitting blob, bstart 0x" + << std::hex << bstart + << " blob_offset 0x" << blob_offset + << std::dec << " " << *b << dendl; + b = split_blob(b, blob_offset, sh.offset); + // switch b to the new right-hand side, in case it + // *also* has to get split. + bstart += blob_offset; + } else { + must_span = true; + break; + } + } + } + } else { + must_span = true; + } + if (must_span) { + b->id = bid++; + spanning_blob_map.insert(*b); + b->get(); + dout(20) << __func__ << " adding spanning " << *b << dendl; + } } ++ep; } @@ -2026,6 +2159,48 @@ BlueStore::Extent *BlueStore::ExtentMap::set_lextent( return le; } +BlueStore::BlobRef BlueStore::ExtentMap::split_blob( + BlobRef lb, + uint32_t blob_offset, + uint32_t pos) +{ + uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset; + dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos + << " blob_offset 0x" << blob_offset << std::dec + << " " << *lb << dendl; + BlobRef rb = onode->c->new_blob(); + lb->split(blob_offset, rb.get()); + + for (auto ep = seek_lextent(pos); + ep != extent_map.end() && ep->logical_offset < end_pos; + ++ep) { + if (ep->blob != lb) { + continue; + } + vector released; + if (ep->logical_offset < pos) { + // split extent + size_t left = pos - ep->logical_offset; + Extent *ne = new Extent(pos, 0, ep->length - left, ep->blob_depth, rb); + extent_map.insert(*ne); + lb->ref_map.put(ep->blob_offset + left, ep->length - left, &released); + ep->length = left; + rb->ref_map.get(ne->blob_offset, ne->length); + dout(30) << __func__ << " split " << *ep << dendl; + dout(30) << __func__ << " to " << *ne << dendl; + } else { + // switch blob + assert(ep->blob_offset >= blob_offset); + lb->ref_map.put(ep->blob_offset, ep->length, &released); + ep->blob = rb; + ep->blob_offset -= blob_offset; + rb->ref_map.get(ep->blob_offset, ep->length); + dout(30) << __func__ << " adjusted " << *ep << dendl; + } + } + return rb; +} + // Onode @@ -6029,9 +6204,8 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t) o->extent_map.reshard(o.get()); reshard = o->extent_map.update(o.get(), t, true); if (reshard) { - derr << __func__ << " warning: still wants reshard, check options?" - << dendl; - assert(0 == "reshard problem"); + dout(20) << __func__ << " warning: still wants reshard, check options?" + << dendl; } logger->inc(l_bluestore_onode_reshard); } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 9831a0b728f..a0fdddb7b96 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -287,6 +287,8 @@ public: discard(offset, (uint64_t)-1 - offset); } + void split(size_t pos, BufferSpace &r); + void dump(Formatter *f) const { std::lock_guard l(cache->lock); f->open_array_section("buffers"); @@ -434,6 +436,11 @@ public: return id >= 0; } + bool can_split() const { + // splitting a BufferSpace writing_map is too hard; don't try. + return shared_blob->bc.writing_map.empty() && get_blob().can_split(); + } + void dup(Blob& o) { o.shared_blob = shared_blob; o.blob = blob; @@ -471,6 +478,9 @@ public: bool put_ref(uint64_t offset, uint64_t length, uint64_t min_alloc_size, vector *r); + /// split the blob + void split(size_t blob_offset, Blob *o); + void get() { ++nref; } @@ -530,6 +540,10 @@ public: blob_offset; } + uint32_t end() const { + return logical_offset + length; + } + bool blob_escapes_range(uint32_t o, uint32_t l) { uint32_t bstart = logical_offset - blob_offset; return (bstart < o || @@ -649,6 +663,8 @@ public: uint64_t offset, uint64_t length, uint8_t blob_depth, BlobRef b, extent_map_t *old_extents); + /// split a blob (and referring extents) + BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos); }; struct OnodeSpace; diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 6e09d114c97..a6af3ccdf64 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -299,6 +299,19 @@ struct bluestore_blob_t { return csum_data.length() + extents.size() * 16 + 48; } + bool can_split() const { + return + !has_flag(FLAG_SHARED) && + !has_flag(FLAG_COMPRESSED) && + !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex + } + bool can_split_at(uint32_t blob_offset) const { + if (has_csum() && + blob_offset % get_csum_chunk_size() != 0) + return false; + return true; + } + void encode(bufferlist& bl) const; void decode(bufferlist::iterator& p); void dump(Formatter *f) const; diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc index f230b7c0e03..74e3b1ff908 100644 --- a/src/test/objectstore/test_bluestore_types.cc +++ b/src/test/objectstore/test_bluestore_types.cc @@ -667,6 +667,80 @@ TEST(Blob, put_ref) } } +TEST(bluestore_blob_t, can_split) +{ + bluestore_blob_t a; + a.flags = bluestore_blob_t::FLAG_MUTABLE; + ASSERT_TRUE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_SHARED; + ASSERT_FALSE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_COMPRESSED; + ASSERT_FALSE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_HAS_UNUSED; + ASSERT_FALSE(a.can_split()); +} + +TEST(bluestore_blob_t, can_split_at) +{ + bluestore_blob_t a; + a.flags = bluestore_blob_t::FLAG_MUTABLE; + a.extents.emplace_back(bluestore_pextent_t(0x10000, 0x2000)); + a.extents.emplace_back(bluestore_pextent_t(0x20000, 0x2000)); + ASSERT_TRUE(a.can_split_at(0x1000)); + ASSERT_TRUE(a.can_split_at(0x1800)); + a.init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x4000); + ASSERT_TRUE(a.can_split_at(0x1000)); + ASSERT_TRUE(a.can_split_at(0x2000)); + ASSERT_TRUE(a.can_split_at(0x3000)); + ASSERT_FALSE(a.can_split_at(0x2800)); +} + +TEST(Blob, split) +{ + BlueStore::Cache *cache = BlueStore::Cache::create("lru", NULL); + { + BlueStore::Blob L, R; + L.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + L.shared_blob->get(); // hack to avoid dtor from running + R.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + R.shared_blob->get(); // hack to avoid dtor from running + L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x2000)); + L.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000); + L.split(0x1000, &R); + ASSERT_EQ(0x1000u, L.get_blob().get_logical_length()); + ASSERT_EQ(4u, L.get_blob().csum_data.length()); + ASSERT_EQ(1u, L.get_blob().extents.size()); + ASSERT_EQ(0x2000u, L.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, L.get_blob().extents.front().length); + ASSERT_EQ(0x1000u, R.get_blob().get_logical_length()); + ASSERT_EQ(4u, R.get_blob().csum_data.length()); + ASSERT_EQ(1u, R.get_blob().extents.size()); + ASSERT_EQ(0x3000u, R.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, R.get_blob().extents.front().length); + } + { + BlueStore::Blob L, R; + L.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + L.shared_blob->get(); // hack to avoid dtor from running + R.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + R.shared_blob->get(); // hack to avoid dtor from running + L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x1000)); + L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x12000, 0x1000)); + L.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000); + L.split(0x1000, &R); + ASSERT_EQ(0x1000u, L.get_blob().get_logical_length()); + ASSERT_EQ(4u, L.get_blob().csum_data.length()); + ASSERT_EQ(1u, L.get_blob().extents.size()); + ASSERT_EQ(0x2000u, L.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, L.get_blob().extents.front().length); + ASSERT_EQ(0x1000u, R.get_blob().get_logical_length()); + ASSERT_EQ(4u, R.get_blob().csum_data.length()); + ASSERT_EQ(1u, R.get_blob().extents.size()); + ASSERT_EQ(0x12000u, R.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, R.get_blob().extents.front().length); + } +} + TEST(ExtentMap, find_lextent) { BlueStore::ExtentMap em(nullptr); -- 2.39.5