From: Sage Weil Date: Wed, 28 Sep 2016 21:54:46 +0000 (-0400) Subject: os/bluestore: try to split blobs instead of spanning them X-Git-Tag: v11.0.1~15^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1a39872b9e64801c8878703dc5d7b6799cceb74a;p=ceph.git os/bluestore: try to split blobs instead of spanning them If a blob crosses a shard boundary, try to split it into pieces instead of making it spanning. This is only possible under certain conditions (uncompressed, no in-flight writes, no unused space, shard boundary aligned with chunk boundary). Generally speaking this will reduce the number of blobs that end up spanning and reduce the onode key size. Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 92ce9f5d905b..b166305e9c3d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1071,6 +1071,50 @@ void BlueStore::BufferSpace::finish_write(uint64_t seq) cache->_audit("finish_write end"); } +void BlueStore::BufferSpace::split(size_t pos, BlueStore::BufferSpace &r) +{ + std::lock_guard lk(cache->lock); + assert(r.cache == cache); + auto p = buffer_map.begin(); + while (p != buffer_map.end() && + p->second->end() <= pos) { + dout(30) << __func__ << " skip " << *p->second << dendl; + ++p; + } + if (p != buffer_map.end()) { + if (p->second->offset < pos) { + dout(30) << __func__ << " cut " << *p->second << dendl; + size_t left = pos - p->second->offset; + size_t right = p->second->length - left; + if (p->second->data.length()) { + bufferlist bl; + bl.substr_of(p->second->data, left, right); + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, 0, bl), + 0, p->second.get()); + } else { + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, 0, right), + 0, p->second.get()); + } + p->second->truncate(left); + ++p; + } + while (p != buffer_map.end()) { + dout(30) << __func__ << " move " << *p->second << dendl; + if (p->second->data.length()) { + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->data), + 0, p->second.get()); + } else { + r._add_buffer(new Buffer(&r, p->second->state, p->second->seq, + p->second->offset - pos, p->second->length), + 0, p->second.get()); + } + _rm_buffer(p++); + } + } + assert(writing_map.empty()); +} + // OnodeSpace #undef dout_prefix @@ -1382,6 +1426,62 @@ bool BlueStore::Blob::put_ref( return false; } +void BlueStore::Blob::split(size_t blob_offset, Blob *r) +{ + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " start " << *this << dendl; + assert(blob.can_split()); + bluestore_blob_t &lb = dirty_blob(); + bluestore_blob_t &rb = r->dirty_blob(); + + unsigned i = 0; + size_t left = blob_offset; + for (auto p = lb.extents.begin(); p != lb.extents.end(); ++p, ++i) { + if (p->length <= left) { + left -= p->length; + continue; + } + if (left) { + if (p->is_valid()) { + rb.extents.emplace_back(bluestore_pextent_t(p->offset + left, + p->length - left)); + } else { + rb.extents.emplace_back(bluestore_pextent_t( + bluestore_pextent_t::INVALID_OFFSET, + p->length - left)); + } + p->length = left; + ++i; + ++p; + } + while (p != lb.extents.end()) { + rb.extents.push_back(*p++); + } + lb.extents.resize(i); + break; + } + rb.flags = lb.flags; + + if (lb.has_csum()) { + rb.csum_type = lb.csum_type; + rb.csum_chunk_order = lb.csum_chunk_order; + size_t csum_order = lb.get_csum_chunk_size(); + assert(blob_offset % csum_order == 0); + size_t pos = (blob_offset / csum_order) * lb.get_csum_value_size(); + // deep copy csum data + bufferptr old; + old.swap(lb.csum_data); + rb.csum_data = bufferptr(old.c_str() + pos, old.length() - pos); + lb.csum_data = bufferptr(old.c_str(), pos); + } + + shared_blob->bc.split(blob_offset, r->shared_blob->bc); + + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " finish " << *this << dendl; + dout(10) << __func__ << " 0x" << std::hex << blob_offset << std::dec + << " and " << *r << dendl; +} // Extent @@ -1583,10 +1683,43 @@ void BlueStore::ExtentMap::reshard(Onode *o) } if (ep->blob->id < 0 && ep->blob_escapes_range(shard_start, shard_end - shard_start)) { - ep->blob->id = bid++; - spanning_blob_map.insert(*ep->blob); - ep->blob->get(); - dout(20) << __func__ << " adding spanning " << *ep << dendl; + // We have two options: (1) split the blob into pieces at the + // shard boundaries (and adjust extents accordingly), or (2) + // mark it spanning. We prefer to cut the blob if we can. Note that + // we may have to split it multiple times--potentially at every + // shard boundary. + bool must_span = false; + BlobRef b = ep->blob; + if (b->can_split()) { + uint32_t bstart = ep->logical_offset - ep->blob_offset; + uint32_t bend = bstart + b->get_blob().get_logical_length(); + for (const auto& sh : shards) { + if (bstart < sh.offset && bend > sh.offset) { + uint32_t blob_offset = sh.offset - bstart; + if (b->get_blob().can_split_at(blob_offset)) { + dout(20) << __func__ << " splitting blob, bstart 0x" + << std::hex << bstart + << " blob_offset 0x" << blob_offset + << std::dec << " " << *b << dendl; + b = split_blob(b, blob_offset, sh.offset); + // switch b to the new right-hand side, in case it + // *also* has to get split. + bstart += blob_offset; + } else { + must_span = true; + break; + } + } + } + } else { + must_span = true; + } + if (must_span) { + b->id = bid++; + spanning_blob_map.insert(*b); + b->get(); + dout(20) << __func__ << " adding spanning " << *b << dendl; + } } ++ep; } @@ -2026,6 +2159,48 @@ BlueStore::Extent *BlueStore::ExtentMap::set_lextent( return le; } +BlueStore::BlobRef BlueStore::ExtentMap::split_blob( + BlobRef lb, + uint32_t blob_offset, + uint32_t pos) +{ + uint32_t end_pos = pos + lb->get_blob().get_logical_length() - blob_offset; + dout(20) << __func__ << " 0x" << std::hex << pos << " end 0x" << end_pos + << " blob_offset 0x" << blob_offset << std::dec + << " " << *lb << dendl; + BlobRef rb = onode->c->new_blob(); + lb->split(blob_offset, rb.get()); + + for (auto ep = seek_lextent(pos); + ep != extent_map.end() && ep->logical_offset < end_pos; + ++ep) { + if (ep->blob != lb) { + continue; + } + vector released; + if (ep->logical_offset < pos) { + // split extent + size_t left = pos - ep->logical_offset; + Extent *ne = new Extent(pos, 0, ep->length - left, ep->blob_depth, rb); + extent_map.insert(*ne); + lb->ref_map.put(ep->blob_offset + left, ep->length - left, &released); + ep->length = left; + rb->ref_map.get(ne->blob_offset, ne->length); + dout(30) << __func__ << " split " << *ep << dendl; + dout(30) << __func__ << " to " << *ne << dendl; + } else { + // switch blob + assert(ep->blob_offset >= blob_offset); + lb->ref_map.put(ep->blob_offset, ep->length, &released); + ep->blob = rb; + ep->blob_offset -= blob_offset; + rb->ref_map.get(ep->blob_offset, ep->length); + dout(30) << __func__ << " adjusted " << *ep << dendl; + } + } + return rb; +} + // Onode @@ -6029,9 +6204,8 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t) o->extent_map.reshard(o.get()); reshard = o->extent_map.update(o.get(), t, true); if (reshard) { - derr << __func__ << " warning: still wants reshard, check options?" - << dendl; - assert(0 == "reshard problem"); + dout(20) << __func__ << " warning: still wants reshard, check options?" + << dendl; } logger->inc(l_bluestore_onode_reshard); } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 9831a0b728f8..a0fdddb7b96d 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -287,6 +287,8 @@ public: discard(offset, (uint64_t)-1 - offset); } + void split(size_t pos, BufferSpace &r); + void dump(Formatter *f) const { std::lock_guard l(cache->lock); f->open_array_section("buffers"); @@ -434,6 +436,11 @@ public: return id >= 0; } + bool can_split() const { + // splitting a BufferSpace writing_map is too hard; don't try. + return shared_blob->bc.writing_map.empty() && get_blob().can_split(); + } + void dup(Blob& o) { o.shared_blob = shared_blob; o.blob = blob; @@ -471,6 +478,9 @@ public: bool put_ref(uint64_t offset, uint64_t length, uint64_t min_alloc_size, vector *r); + /// split the blob + void split(size_t blob_offset, Blob *o); + void get() { ++nref; } @@ -530,6 +540,10 @@ public: blob_offset; } + uint32_t end() const { + return logical_offset + length; + } + bool blob_escapes_range(uint32_t o, uint32_t l) { uint32_t bstart = logical_offset - blob_offset; return (bstart < o || @@ -649,6 +663,8 @@ public: uint64_t offset, uint64_t length, uint8_t blob_depth, BlobRef b, extent_map_t *old_extents); + /// split a blob (and referring extents) + BlobRef split_blob(BlobRef lb, uint32_t blob_offset, uint32_t pos); }; struct OnodeSpace; diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 6e09d114c97e..a6af3ccdf640 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -299,6 +299,19 @@ struct bluestore_blob_t { return csum_data.length() + extents.size() * 16 + 48; } + bool can_split() const { + return + !has_flag(FLAG_SHARED) && + !has_flag(FLAG_COMPRESSED) && + !has_flag(FLAG_HAS_UNUSED); // splitting unused set is complex + } + bool can_split_at(uint32_t blob_offset) const { + if (has_csum() && + blob_offset % get_csum_chunk_size() != 0) + return false; + return true; + } + void encode(bufferlist& bl) const; void decode(bufferlist::iterator& p); void dump(Formatter *f) const; diff --git a/src/test/objectstore/test_bluestore_types.cc b/src/test/objectstore/test_bluestore_types.cc index f230b7c0e038..74e3b1ff908c 100644 --- a/src/test/objectstore/test_bluestore_types.cc +++ b/src/test/objectstore/test_bluestore_types.cc @@ -667,6 +667,80 @@ TEST(Blob, put_ref) } } +TEST(bluestore_blob_t, can_split) +{ + bluestore_blob_t a; + a.flags = bluestore_blob_t::FLAG_MUTABLE; + ASSERT_TRUE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_SHARED; + ASSERT_FALSE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_COMPRESSED; + ASSERT_FALSE(a.can_split()); + a.flags = bluestore_blob_t::FLAG_HAS_UNUSED; + ASSERT_FALSE(a.can_split()); +} + +TEST(bluestore_blob_t, can_split_at) +{ + bluestore_blob_t a; + a.flags = bluestore_blob_t::FLAG_MUTABLE; + a.extents.emplace_back(bluestore_pextent_t(0x10000, 0x2000)); + a.extents.emplace_back(bluestore_pextent_t(0x20000, 0x2000)); + ASSERT_TRUE(a.can_split_at(0x1000)); + ASSERT_TRUE(a.can_split_at(0x1800)); + a.init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x4000); + ASSERT_TRUE(a.can_split_at(0x1000)); + ASSERT_TRUE(a.can_split_at(0x2000)); + ASSERT_TRUE(a.can_split_at(0x3000)); + ASSERT_FALSE(a.can_split_at(0x2800)); +} + +TEST(Blob, split) +{ + BlueStore::Cache *cache = BlueStore::Cache::create("lru", NULL); + { + BlueStore::Blob L, R; + L.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + L.shared_blob->get(); // hack to avoid dtor from running + R.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + R.shared_blob->get(); // hack to avoid dtor from running + L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x2000)); + L.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000); + L.split(0x1000, &R); + ASSERT_EQ(0x1000u, L.get_blob().get_logical_length()); + ASSERT_EQ(4u, L.get_blob().csum_data.length()); + ASSERT_EQ(1u, L.get_blob().extents.size()); + ASSERT_EQ(0x2000u, L.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, L.get_blob().extents.front().length); + ASSERT_EQ(0x1000u, R.get_blob().get_logical_length()); + ASSERT_EQ(4u, R.get_blob().csum_data.length()); + ASSERT_EQ(1u, R.get_blob().extents.size()); + ASSERT_EQ(0x3000u, R.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, R.get_blob().extents.front().length); + } + { + BlueStore::Blob L, R; + L.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + L.shared_blob->get(); // hack to avoid dtor from running + R.shared_blob = new BlueStore::SharedBlob(-1, string(), cache); + R.shared_blob->get(); // hack to avoid dtor from running + L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x2000, 0x1000)); + L.dirty_blob().extents.emplace_back(bluestore_pextent_t(0x12000, 0x1000)); + L.dirty_blob().init_csum(bluestore_blob_t::CSUM_CRC32C, 12, 0x2000); + L.split(0x1000, &R); + ASSERT_EQ(0x1000u, L.get_blob().get_logical_length()); + ASSERT_EQ(4u, L.get_blob().csum_data.length()); + ASSERT_EQ(1u, L.get_blob().extents.size()); + ASSERT_EQ(0x2000u, L.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, L.get_blob().extents.front().length); + ASSERT_EQ(0x1000u, R.get_blob().get_logical_length()); + ASSERT_EQ(4u, R.get_blob().csum_data.length()); + ASSERT_EQ(1u, R.get_blob().extents.size()); + ASSERT_EQ(0x12000u, R.get_blob().extents.front().offset); + ASSERT_EQ(0x1000u, R.get_blob().extents.front().length); + } +} + TEST(ExtentMap, find_lextent) { BlueStore::ExtentMap em(nullptr);