From: Igor Fedotov Date: Thu, 20 Feb 2020 10:43:04 +0000 (+0300) Subject: os/bluestore: implement deferred writes for big writes X-Git-Tag: v16.1.0~2618^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f7a2cc3dfea1d161b03401b6b5f1b44b68e46b5c;p=ceph.git os/bluestore: implement deferred writes for big writes Intended to ensure spinner's read performance for 4K min alloc size to be on par with the 64K one. Degradation is caused by additional fragmentation produced by partial overwrites when 4K min alloc size is set. E.g. consider a 4K partial overwrite in the middle of a previously written contiguous 64K blob. For 64K min alloc size the deferred write procedure comes in and preserves data continuity. When min alloc size is set to 4K, the overwritten blob is broken into 3 parts without this patch. This patch applies deferred write processing for that case and hence avoids blob continuity breakage, at the cost of a write performance drop though. Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 0937c6a4f7be..03a7a963ac2b 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4662,6 +4662,9 @@ void BlueStore::_init_logger() "Large aligned writes into fresh blobs (bytes)", NULL, 0, unit_t(UNIT_BYTES)); b.add_u64_counter(l_bluestore_write_big_blobs, "bluestore_write_big_blobs", "Large aligned writes into fresh blobs (blobs)"); + b.add_u64_counter(l_bluestore_write_big_deferred, + "bluestore_write_big_deferred", + "Big overwrites using deferred"); b.add_u64_counter(l_bluestore_write_small, "bluestore_write_small", "Small writes into existing or sparse small blobs"); b.add_u64_counter(l_bluestore_write_small_bytes, "bluestore_write_small_bytes", @@ -12950,11 +12953,10 @@ void BlueStore::_do_write_small( ++ep; } } - auto prev_ep = ep; - if (prev_ep != begin) { + auto prev_ep = end; + if (ep != begin) { + prev_ep = ep; --prev_ep; - } else { - prev_ep = end; // to avoid 
this extent check as it's a duplicate } boost::container::flat_set inspected_blobs; @@ -13107,7 +13109,7 @@ void BlueStore::_do_write_small( bl.claim_append(tail_bl); logger->inc(l_bluestore_write_penalty_read_ops); } - logger->inc(l_bluestore_write_small_pre_read); + logger->inc(l_bluestore_write_small_pre_read); _buffer_cache_write(txc, b, b_off, bl, wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); @@ -13282,8 +13284,8 @@ void BlueStore::_do_write_big( << dendl; logger->inc(l_bluestore_write_big); logger->inc(l_bluestore_write_big_bytes, length); - o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); auto max_bsize = std::max(wctx->target_blob_size, min_alloc_size); + uint64_t prefer_deferred_size_snapshot = prefer_deferred_size.load(); while (length > 0) { bool new_blob = false; uint32_t l = std::min(max_bsize, length); @@ -13292,16 +13294,109 @@ void BlueStore::_do_write_big( //attempting to reuse existing blob if (!wctx->compress) { - // look for an existing mutable blob we can reuse - auto begin = o->extent_map.extent_map.begin(); - auto end = o->extent_map.extent_map.end(); + // look for an existing mutable blob we can write into auto ep = o->extent_map.seek_lextent(offset); - auto prev_ep = ep; - if (prev_ep != begin) { + auto end = o->extent_map.extent_map.end(); + // First try if we can apply deferred write + if (prefer_deferred_size_snapshot && ep != end && + offset >= ep->blob_start() && + ep->blob->get_blob().is_mutable()) { + auto b0 = ep->blob; + auto b_off = offset - ep->blob_start(); + uint64_t chunk_size = b0->get_blob().get_chunk_size(block_size); + auto l_aligned = l; + + // read some data to fill out the chunk? 
+ uint64_t head_read = p2phase(b_off, chunk_size); + uint64_t tail_read = p2nphase(b_off + l, chunk_size); + if ((head_read || tail_read) && + (b0->get_blob().get_ondisk_length() >= + b_off + l + tail_read)) { + b_off = head_read; + l_aligned += head_read + tail_read; + } else { + head_read = tail_read = 0; + } + + if (l_aligned <= prefer_deferred_size_snapshot && + b_off % chunk_size == 0 && + l_aligned % chunk_size == 0 && + b0->get_blob().is_allocated(b_off, l_aligned)) { + dout(20) << __func__ << " " << *b0 + << " deferring big " << std::hex + << " (0x" << b_off << "~" << l_aligned << ")" + << std::dec << " write via deferred" + << dendl; + + bluestore_deferred_op_t *op = _get_deferred_op(txc); + op->op = bluestore_deferred_op_t::OP_WRITE; + int r = b0->get_blob().map( + b_off, l_aligned, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + ceph_assert(r == 0); + + dout(20) << __func__ << " reading head 0x" << std::hex << head_read + << " and tail 0x" << tail_read << std::dec << dendl; + if (head_read) { + int r = _do_read(c.get(), o, offset - head_read, head_read, + op->data, 0); + ceph_assert(r >= 0 && r <= (int)head_read); + size_t zlen = head_read - r; + if (zlen) { + op->data.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + logger->inc(l_bluestore_write_penalty_read_ops); + } + blp.copy(l, op->data); + + if (tail_read) { + bufferlist tail_bl; + int r = _do_read(c.get(), o, offset + l, tail_read, + tail_bl, 0); + ceph_assert(r >= 0 && r <= (int)tail_read); + size_t zlen = tail_read - r; + if (zlen) { + tail_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + op->data.claim_append(tail_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + + _buffer_cache_write(txc, b0, b_off, op->data, + wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); + + if (b0->get_blob().csum_type) { + b0->dirty_blob().calc_csum(b_off, op->data); + } + Extent *le = o->extent_map.set_lextent(c, offset, + offset - ep->blob_start(), l, b0, &wctx->old_extents); + txc->statfs_delta.stored() += le->length; + + offset += l; + length -= l; + logger->inc(l_bluestore_write_big_blobs); + logger->inc(l_bluestore_write_big_deferred); + + continue; + } + } + o->extent_map.punch_hole(c, offset, l, &wctx->old_extents); + + // seek again as punch_hole could invalidate ep + ep = o->extent_map.seek_lextent(offset); + auto begin = o->extent_map.extent_map.begin(); + auto prev_ep = end; + if (ep != begin) { + prev_ep = ep; --prev_ep; - } else { - prev_ep = end; // to avoid this extent check as it's a duplicate } + dout(20) << __func__ << " no deferred" << dendl; + auto min_off = offset >= max_bsize ? offset - max_bsize : 0; // search suitable extent in both forward and reverse direction in // [offset - target_max_blob_size, offset + target_max_blob_size] range @@ -13310,12 +13405,16 @@ void BlueStore::_do_write_big( do { any_change = false; if (ep != end && ep->logical_offset < offset + max_bsize) { - if (offset >= ep->blob_start() && + dout(20) << __func__ << " considering " << *ep << dendl; + dout(20) << __func__ << " considering " << *(ep->blob) + << " bstart 0x" << std::hex << ep->blob_start() << std::dec << dendl; + + if (offset >= ep->blob_start() && ep->blob->can_reuse_blob(min_alloc_size, max_bsize, offset - ep->blob_start(), &l)) { b = ep->blob; - b_off = offset - ep->blob_start(); + b_off = offset - ep->blob_start(); prev_ep = end; // to avoid check below dout(20) << __func__ << " reuse blob " << *b << std::hex << " (0x" << b_off << "~" << l << ")" << std::dec << dendl; @@ -13326,7 +13425,10 @@ void BlueStore::_do_write_big( } if (prev_ep != end && prev_ep->logical_offset >= min_off) { - if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize, + dout(20) << __func__ << " considering rev " << *prev_ep << 
dendl; + dout(20) << __func__ << " considering reverse " << *(prev_ep->blob) + << " bstart 0x" << std::hex << prev_ep->blob_start() << std::dec << dendl; + if (prev_ep->blob->can_reuse_blob(min_alloc_size, max_bsize, offset - prev_ep->blob_start(), &l)) { b = prev_ep->blob; @@ -13341,13 +13443,15 @@ void BlueStore::_do_write_big( } } } while (b == nullptr && any_change); - } + } else { + o->extent_map.punch_hole(c, offset, l, &wctx->old_extents); + } // if (!wctx->compress) + if (b == nullptr) { b = c->new_blob(); b_off = 0; new_blob = true; } - bufferlist t; blp.copy(l, t); wctx->write(offset, b, l, b_off, t, b_off, l, false, new_blob); @@ -13947,8 +14051,12 @@ int BlueStore::_do_write( << " 0x" << std::hex << offset << "~" << length << " - have 0x" << o->onode.size << " (" << std::dec << o->onode.size << ")" - << " bytes" - << " fadvise_flags 0x" << std::hex << fadvise_flags << std::dec + << " bytes" << std::hex + << " fadvise_flags 0x" << fadvise_flags + << " alloc_hint 0x" << o->onode.alloc_hint_flags + << " expected_object_size " << o->onode.expected_object_size + << " expected_write_size " << o->onode.expected_write_size + << std::dec << dendl; _dump_onode<30>(cct, *o); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index b680774ba35c..f869f527bb9b 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -118,6 +118,7 @@ enum { l_bluestore_write_big, l_bluestore_write_big_bytes, l_bluestore_write_big_blobs, + l_bluestore_write_big_deferred, l_bluestore_write_small, l_bluestore_write_small_bytes, l_bluestore_write_small_unused, diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index d83b8bc94f1d..32750ffb2d1a 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -597,21 +597,20 @@ public: ceph_assert(p != extents.end()); while (b_off >= p->length) { b_off -= p->length; - ++p; - ceph_assert(p != extents.end()); + if (++p == extents.end()) + 
return false; } b_len += b_off; while (b_len) { - ceph_assert(p != extents.end()); if (require_allocated != p->is_valid()) { return false; } - if (p->length >= b_len) { return true; } b_len -= p->length; - ++p; + if (++p == extents.end()) + return false; } ceph_abort_msg("we should not get here"); return false; diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index c3e5c021c241..eceecb00d37c 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -6674,6 +6674,249 @@ TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwrite) { } } +TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) { + + if (string(GetParam()) != "bluestore") + return; + + size_t block_size = 4096; + StartDeferred(block_size); + SetVal(g_conf(), "bluestore_max_blob_size", "65536"); + SetVal(g_conf(), "bluestore_prefer_deferred_size", "32768"); + + g_conf().apply_changes(nullptr); + + int r; + coll_t cid; + ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, "")); + ghobject_t hoid2(hobject_t("test2", "", CEPH_NOSNAP, 0, -1, "")); + + const PerfCounters* logger = store->get_perf_counters(); + + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + bufferlist bl, bl2; + + bl.append(std::string(block_size * 2, 'c')); + bl2.append(std::string(block_size * 3, 'd')); + + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + t.set_alloc_hint(cid, hoid2, block_size * 4, block_size * 4, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ); + t.write(cid, hoid2, 0, bl2.length(), bl2, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), 2u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 0u); + + { + struct store_statfs_t statfs; + int r = 
store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5); + ASSERT_LE(statfs.allocated, (unsigned)block_size * 5); + } + + // overwrite at the beginning, 4K alignment + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'b')); + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), 3u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 1u); + + { + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'b')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid, block_size, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'c')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + + // overwrite at 4K, 12K alignment + { + ObjectStore::Transaction t; + bufferlist bl; + + bl.append(std::string(block_size, 'e')); + t.write(cid, hoid2, block_size , bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), 4u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u); + + // makes sure deferred has been submitted + // and do all the checks again + sleep(g_conf().get_val("bluestore_max_defer_interval") + 2); + + ASSERT_EQ(logger->get(l_bluestore_write_big), 4u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u); + + { + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'b')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid, block_size, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 
'c')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid2, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'd')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid2, block_size, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'e')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid2, block_size * 2, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'd')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 5); + ASSERT_LE(statfs.allocated, (unsigned)block_size * 5); + } + ASSERT_EQ(logger->get(l_bluestore_blobs), 2u); + ASSERT_EQ(logger->get(l_bluestore_extents), 2u); + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(block_size * 2, 'f')); + + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), 5u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 2u); + + { + ObjectStore::Transaction t; + t.zero(cid, hoid, 0, 100); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid, 0, 100, bl); + ASSERT_EQ(r, (int)100); + expected.append(string(100, 0)); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid, 100, block_size * 2 - 100, bl); + ASSERT_EQ(r, (int)block_size * 2 - 100); + expected.append(string(block_size * 2 - 100, 'f')); + 
ASSERT_TRUE(bl_eq(expected, bl)); + } + sleep(2); + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2 - 100); + ASSERT_LE(statfs.allocated, (unsigned)block_size * 2); + } + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 1u); + + { + ObjectStore::Transaction t; + bufferlist bl; + bl.append(std::string(block_size, 'g')); + + t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + ASSERT_EQ(logger->get(l_bluestore_write_big), 6u); + ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 3u); + { + bufferlist bl, expected; + r = store->read(ch, hoid, 0, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'g')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + { + bufferlist bl, expected; + r = store->read(ch, hoid, block_size, block_size, bl); + ASSERT_EQ(r, (int)block_size); + expected.append(string(block_size, 'f')); + ASSERT_TRUE(bl_eq(expected, bl)); + } + + { + struct store_statfs_t statfs; + int r = store->statfs(&statfs); + ASSERT_EQ(r, 0); + ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 2); + ASSERT_LE(statfs.allocated, (unsigned)block_size * 2); + } + ASSERT_EQ(logger->get(l_bluestore_blobs), 1u); + ASSERT_EQ(logger->get(l_bluestore_extents), 1u); + + { + ObjectStore::Transaction t; + t.remove(cid, hoid); + t.remove(cid, hoid2); + t.remove_collection(cid); + cerr << "Cleaning" << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } +} + TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwriteReverse) { if (string(GetParam()) != "bluestore")