From: Igor Fedotov
Date: Wed, 11 Mar 2020 16:10:12 +0000 (+0300)
Subject: os/bluestore: improve deferred big writes.
X-Git-Tag: v16.1.0~2618^2~4
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=754e079fc3ee1513f10910765315929f422b0adf;p=ceph.git

os/bluestore: improve deferred big writes.

Signed-off-by: Igor Fedotov
---

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 436472a87088..ba73687728d3 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -13270,6 +13270,115 @@ void BlueStore::_do_write_small(
   return;
 }
 
+bool BlueStore::BigDeferredWriteContext::can_defer(
+  BlueStore::extent_map_t::iterator ep,
+  uint64_t prefer_deferred_size,
+  uint64_t block_size,
+  uint64_t offset,
+  uint64_t l)
+{
+  bool res = false;
+  auto& blob = ep->blob->get_blob();
+  if (offset >= ep->blob_start() &&
+      blob.is_mutable()) {
+    off = offset;
+    b_off = offset - ep->blob_start();
+    uint64_t chunk_size = blob.get_chunk_size(block_size);
+    uint64_t ondisk = blob.get_ondisk_length();
+    used = std::min(l, ondisk - b_off);
+
+    // will we need to read some data to fill out the chunk?
+    head_read = p2phase(b_off, chunk_size);
+    tail_read = p2nphase(b_off + used, chunk_size);
+    b_off -= head_read;
+
+    ceph_assert(b_off % chunk_size == 0);
+    ceph_assert(blob_aligned_len() % chunk_size == 0);
+
+    res = blob_aligned_len() <= prefer_deferred_size &&
+          blob_aligned_len() <= ondisk &&
+          blob.is_allocated(b_off, blob_aligned_len());
+  }
+  return res;
+}
+
+bool BlueStore::BigDeferredWriteContext::apply_defer(
+  BlueStore::extent_map_t::iterator ep)
+{
+  int r = ep->blob->get_blob().map(
+    b_off, blob_aligned_len(),
+    [&](const bluestore_pextent_t& pext,
+        uint64_t offset,
+        uint64_t length) {
+      // apply deferred only if the overwrite merely breaks blob continuity;
+      // if it totally overlaps some pextent - fall back to regular write
+      if (pext.offset < offset ||
+          pext.end() > offset + length) {
+        res_extents.emplace_back(bluestore_pextent_t(offset, length));
+        return 0;
+      }
+      return -1;
+    });
+  return r >= 0;
+}
+
+void BlueStore::_do_write_big_apply_deferred(
+  TransContext* txc,
+  CollectionRef& c,
+  OnodeRef o,
+  BlueStore::extent_map_t::iterator ep,
+  BlueStore::BigDeferredWriteContext& dctx,
+  bufferlist::iterator& blp,
+  WriteContext* wctx)
+{
+  bluestore_deferred_op_t* op = _get_deferred_op(txc);
+  op->op = bluestore_deferred_op_t::OP_WRITE;
+  op->extents.swap(dctx.res_extents);
+
+  dout(20) << __func__ << " reading head 0x" << std::hex << dctx.head_read
+           << " and tail 0x" << dctx.tail_read << std::dec << dendl;
+  if (dctx.head_read) {
+    int r = _do_read(c.get(), o,
+                     dctx.off - dctx.head_read,
+                     dctx.head_read,
+                     op->data,
+                     0);
+    ceph_assert(r >= 0 && r <= (int)dctx.head_read);
+    size_t zlen = dctx.head_read - r;
+    if (zlen) {
+      op->data.append_zero(zlen);
+      logger->inc(l_bluestore_write_pad_bytes, zlen);
+    }
+    logger->inc(l_bluestore_write_penalty_read_ops);
+  }
+  blp.copy(dctx.used, op->data);
+
+  if (dctx.tail_read) {
+    bufferlist tail_bl;
+    int r = _do_read(c.get(), o,
+                     dctx.off + dctx.used, dctx.tail_read,
+                     tail_bl, 0);
+    ceph_assert(r >= 0 && r <= (int)dctx.tail_read);
+    size_t zlen = dctx.tail_read - r;
+    if (zlen) {
+      tail_bl.append_zero(zlen);
+      logger->inc(l_bluestore_write_pad_bytes, zlen);
+    }
+    op->data.claim_append(tail_bl);
+    logger->inc(l_bluestore_write_penalty_read_ops);
+  }
+  auto b0 = ep->blob;
+  _buffer_cache_write(txc, b0, dctx.b_off, op->data,
+                      wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
+
+  if (b0->get_blob().csum_type) {
+    b0->dirty_blob().calc_csum(dctx.b_off, op->data);
+  }
+  Extent* le = o->extent_map.set_lextent(c, dctx.off,
+    dctx.off - ep->blob_start(), dctx.used, b0, &wctx->old_extents);
+  txc->statfs_delta.stored() += le->length;
+}
+
 void BlueStore::_do_write_big(
   TransContext *txc,
   CollectionRef &c,
@@ -13294,116 +13403,96 @@ void BlueStore::_do_write_big(
   //attempting to reuse existing blob
   if (!wctx->compress) {
-    // look for an existing mutable blob we can write into
-    auto ep = o->extent_map.seek_lextent(offset);
     auto end = o->extent_map.extent_map.end();
-    // First try if we can apply deferred write
-    if (prefer_deferred_size_snapshot && ep != end &&
-        offset >= ep->blob_start() &&
-        ep->blob->get_blob().is_mutable()) {
-      auto b0 = ep->blob;
-      auto b_off = offset - ep->blob_start();
-      uint64_t chunk_size = b0->get_blob().get_chunk_size(block_size);
-      auto l_aligned = l;
-
-      // read some data to fill out the chunk?
-      uint64_t head_read = p2phase(b_off, chunk_size);
-      uint64_t tail_read = p2nphase(b_off + l, chunk_size);
-      if ((head_read || tail_read) &&
-          (b0->get_blob().get_ondisk_length() >=
-           b_off + l + tail_read)) {
-        b_off -= head_read;
-        l_aligned += head_read + tail_read;
-      } else {
-        head_read = tail_read = 0;
-      }
-      if (l_aligned <= prefer_deferred_size_snapshot &&
-          b_off % chunk_size == 0 &&
-          l_aligned % chunk_size == 0 &&
-          b0->get_blob().is_allocated(b_off, l_aligned)) {
-        dout(20) << __func__ << " " << *b0
+    if (prefer_deferred_size_snapshot &&
+        l <= prefer_deferred_size_snapshot * 2) {
+      // A single write that spans two adjacent existing blobs can result
+      // in up to two deferred blocks of 'prefer_deferred_size'.
+      // So we try to minimize the number of resulting blobs and preserve
+      // the two existing blobs rather than insert one more in between.
+      // E.g. for a write of 0x10000~20000 over existing blobs
+      // 0x0~20000 and 0x20000~20000 it is better (in terms of subsequent
+      // read performance) to issue two deferred writes to the existing
+      // blobs than to end up with three blobs:
+      // 0x0~10000, 0x10000~20000, 0x30000~10000
+
+      // look for an existing mutable blob we can write into
+      auto ep = o->extent_map.seek_lextent(offset);
+      auto ep_next = end;
+      BigDeferredWriteContext head_info, tail_info;
+
+      bool will_defer = ep != end ?
+        head_info.can_defer(ep,
+                            prefer_deferred_size_snapshot,
+                            block_size,
+                            offset,
+                            l) :
+        false;
+      auto offset_next = offset + head_info.used;
+      auto remaining = l - head_info.used;
+
+      if (will_defer && remaining) {
+        will_defer = false;
+        if (remaining <= prefer_deferred_size_snapshot) {
+          ep_next = o->extent_map.seek_lextent(offset_next);
+          // check if we can defer the remaining part entirely
+          will_defer = ep_next == end ?
+            false :
+            tail_info.can_defer(ep_next,
+                                prefer_deferred_size_snapshot,
+                                block_size,
+                                offset_next,
+                                remaining);
+
+          will_defer = will_defer && remaining == tail_info.used;
+        }
+      }
+      if (will_defer) {
+        dout(20) << __func__ << " " << *(ep->blob)
                  << " deferring big " << std::hex
-                 << " (0x" << b_off << "~" << l_aligned << ")"
+                 << " (0x" << head_info.b_off << "~" << head_info.blob_aligned_len() << ")"
                  << std::dec << " write via deferred"
                  << dendl;
+        if (remaining) {
+          dout(20) << __func__ << " " << *(ep_next->blob)
+                   << " deferring big " << std::hex
+                   << " (0x" << tail_info.b_off << "~" << tail_info.blob_aligned_len() << ")"
+                   << std::dec << " write via deferred"
+                   << dendl;
+        }
 
-        PExtentVector extents;
-        int r = b0->get_blob().map(
-          b_off, l_aligned,
-          [&](const bluestore_pextent_t& pext,
-              uint64_t offset,
-              uint64_t length) {
-            // apply deferred if overwrite breaks blob continuity only.
-            // if it totally overlaps some pextent - fallback to regular write
-            if (pext.offset < offset ||
-                pext.end() > offset + length) {
-              extents.emplace_back(bluestore_pextent_t(offset, length));
-              return 0;
-            }
-            return -1;
-          });
-        if (r < 0) {
+        will_defer = head_info.apply_defer(ep);
+        if (!will_defer) {
           dout(20) << __func__
-                   << " deferring big fell back"
+                   << " deferring big fell back, head isn't continuous"
                    << dendl;
-        } else {
-          bluestore_deferred_op_t *op = _get_deferred_op(txc);
-          op->op = bluestore_deferred_op_t::OP_WRITE;
-          op->extents.swap(extents);
-
-          dout(20) << __func__ << " reading head 0x" << std::hex << head_read
-                   << " and tail 0x" << tail_read << std::dec << dendl;
-          if (head_read) {
-            int r = _do_read(c.get(), o, offset - head_read, head_read,
-                             op->data, 0);
-            ceph_assert(r >= 0 && r <= (int)head_read);
-            size_t zlen = head_read - r;
-            if (zlen) {
-              op->data.append_zero(zlen);
-              logger->inc(l_bluestore_write_pad_bytes, zlen);
-            }
-            logger->inc(l_bluestore_write_penalty_read_ops);
-          }
-          blp.copy(l, op->data);
-
-          if (tail_read) {
-            bufferlist tail_bl;
-            int r = _do_read(c.get(), o, offset + l, tail_read,
-                             tail_bl, 0);
-            ceph_assert(r >= 0 && r <= (int)tail_read);
-            size_t zlen = tail_read - r;
-            if (zlen) {
-              tail_bl.append_zero(zlen);
-              logger->inc(l_bluestore_write_pad_bytes, zlen);
-            }
-            op->data.claim_append(tail_bl);
-            logger->inc(l_bluestore_write_penalty_read_ops);
-          }
-
-          _buffer_cache_write(txc, b0, b_off, op->data,
-                              wctx->buffered ? 0 : Buffer::FLAG_NOCACHE);
-
-          if (b0->get_blob().csum_type) {
-            b0->dirty_blob().calc_csum(b_off, op->data);
+        } else if (remaining) {
+          will_defer = tail_info.apply_defer(ep_next);
+          if (!will_defer) {
+            dout(20) << __func__
+                     << " deferring big fell back, tail isn't continuous"
+                     << dendl;
           }
-          Extent *le = o->extent_map.set_lextent(c, offset,
-            offset - ep->blob_start(), l, b0, &wctx->old_extents);
-          txc->statfs_delta.stored() += le->length;
-
-          offset += l;
-          length -= l;
-          logger->inc(l_bluestore_write_big_blobs);
-          logger->inc(l_bluestore_write_big_deferred);
-
-          continue;
-        }
       }
+      }
+      if (will_defer) {
+        _do_write_big_apply_deferred(txc, c, o, ep, head_info, blp, wctx);
+        if (remaining) {
+          _do_write_big_apply_deferred(txc, c, o, ep_next, tail_info,
+                                       blp, wctx);
+        }
+        offset += l;
+        length -= l;
+        logger->inc(l_bluestore_write_big_blobs, remaining ? 2 : 1);
+        logger->inc(l_bluestore_write_big_deferred, remaining ? 2 : 1);
+        continue;
+      }
     }
+
     o->extent_map.punch_hole(c, offset, l, &wctx->old_extents);
 
     // seek again as punch_hole could invalidate ep
-    ep = o->extent_map.seek_lextent(offset);
+    auto ep = o->extent_map.seek_lextent(offset);
     auto begin = o->extent_map.extent_map.begin();
     auto prev_ep = end;
     if (ep != begin) {
@@ -13880,9 +13969,7 @@ void BlueStore::_do_write_data(
     _do_write_small(txc, c, o, head_offset, head_length, p, wctx);
   }
 
-  if (middle_length) {
-    _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
-  }
+  _do_write_big(txc, c, o, middle_offset, middle_length, p, wctx);
 
   if (tail_length) {
     _do_write_small(txc, c, o, tail_offset, tail_length, p, wctx);
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index f869f527bb9b..ba71ab0b69e8 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1939,6 +1939,26 @@ public:
     void dump(ceph::Formatter *f);
   };
 
+  struct BigDeferredWriteContext {
+    uint64_t off = 0;          // original logical offset
+    uint32_t b_off = 0;        // blob relative offset
+    uint32_t used = 0;
+    uint64_t head_read = 0;
+    uint64_t tail_read = 0;
+    PExtentVector res_extents;
+
+    inline uint64_t blob_aligned_len() const {
+      return used + head_read + tail_read;
+    }
+
+    bool can_defer(BlueStore::extent_map_t::iterator ep,
+                   uint64_t prefer_deferred_size,
+                   uint64_t block_size,
+                   uint64_t offset,
+                   uint64_t l);
+    bool apply_defer(BlueStore::extent_map_t::iterator ep);
+  };
+
   // --------------------------------------------------------
   // members
 private:
@@ -3061,6 +3081,14 @@ private:
     uint64_t offset, uint64_t length,
     ceph::buffer::list::iterator& blp,
     WriteContext *wctx);
+  void _do_write_big_apply_deferred(
+    TransContext* txc,
+    CollectionRef& c,
+    OnodeRef o,
+    BlueStore::extent_map_t::iterator ep,
+    BigDeferredWriteContext& dctx,
+    bufferlist::iterator& blp,
+    WriteContext* wctx);
   void _do_write_big(
     TransContext *txc,
     CollectionRef &c,
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index 90516760dfec..7dd561f719ee 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -6680,9 +6680,11 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) {
     return;
 
   size_t block_size = 4096;
+  // this will enable contiguous allocations
+  SetVal(g_conf(), "bluestore_allocator", "avl");
   StartDeferred(block_size);
-  SetVal(g_conf(), "bluestore_max_blob_size", "65536");
-  SetVal(g_conf(), "bluestore_prefer_deferred_size", "32768");
+  SetVal(g_conf(), "bluestore_max_blob_size", "131072");
+  SetVal(g_conf(), "bluestore_prefer_deferred_size", "65536");
 
   g_conf().apply_changes(nullptr);
 
@@ -6963,6 +6965,139 @@ TEST_P(StoreTestSpecificAUSize, DeferredOnBigOverwrite) {
     ASSERT_LE(statfs.allocated, (unsigned)block_size * 2);
   }
 
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    t.remove(cid, hoid2);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append(std::string(block_size * 32, 'a'));
+
+    // this will create two 128K aligned blobs
+    t.write(cid, hoid, 0, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    t.write(cid, hoid, bl.length(), bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 10u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 4u);
+
+  // check whether an overwrite (less than prefer_deferred_size) partially overlapping two adjacent blobs goes
+  // deferred
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append(std::string(block_size * 3, 'b'));
+
+    t.write(cid, hoid, 0x20000 - block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 11u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 6u);
+
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 0, 0x20000 - block_size, bl);
+    ASSERT_EQ(r, 0x20000 - block_size);
+    expected.append(string(r, 'a'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+    expected.clear();
+
+    r = store->read(ch, hoid, 0x20000 - block_size, block_size * 3, bl);
+    ASSERT_EQ(r, 3 * block_size);
+    expected.append(string(r, 'b'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+    expected.clear();
+
+    r = store->read(ch, hoid, 0x20000 + 2 * block_size, block_size * 30, bl);
+    ASSERT_EQ(r, 30 * block_size);
+    expected.append(string(r, 'a'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+    expected.clear();
+  }
+
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
+  }
+
+  // check whether an overwrite (larger than prefer_deferred_size) partially
+  // overlapping two adjacent blobs goes deferred
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append(std::string(block_size * 30, 'c'));
+
+    t.write(cid, hoid, 0x10000 + block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  sleep(2);
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 12u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 8u);
+
+  {
+    bufferlist bl, expected;
+    r = store->read(ch, hoid, 0, 0x11000, bl);
+    ASSERT_EQ(r, 0x11000);
+    expected.append(string(r, 'a'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+    expected.clear();
+
+    r = store->read(ch, hoid, 0x11000, block_size * 30, bl);
+    ASSERT_EQ(r, block_size * 30);
+    expected.append(string(r, 'c'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+    expected.clear();
+
+    r = store->read(ch, hoid, block_size * 47, 0x10000 + block_size, bl);
+    ASSERT_EQ(r, 0x10000 + block_size);
+    expected.append(string(r, 'a'));
+    ASSERT_TRUE(bl_eq(expected, bl));
+    expected.clear();
+  }
+
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
+  }
+
+  // check whether an overwrite (2 * prefer_deferred_size) partially
+  // overlapping two adjacent blobs goes non-deferred if one of the parts is
+  // above prefer_deferred_size
+  {
+    ObjectStore::Transaction t;
+    bufferlist bl;
+    bl.append(std::string(block_size * 30, 'e'));
+
+    t.write(cid, hoid, 0x20000 - block_size, bl.length(), bl, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  sleep(2);
+  ASSERT_EQ(logger->get(l_bluestore_write_big), 13u);
+  ASSERT_EQ(logger->get(l_bluestore_write_big_deferred), 8u);
+
+  {
+    struct store_statfs_t statfs;
+    int r = store->statfs(&statfs);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(statfs.data_stored, (unsigned)block_size * 64);
+    ASSERT_LE(statfs.allocated, (unsigned)block_size * 64);
+  }
+
   {
     ObjectStore::Transaction t;
     t.remove(cid, hoid);
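
The alignment math in can_defer() above is compact enough to miss. Below is a minimal standalone sketch of it, not Ceph code: p2phase/p2nphase are re-implemented here to mirror the helpers in Ceph's include/intarith.h, and the chunk size and offsets are hypothetical sample values.

  // Standalone sketch of the chunk-alignment arithmetic used by can_defer():
  // an unaligned overwrite is padded out to the blob's chunk boundaries and
  // the padded span becomes one deferred write.
  #include <cassert>
  #include <cstdint>
  #include <iostream>

  // bytes from the previous 'align' boundary down to x (align is a power of two)
  static uint64_t p2phase(uint64_t x, uint64_t align) {
    return x & (align - 1);
  }
  // bytes from x up to the next 'align' boundary (0 if already aligned)
  static uint64_t p2nphase(uint64_t x, uint64_t align) {
    return (align - (x & (align - 1))) & (align - 1);
  }

  int main() {
    const uint64_t chunk_size = 0x1000; // hypothetical blob chunk size (4 KiB)
    uint64_t b_off = 0x2800;            // blob-relative start of the overwrite
    const uint64_t used = 0x3000;       // new bytes landing in this blob

    // pad to chunk boundaries: read head_read bytes before the write and
    // tail_read bytes after it, then submit one aligned deferred write
    uint64_t head_read = p2phase(b_off, chunk_size);         // 0x800
    uint64_t tail_read = p2nphase(b_off + used, chunk_size); // 0x800
    b_off -= head_read;                                      // now 0x2000

    uint64_t blob_aligned_len = used + head_read + tail_read; // 0x4000
    assert(b_off % chunk_size == 0);
    assert(blob_aligned_len % chunk_size == 0);

    std::cout << std::hex << "deferred span 0x" << b_off << "~0x"
              << blob_aligned_len << " (head 0x" << head_read
              << ", tail 0x" << tail_read << ")\n";
    return 0;
  }

can_defer() then accepts the padded span only if it still fits within prefer_deferred_size, stays inside the blob's on-disk length, and is fully allocated.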
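The head/tail split in _do_write_big can be illustrated the same way. The sketch below is again standalone and hypothetical; it deliberately ignores the mutability, chunk-alignment and allocation checks that can_defer()/apply_defer() perform, and only mirrors the headline rule: a write no larger than twice prefer_deferred_size that lands on two adjacent blobs is deferred when each per-blob portion is itself small enough. The constants match the new test case.

  // Standalone sketch (hypothetical values) of the head/tail split decision.
  #include <algorithm>
  #include <cstdint>
  #include <iostream>

  int main() {
    const uint64_t prefer_deferred_size = 0x10000; // 64 KiB, as in the test
    const uint64_t blob_boundary = 0x20000; // two adjacent 128 KiB blobs meet here

    // overwrite spanning the boundary: 3 blocks starting one block before it
    const uint64_t offset = 0x1f000;
    const uint64_t length = 0x3000;

    bool will_defer = false;
    uint64_t head = 0, tail = 0;
    if (length <= 2 * prefer_deferred_size) {
      head = std::min(length, blob_boundary - offset); // bytes in the first blob
      tail = length - head;                            // remainder in the second
      // each per-blob portion must itself be deferrable
      will_defer = head <= prefer_deferred_size && tail <= prefer_deferred_size;
    }
    std::cout << std::hex << std::boolalpha << "head 0x" << head
              << ", tail 0x" << tail << ", defer=" << will_defer << "\n";
    // prints: head 0x1000, tail 0x2000, defer=true
    return 0;
  }

This is why the first overwrite in the test bumps l_bluestore_write_big_deferred by two, while the last one (head 0x1000, but tail 0x1d000 above prefer_deferred_size) is not deferred at all.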