From: Igor Fedotov Date: Fri, 7 Apr 2017 18:32:22 +0000 (+0000) Subject: os/bluestore: refactor small write handling to reuse blob more effectively. X-Git-Tag: v12.0.2~92^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=6f6954250157f9f3344053c264763ac6edf11a9f;p=ceph.git os/bluestore: refactor small write handling to reuse blob more effectively. Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 2dad3552069aa..c59964c21596e 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -8703,7 +8703,7 @@ void BlueStore::_do_write_small( dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; assert(length < min_alloc_size); - uint64_t end = offset + length; + uint64_t end_offs = offset + length; logger->inc(l_bluestore_write_small); logger->inc(l_bluestore_write_small_bytes, length); @@ -8712,232 +8712,291 @@ void BlueStore::_do_write_small( blp.copy(length, bl); // Look for an existing mutable blob we can use. - // NB: Current approach prevents us from reusing blobs that might be extended - // but have all the extents prior to the offset. Don't care for now... + auto begin = o->extent_map.extent_map.begin(); + auto end = o->extent_map.extent_map.end(); auto ep = o->extent_map.seek_lextent(offset); - if (ep != o->extent_map.extent_map.begin()) { + if (ep != begin) { --ep; if (ep->blob_end() <= offset) { ++ep; } } - BlobRef b; + auto prev_ep = ep; + if (prev_ep != begin) { + --prev_ep; + } else { + prev_ep = end; // to avoid this extent check as it's a duplicate + } + auto max_bsize = MAX(wctx->target_blob_size, min_alloc_size); - while (ep != o->extent_map.extent_map.end()) { - if (ep->blob_start() >= end) { - break; - } - b = ep->blob; - if (!b->get_blob().is_mutable()) { - dout(20) << __func__ << " ignoring immutable " << *b << dendl; - ++ep; - continue; - } - if (ep->logical_offset % min_alloc_size != - ep->blob_offset % min_alloc_size) { - dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl; - ++ep; - continue; - } - uint64_t bstart = ep->blob_start(); - dout(20) << __func__ << " considering " << *b - << " bstart 0x" << std::hex << bstart << std::dec << dendl; - - // can we pad our head/tail out with zeros? - uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); - uint64_t head_pad = P2PHASE(offset, chunk_size); - uint64_t tail_pad = P2NPHASE(end, chunk_size); - if (head_pad || tail_pad) { - o->extent_map.fault_range(db, offset - head_pad, - length + head_pad + tail_pad); - } - if (head_pad && - o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) { - head_pad = 0; - } - - if (tail_pad && o->extent_map.has_any_lextents(end, tail_pad)) { - tail_pad = 0; - } - - bufferlist padded = bl; - if (head_pad) { - bufferlist z; - z.append_zero(head_pad); - z.claim_append(padded); - padded.claim(z); - } - if (tail_pad) { - padded.append_zero(tail_pad); - } - if (head_pad || tail_pad) { - dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad - << " tail 0x" << tail_pad << std::dec << dendl; - logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad); - } - - // direct write into unused blocks of an existing mutable blob? 
- uint64_t b_off = offset - head_pad - bstart; - uint64_t b_len = length + head_pad + tail_pad; - if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && - b->get_blob().get_ondisk_length() >= b_off + b_len && - b->get_blob().is_unused(b_off, b_len) && - b->get_blob().is_allocated(b_off, b_len)) { - dout(20) << __func__ << " write to unused 0x" << std::hex - << b_off << "~" << b_len - << " pad 0x" << head_pad << " + 0x" << tail_pad - << std::dec << " of mutable " << *b << dendl; - _buffer_cache_write(txc, b, b_off, padded, - wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); - - if (!g_conf->bluestore_debug_omit_block_device_write) { - if (b_len <= prefer_deferred_size) { - dout(20) << __func__ << " deferring small 0x" << std::hex - << b_len << std::dec << " unused write via deferred" << dendl; + auto min_off = offset >= max_bsize ? offset - max_bsize : 0; + uint32_t alloc_len = min_alloc_size; + auto offset0 = P2ALIGN(offset, alloc_len); + + bool any_change; + + // search suitable extent in both forward and reverse direction in + // [offset - target_max_blob_size, offset + target_max_blob_size] range + // then check if blob can be reused via try_reuse_blob func or apply + // direct/deferred write (the latter for extents including or higher + // than 'offset' only). + do { + any_change = false; + + if (ep != end && ep->logical_offset < offset + max_bsize) { + BlobRef b = ep->blob; + auto bstart = ep->blob_start(); + dout(20) << __func__ << " considering " << *b + << " bstart 0x" << std::hex << bstart << std::dec << dendl; + if (bstart >= end_offs) { + dout(20) << __func__ << " ignoring distant " << *b << dendl; + } else if (!b->get_blob().is_mutable()) { + dout(20) << __func__ << " ignoring immutable " << *b << dendl; + } else if (ep->logical_offset % min_alloc_size != + ep->blob_offset % min_alloc_size) { + dout(20) << __func__ << " ignoring offset-skewed " << *b << dendl; + } else { + uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); + // can we pad our head/tail out with zeros? + uint64_t head_pad, tail_pad; + head_pad = P2PHASE(offset, chunk_size); + tail_pad = P2NPHASE(end_offs, chunk_size); + if (head_pad || tail_pad) { + o->extent_map.fault_range(db, offset - head_pad, + end_offs - offset + head_pad + tail_pad); + } + if (head_pad && + o->extent_map.has_any_lextents(offset - head_pad, chunk_size)) { + head_pad = 0; + } + if (tail_pad && o->extent_map.has_any_lextents(end_offs, tail_pad)) { + tail_pad = 0; + } + + uint64_t b_off = offset - head_pad - bstart; + uint64_t b_len = length + head_pad + tail_pad; + + // direct write into unused blocks of an existing mutable blob? + if ((b_off % chunk_size == 0 && b_len % chunk_size == 0) && + b->get_blob().get_ondisk_length() >= b_off + b_len && + b->get_blob().is_unused(b_off, b_len) && + b->get_blob().is_allocated(b_off, b_len)) { + bufferlist padded; + _apply_padding(head_pad, tail_pad, bl, padded); + + dout(20) << __func__ << " write to unused 0x" << std::hex + << b_off << "~" << b_len + << " pad 0x" << head_pad << " + 0x" << tail_pad + << std::dec << " of mutable " << *b << dendl; + _buffer_cache_write(txc, b, b_off, padded, + wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); + + if (!g_conf->bluestore_debug_omit_block_device_write) { + if (b_len <= prefer_deferred_size) { + dout(20) << __func__ << " deferring small 0x" << std::hex + << b_len << std::dec << " unused write via deferred" << dendl; + bluestore_deferred_op_t *op = _get_deferred_op(txc, o); + op->op = bluestore_deferred_op_t::OP_WRITE; + b->get_blob().map( + b_off, b_len, + [&](uint64_t offset, uint64_t length) { + op->extents.emplace_back(bluestore_pextent_t(offset, length)); + return 0; + }); + op->data = padded; + } else { + b->get_blob().map_bl( + b_off, padded, + [&](uint64_t offset, bufferlist& t) { + bdev->aio_write(offset, t, + &txc->ioc, wctx->buffered); + }); + } + } + b->dirty_blob().calc_csum(b_off, padded); + dout(20) << __func__ << " lex old " << *ep << dendl; + Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, + b, + &wctx->old_extents); + b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + logger->inc(l_bluestore_write_small_unused); + return; + } + // read some data to fill out the chunk? + uint64_t head_read = P2PHASE(b_off, chunk_size); + uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size); + if ((head_read || tail_read) && + (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) && + head_read + tail_read < min_alloc_size) { + b_off -= head_read; + b_len += head_read + tail_read; + + } else { + head_read = tail_read = 0; + } + + // chunk-aligned deferred overwrite? + if (b->get_blob().get_ondisk_length() >= b_off + b_len && + b_off % chunk_size == 0 && + b_len % chunk_size == 0 && + b->get_blob().is_allocated(b_off, b_len)) { + + bufferlist padded; + _apply_padding(head_pad, tail_pad, bl, padded); + + dout(20) << __func__ << " reading head 0x" << std::hex << head_read + << " and tail 0x" << tail_read << std::dec << dendl; + if (head_read) { + bufferlist head_bl; + int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read, + head_bl, 0); + assert(r >= 0 && r <= (int)head_read); + size_t zlen = head_read - r; + if (zlen) { + head_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + head_bl.claim_append(padded); + padded.swap(head_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + if (tail_read) { + bufferlist tail_bl; + int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read, + tail_bl, 0); + assert(r >= 0 && r <= (int)tail_read); + size_t zlen = tail_read - r; + if (zlen) { + tail_bl.append_zero(zlen); + logger->inc(l_bluestore_write_pad_bytes, zlen); + } + padded.claim_append(tail_bl); + logger->inc(l_bluestore_write_penalty_read_ops); + } + logger->inc(l_bluestore_write_small_pre_read); + bluestore_deferred_op_t *op = _get_deferred_op(txc, o); op->op = bluestore_deferred_op_t::OP_WRITE; - b->get_blob().map( + _buffer_cache_write(txc, b, b_off, padded, + wctx->buffered ? 
0 : Buffer::FLAG_NOCACHE); + + int r = b->get_blob().map( b_off, b_len, [&](uint64_t offset, uint64_t length) { op->extents.emplace_back(bluestore_pextent_t(offset, length)); return 0; }); - op->data = padded; - } else { - b->get_blob().map_bl( - b_off, padded, - [&](uint64_t offset, bufferlist& t) { - bdev->aio_write(offset, t, - &txc->ioc, wctx->buffered); - }); + assert(r == 0); + if (b->get_blob().csum_type) { + b->dirty_blob().calc_csum(b_off, padded); + } + op->data.claim(padded); + dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" + << b_len << std::dec << " of mutable " << *b + << " at " << op->extents << dendl; + Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, + b, &wctx->old_extents); + b->dirty_blob().mark_used(le->blob_offset, le->length); + txc->statfs_delta.stored() += le->length; + dout(20) << __func__ << " lex " << *le << dendl; + logger->inc(l_bluestore_write_small_deferred); + return; } - } - b->dirty_blob().calc_csum(b_off, padded); - dout(20) << __func__ << " lex old " << *ep << dendl; - Extent *le = o->extent_map.set_lextent(c, offset, b_off + head_pad, length, - b, - &wctx->old_extents); - b->dirty_blob().mark_used(le->blob_offset, le->length); - txc->statfs_delta.stored() += le->length; - dout(20) << __func__ << " lex " << *le << dendl; - logger->inc(l_bluestore_write_small_unused); - return; - } + //try to reuse blob + if (b->try_reuse_blob(min_alloc_size, + max_bsize, + offset0 - bstart, + &alloc_len)) { + assert(alloc_len == min_alloc_size); // expecting data always + // fit into reused blob + // Need to check for pending writes desiring to + // reuse the same pextent. The rationale is that during GC two chunks + // from garbage blobs(compressed?) can share logical space within the same + // AU. That's in turn might be caused by unaligned len in clone_range2. + // Hence the second write will fail in an attempt to reuse blob at + // do_alloc_write(). + if (!wctx->has_conflict(b, + offset0, + offset0 + alloc_len, + min_alloc_size)) { + + // we can't reuse pad_head/pad_tail since they might be truncated + // due to existent extents + uint64_t b_off = offset - bstart; + uint64_t b_off0 = b_off; + _pad_zeros(&bl, &b_off0, chunk_size); - // read some data to fill out the chunk? 
- uint64_t head_read = P2PHASE(b_off, chunk_size); - uint64_t tail_read = P2NPHASE(b_off + b_len, chunk_size); - if ((head_read || tail_read) && - (b->get_blob().get_ondisk_length() >= b_off + b_len + tail_read) && - head_read + tail_read < min_alloc_size) { - dout(20) << __func__ << " reading head 0x" << std::hex << head_read - << " and tail 0x" << tail_read << std::dec << dendl; - if (head_read) { - bufferlist head_bl; - int r = _do_read(c.get(), o, offset - head_pad - head_read, head_read, - head_bl, 0); - assert(r >= 0 && r <= (int)head_read); - size_t zlen = head_read - r; - if (zlen) { - head_bl.append_zero(zlen); - logger->inc(l_bluestore_write_pad_bytes, zlen); - } - b_off -= head_read; - b_len += head_read; - head_bl.claim_append(padded); - padded.swap(head_bl); - logger->inc(l_bluestore_write_penalty_read_ops); - } - if (tail_read) { - bufferlist tail_bl; - int r = _do_read(c.get(), o, offset + length + tail_pad, tail_read, - tail_bl, 0); - assert(r >= 0 && r <= (int)tail_read); - b_len += tail_read; - padded.claim_append(tail_bl); - size_t zlen = tail_read - r; - if (zlen) { - padded.append_zero(zlen); - logger->inc(l_bluestore_write_pad_bytes, zlen); + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (" << b_off0 << "~" << bl.length() << ")" + << " (" << b_off << "~" << length << ")" + << std::dec << dendl; + + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + false, false); + logger->inc(l_bluestore_write_small_unused); + return; + } } - logger->inc(l_bluestore_write_penalty_read_ops); } - logger->inc(l_bluestore_write_small_pre_read); - } - - // chunk-aligned deferred overwrite? - if (b->get_blob().get_ondisk_length() >= b_off + b_len && - b_off % chunk_size == 0 && - b_len % chunk_size == 0 && - b->get_blob().is_allocated(b_off, b_len)) { - bluestore_deferred_op_t *op = _get_deferred_op(txc, o); - op->op = bluestore_deferred_op_t::OP_WRITE; - _buffer_cache_write(txc, b, b_off, padded, - wctx->buffered ? 0 : Buffer::FLAG_NOCACHE); - - int r = b->get_blob().map( - b_off, b_len, - [&](uint64_t offset, uint64_t length) { - op->extents.emplace_back(bluestore_pextent_t(offset, length)); - return 0; - }); - assert(r == 0); - if (b->get_blob().csum_type) { - b->dirty_blob().calc_csum(b_off, padded); - } - op->data.claim(padded); - dout(20) << __func__ << " deferred write 0x" << std::hex << b_off << "~" - << b_len << std::dec << " of mutable " << *b - << " at " << op->extents << dendl; - Extent *le = o->extent_map.set_lextent(c, offset, offset - bstart, length, - b, &wctx->old_extents); - b->dirty_blob().mark_used(le->blob_offset, le->length); - txc->statfs_delta.stored() += le->length; - dout(20) << __func__ << " lex " << *le << dendl; - logger->inc(l_bluestore_write_small_deferred); - return; - } - uint32_t alloc_len = min_alloc_size; - auto offset0 = P2ALIGN(offset, alloc_len); - if (!head_read && !tail_read && - b->try_reuse_blob(min_alloc_size, - max_bsize, - offset0 - bstart, - &alloc_len)) { - assert(alloc_len == min_alloc_size); // expecting data always - // fit into reused blob - // Need to check for pending writes desiring to - // reuse the same pextent. The rationale is that during GC two chunks - // from garbage blobs(compressed?) can share logical space within the same - // AU. That's in turn might be caused by unaligned len in clone_range2. - // Hence the second write will fail in an attempt to reuse blob at - // do_alloc_write(). 
- if (!wctx->has_conflict(b, - offset0, - offset0 + alloc_len, - min_alloc_size)) { - uint64_t b_off = offset - bstart; - uint64_t b_off0 = b_off - head_pad; - dout(20) << __func__ << " reuse blob " << *b << std::hex - << " (" << b_off0 << "~" << padded.length() << ")" - << " (" << b_off << "~" << length << ")" - << std::dec << dendl; - - o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); - wctx->write(offset, b, alloc_len, b_off0, padded, b_off, length, - false, false); - logger->inc(l_bluestore_write_small_unused); - return; + ++ep; + any_change = true; + } // if (ep != end && ep->logical_offset < offset + max_bsize) + + // check extent for reuse in reverse order + if (prev_ep != end && prev_ep->logical_offset >= min_off) { + BlobRef b = prev_ep->blob; + auto bstart = prev_ep->blob_start(); + dout(20) << __func__ << " considering " << *b + << " bstart 0x" << std::hex << bstart << std::dec << dendl; + if (b->try_reuse_blob(min_alloc_size, + max_bsize, + offset0 - bstart, + &alloc_len)) { + assert(alloc_len == min_alloc_size); // expecting data always + // fit into reused blob + // Need to check for pending writes desiring to + // reuse the same pextent. The rationale is that during GC two chunks + // from garbage blobs(compressed?) can share logical space within the same + // AU. That's in turn might be caused by unaligned len in clone_range2. + // Hence the second write will fail in an attempt to reuse blob at + // do_alloc_write(). + if (!wctx->has_conflict(b, + offset0, + offset0 + alloc_len, + min_alloc_size)) { + + uint64_t chunk_size = b->get_blob().get_chunk_size(block_size); + uint64_t b_off = offset - bstart; + uint64_t b_off0 = b_off; + _pad_zeros(&bl, &b_off0, chunk_size); + + dout(20) << __func__ << " reuse blob " << *b << std::hex + << " (" << b_off0 << "~" << bl.length() << ")" + << " (" << b_off << "~" << length << ")" + << std::dec << dendl; + + o->extent_map.punch_hole(c, offset, length, &wctx->old_extents); + wctx->write(offset, b, alloc_len, b_off0, bl, b_off, length, + false, false); + logger->inc(l_bluestore_write_small_unused); + return; + } + } + if (prev_ep != begin) { + --prev_ep; + any_change = true; + } else { + prev_ep = end; // to avoid useless first extent re-check } - } - - - ++ep; - } + } // if (prev_ep != end && prev_ep->logical_offset >= min_off) + } while (any_change); // new blob. 
- b = c->new_blob(); - unsigned alloc_len = min_alloc_size; + + BlobRef b = c->new_blob(); uint64_t b_off = P2PHASE(offset, alloc_len); uint64_t b_off0 = b_off; _pad_zeros(&bl, &b_off0, block_size); @@ -10602,4 +10661,27 @@ void BlueStore::flush_cache() } coll_map.clear(); } + +void BlueStore::_apply_padding(uint64_t head_pad, + uint64_t tail_pad, + bufferlist& bl, + bufferlist& padded) +{ + padded = bl; + if (head_pad) { + bufferlist z; + z.append_zero(head_pad); + z.claim_append(padded); + padded.claim(z); + } + if (tail_pad) { + padded.append_zero(tail_pad); + } + if (head_pad || tail_pad) { + dout(20) << __func__ << " can pad head 0x" << std::hex << head_pad + << " tail 0x" << tail_pad << std::dec << dendl; + logger->inc(l_bluestore_write_pad_bytes, head_pad + tail_pad); + } +} + // =========================================== diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 8a8c8a8493bae..42cca7710b1c4 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1975,6 +1975,11 @@ private: return val1; } + void _apply_padding(uint64_t head_pad, + uint64_t tail_pad, + bufferlist& bl, + bufferlist& padded); + // -- ondisk version --- public: const int32_t latest_ondisk_format = 2; ///< our version
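
The heart of this change is the bidirectional scan: instead of walking the extent map forward only (the old code's "NB" comment noted it could not reuse a blob whose extents all lie before the write offset), _do_write_small() now also walks backwards, bounded to [offset - max_bsize, offset + max_bsize], before giving up and allocating a new blob. The standalone sketch below models just that control flow; Extent, ExtentMap, find_reuse_candidate() and the reusable() predicate are simplified stand-ins invented for illustration, not Ceph types — in the real code each forward candidate is additionally checked for a direct write into unused blocks or a chunk-aligned deferred overwrite before try_reuse_blob() is attempted.

    // Compact sketch (assumed, simplified) of the bidirectional candidate
    // search added to _do_write_small() by this commit.
    #include <cstdint>
    #include <functional>
    #include <map>

    struct Extent { uint64_t blob_start; };           // toy extent record
    using ExtentMap = std::map<uint64_t, Extent>;     // logical_offset -> extent

    // Scan outward from 'offset' in both directions, but only within
    // [offset - max_bsize, offset + max_bsize].  Returns a reusable extent,
    // or end() if the caller must fall back to a new blob.
    ExtentMap::iterator find_reuse_candidate(
        ExtentMap& em, uint64_t offset, uint64_t length, uint64_t max_bsize,
        const std::function<bool(const Extent&)>& reusable)
    {
      uint64_t end_offs = offset + length;
      uint64_t min_off  = offset >= max_bsize ? offset - max_bsize : 0;

      auto ep = em.lower_bound(offset);               // forward cursor
      auto prev_ep = (ep == em.begin()) ? em.end()    // reverse cursor;
                                        : std::prev(ep); // end() == exhausted
      bool any_change;
      do {
        any_change = false;

        // forward candidate (in the real code: unused-block write,
        // deferred overwrite, or try_reuse_blob)
        if (ep != em.end() && ep->first < offset + max_bsize) {
          if (ep->second.blob_start < end_offs && reusable(ep->second))
            return ep;
          ++ep;
          any_change = true;
        }

        // reverse candidate (considered for blob reuse only)
        if (prev_ep != em.end() && prev_ep->first >= min_off) {
          if (reusable(prev_ep->second))
            return prev_ep;
          if (prev_ep != em.begin()) {
            --prev_ep;
            any_change = true;
          } else {
            prev_ep = em.end();                       // reverse side exhausted
          }
        }
      } while (any_change);

      return em.end();                                // allocate a new blob
    }

Bounding both cursors to the target-blob-size window keeps the search short in the common case while still catching a preceding mutable blob that can be extended, which is exactly the case the old forward-only loop had to hand off to a brand-new blob.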