From: Radoslaw Zarzynski Date: Tue, 18 Aug 2020 13:53:49 +0000 (+0200) Subject: os/bluestore: eradicate bufferlist rebuilds around FileWriter X-Git-Tag: v16.1.0~780^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=353402d88cc8ff9fe20a8233a4cab26e06fd8106;p=ceph.git os/bluestore: eradicate bufferlist rebuilds around FileWriter This commits reworks the way how `FileWriter::flush_buffer()` complements the buffer with zeros when the requested flush length is not a multiply of block size. Originally, it was zeroing the `buffer` directly -- without an assistance from `page_aligned_appender` though which all regular appends flow. This mismatch was fragmenting `buffer` and leading to rebuilds in e.g. `KernelDevice::aio_write`. In the patch, `page_aligned_appender` gets `append_zero()` and `FileWriter` directs the filling-with-zeros through the appender, and thus allowing it to put the content contiguously in memory -- without creating a new `ptr_node`. Credits for detecting the problem belong to Jianpeng Ma . Thanks! Signed-off-by: Radoslaw Zarzynski --- diff --git a/src/include/buffer.h b/src/include/buffer.h index 4df15cd4aae..a07282ecee2 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -854,23 +854,8 @@ struct error_code; min_alloc(min_pages * CEPH_PAGE_SIZE), pos(nullptr), end(nullptr) {} - friend class list; - - public: - ~page_aligned_appender() { - flush(); - } - - void flush() { - if (pos && pos != buffer.c_str()) { - size_t len = pos - buffer.c_str(); - pbl->append(buffer, 0, len); - buffer.set_length(buffer.length() - len); - buffer.set_offset(buffer.offset() + len); - } - } - - void append(const char *buf, size_t len) { + template + void _append_common(size_t len, Func&& impl_f) { while (len > 0) { if (!pos) { size_t alloc = (len + CEPH_PAGE_SIZE - 1) & CEPH_PAGE_MASK; @@ -885,9 +870,8 @@ struct error_code; if (l > (size_t)(end - pos)) { l = end - pos; } - memcpy(pos, buf, l); + impl_f(l, pos); pos += l; - buf += l; len -= l; if (pos == end) { pbl->append(buffer, 0, buffer.length()); @@ -895,6 +879,50 @@ struct error_code; } } } + + friend class list; + + public: + ~page_aligned_appender() { + flush(); + } + + void flush() { + if (pos && pos != buffer.c_str()) { + size_t len = pos - buffer.c_str(); + pbl->append(buffer, 0, len); + buffer.set_length(buffer.length() - len); + buffer.set_offset(buffer.offset() + len); + } + } + + void append(const char* buf, size_t entire_len) { + _append_common(entire_len, [buf] (const size_t chunk_len, + char* const dst) mutable { + memcpy(dst, buf, chunk_len); + buf += chunk_len; + }); + } + + void append_zero(size_t entire_len) { + _append_common(entire_len, [] (const size_t chunk_len, + char* const dst) { + memset(dst, '\0', chunk_len); + }); + } + + void substr_of(const list& bl, unsigned off, unsigned len) { + for (const auto& bptr : bl.buffers()) { + if (off >= bptr.length()) { + off -= bptr.length(); + continue; + } + const auto round_size = std::min(bptr.length() - off, len); + append(bptr.c_str() + off, round_size); + len -= round_size; + off = 0; + } + } }; page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) { diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index e10641a55f2..9c9a3bd7b19 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -2766,25 +2766,27 @@ ceph::bufferlist BlueFS::FileWriter::flush_buffer( << " unflushed" << dendl; } if (const unsigned tail = bl.length() & ~super.block_mask(); tail) { + const auto padding_len = super.block_size - tail; dout(20) << __func__ << " caching tail of 0x" << std::hex << tail - << " and padding block with 0x" << (super.block_size - tail) + << " and padding block with 0x" << padding_len << " buffer.length() " << buffer.length() << std::dec << dendl; - tail_block.substr_of(bl, bl.length() - tail, tail); - //Because class page_aligned_appender, bl.append_zero will create a new ptr. - //This make rebuild for direct-io. We split ptr into two parts which make only rebuild size < 4k. - if (bl.get_num_buffers() == 1) { - bufferptr buf = bl.buffers().front(); - if (length > super.block_size && buf.is_aligned(super.block_size) && !buf.is_n_align_sized(super.block_size)) { - bl.clear(); - bufferptr tmp(buf, length - tail, tail); - buf.set_length(length - tail); - bl.append(buf); - bl.append(tmp); - } - } - bl.append_zero(super.block_size - tail); + // We need to go through the `buffer_appender` to get a chance to + // preserve in-memory contiguity and not mess with the alignment. + // Otherwise a costly rebuild could happen in e.g. `KernelDevice`. + buffer_appender.append_zero(padding_len); + buffer_appender.flush(); + buffer.splice(buffer.length() - padding_len, padding_len, &bl); + // Deep copy the tail here. This allows to avoid costlier copy on + // bufferlist rebuild in e.g. `KernelDevice` and minimizes number + // of memory allocations. + // The alternative approach would be to place the entire tail and + // padding on a dedicated, 4 KB long memory chunk. This shouldn't + // trigger the rebuild while still being less expensive. + buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail); + buffer_appender.flush(); + buffer.splice(buffer.length() - tail, tail, &tail_block); } else { tail_block.clear(); } diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index ac471907910..3d95d69fdea 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -197,6 +197,10 @@ public: buffer.claim_append(bl); } + void append_zero(size_t len) { + buffer_appender.append_zero(len); + } + uint64_t get_effective_write_pos() { buffer_appender.flush(); return pos + buffer.length();