os/bluestore: eradicate bufferlist rebuilds around FileWriter

author Radoslaw Zarzynski <rzarzyns@redhat.com>

Tue, 18 Aug 2020 13:53:49 +0000 (15:53 +0200)

committer Radoslaw Zarzynski <rzarzyns@redhat.com>

Fri, 21 Aug 2020 16:55:57 +0000 (18:55 +0200)
author Radoslaw Zarzynski <rzarzyns@redhat.com>
Tue, 18 Aug 2020 13:53:49 +0000 (15:53 +0200)
committer Radoslaw Zarzynski <rzarzyns@redhat.com>
Fri, 21 Aug 2020 16:55:57 +0000 (18:55 +0200)
diff --git a/src/include/buffer.h b/src/include/buffer.h

index 4df15cd4aae6a7927a2867667854fb675d671d5b..a07282ecee2dcef2d92f24c3547b9bc4e780f44e 100644 (file)
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -854,23 +854,8 @@ struct error_code;
           min_alloc(min_pages * CEPH_PAGE_SIZE),
           pos(nullptr), end(nullptr) {}
  
-      friend class list;
-
-    public:
-      ~page_aligned_appender() {
-       flush();
-      }
-
-      void flush() {
-       if (pos && pos != buffer.c_str()) {
-         size_t len = pos - buffer.c_str();
-         pbl->append(buffer, 0, len);
-         buffer.set_length(buffer.length() - len);
-         buffer.set_offset(buffer.offset() + len);
-       }
-      }
-
-      void append(const char *buf, size_t len) {
+      template <class Func>
+      void _append_common(size_t len, Func&& impl_f) {
         while (len > 0) {
           if (!pos) {
             size_t alloc = (len + CEPH_PAGE_SIZE - 1) & CEPH_PAGE_MASK;
@@ -885,9 +870,8 @@ struct error_code;
           if (l > (size_t)(end - pos)) {
             l = end - pos;
           }
-         memcpy(pos, buf, l);
+         impl_f(l, pos);
           pos += l;
-         buf += l;
           len -= l;
           if (pos == end) {
             pbl->append(buffer, 0, buffer.length());
@@ -895,6 +879,50 @@ struct error_code;
           }
         }
        }
+
+      friend class list;
+
+    public:
+      ~page_aligned_appender() {
+       flush();
+      }
+
+      void flush() {
+       if (pos && pos != buffer.c_str()) {
+         size_t len = pos - buffer.c_str();
+         pbl->append(buffer, 0, len);
+         buffer.set_length(buffer.length() - len);
+         buffer.set_offset(buffer.offset() + len);
+       }
+      }
+
+      void append(const char* buf, size_t entire_len) {
+        _append_common(entire_len, [buf] (const size_t chunk_len,
+                                          char* const dst) mutable {
+         memcpy(dst, buf, chunk_len);
+         buf += chunk_len;
+       });
+      }
+
+      void append_zero(size_t entire_len) {
+       _append_common(entire_len, [] (const size_t chunk_len,
+                                      char* const dst) {
+         memset(dst, '\0', chunk_len);
+       });
+      }
+
+      void substr_of(const list& bl, unsigned off, unsigned len) {
+       for (const auto& bptr : bl.buffers()) {
+         if (off >= bptr.length()) {
+           off -= bptr.length();
+           continue;
+         }
+         const auto round_size = std::min(bptr.length() - off, len);
+         append(bptr.c_str() + off, round_size);
+         len -= round_size;
+         off = 0;
+       }
+      }
      };
  
      page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) {
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc

index e10641a55f255a9c55400b1d09e91ab79087986d..9c9a3bd7b19bb100c7892b9dd00bac3c7b25e4cc 100644 (file)
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -2766,25 +2766,27 @@ ceph::bufferlist BlueFS::FileWriter::flush_buffer(
               << " unflushed" << dendl;
    }
    if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
+    const auto padding_len = super.block_size - tail;
      dout(20) << __func__ << " caching tail of 0x"
               << std::hex << tail
-             << " and padding block with 0x" << (super.block_size - tail)
+             << " and padding block with 0x" << padding_len
               << " buffer.length() " << buffer.length()
               << std::dec << dendl;
-    tail_block.substr_of(bl, bl.length() - tail, tail);
-    //Because class page_aligned_appender, bl.append_zero will create a new ptr.
-    //This make rebuild for direct-io. We split ptr into two parts which make only rebuild size < 4k.
-    if (bl.get_num_buffers() == 1) {
-      bufferptr buf = bl.buffers().front();
-      if (length > super.block_size && buf.is_aligned(super.block_size) && !buf.is_n_align_sized(super.block_size)) {
-       bl.clear();
-       bufferptr tmp(buf, length - tail, tail);
-       buf.set_length(length - tail);
-       bl.append(buf);
-       bl.append(tmp);
-      }
-    }
-    bl.append_zero(super.block_size - tail);
+    // We need to go through the `buffer_appender` to get a chance to
+    // preserve in-memory contiguity and not mess with the alignment.
+    // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
+    buffer_appender.append_zero(padding_len);
+    buffer_appender.flush();
+    buffer.splice(buffer.length() - padding_len, padding_len, &bl);
+    // Deep copy the tail here. This allows to avoid costlier copy on
+    // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
+    // of memory allocations.
+    // The alternative approach would be to place the entire tail and
+    // padding on a dedicated, 4 KB long memory chunk. This shouldn't
+    // trigger the rebuild while still being less expensive.
+    buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
+    buffer_appender.flush();
+    buffer.splice(buffer.length() - tail, tail, &tail_block);
    } else {
      tail_block.clear();
    }
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h

index ac471907910158025c50f28418a3b0fedc28c147..3d95d69fdea47cc9cb4d7fe0455f5fb63d90577c 100644 (file)
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -197,6 +197,10 @@ public:
        buffer.claim_append(bl);
      }
  
+    void append_zero(size_t len) {
+      buffer_appender.append_zero(len);
+    }
+
      uint64_t get_effective_write_pos() {
        buffer_appender.flush();
        return pos + buffer.length();
author	Radoslaw Zarzynski <rzarzyns@redhat.com>
	Tue, 18 Aug 2020 13:53:49 +0000 (15:53 +0200)
committer	Radoslaw Zarzynski <rzarzyns@redhat.com>
	Fri, 21 Aug 2020 16:55:57 +0000 (18:55 +0200)
src/include/buffer.h		patch \| blob \| history
src/os/bluestore/BlueFS.cc		patch \| blob \| history
src/os/bluestore/BlueFS.h		patch \| blob \| history