]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore/bluefs: Simplify flush procedure
authorAdam Kupczyk <akupczyk@ibm.com>
Thu, 27 Nov 2025 07:56:02 +0000 (07:56 +0000)
committerAdam Kupczyk <akupczyk@ibm.com>
Fri, 22 May 2026 10:51:44 +0000 (10:51 +0000)
Refactor FileWriter:
1) add get_flush_offset(), has_unflushed_data()
2) renamed flush_buffer()->get_flush_buffer()
3) refactored logic of flush:
   - removed bufferlist tail
   - use buffer to store data that has to be reused

Simplify logic of flushing data to disk.

Signed-off-by: Adam Kupczyk <akupczyk@ibm.com>
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h

index c9f896c9de0f94af70d402aae142f1c96d816f1e..aafb47eb5544c566a3a5a28632e9bf94c5c5ae75 100644 (file)
@@ -3812,45 +3812,32 @@ int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to)
   return 0;
 }
 
-ceph::bufferlist BlueFS::FileWriter::flush_buffer(
-  CephContext* const cct,
-  const bool partial,
-  const unsigned length,
-  const bluefs_super_t& super)
+ceph::bufferlist BlueFS::FileWriter::get_flush_buffer(
+  CephContext* cct,
+  uint64_t want_end)
 {
   ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
-  ceph::bufferlist bl;
-  if (partial) {
-    tail_block.splice(0, tail_block.length(), &bl);
-  }
-  dout(20) << __func__ << " tail is" << std::hex << bl.length() << dendl;
-  ceph_assert(length >= bl.length());
-  const auto remaining_len = length - bl.length();
-  buffer.splice(0, remaining_len, &bl);
-  if (buffer.length()) {
-    dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
-             << " unflushed" << dendl;
-  }
-  // Append padding to fill block
-  unsigned tail = bl.length() & ~super.block_mask();
-  if (tail) {
-    unsigned padding_len = super.block_size - tail;
-    dout(20) << __func__ << " caching tail of 0x"
-             << std::hex << tail
-             << " and padding block with 0x" << padding_len
-             << " buffer.length() " << buffer.length()
-             << std::dec << dendl;
+  ceph_assert(get_pos() < want_end);
+  ceph::bufferlist bl_to_disk;
+  uint64_t data_end = get_pos() + get_buffer_length();
+  uint64_t io_end = p2roundup<uint64_t>(
+    std::min(want_end, data_end), super_block_size);
+  uint64_t io_size = io_end - buffer_pos;
+  if (io_end <= data_end) {
+    // only partial flush
+    buffer.splice(0, io_size, &bl_to_disk);
+    pos = want_end;
+    buffer_pos += io_size;
+  } else {
+    // need to pad with zeros
+    ceph_assert(io_end - data_end < super_block_size);
+    uint64_t tail = p2phase<uint64_t>(data_end, super_block_size);
     // We need to go through the `buffer_appender` to get a chance to
     // preserve in-memory contiguity and not mess with the alignment.
     // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
-    buffer_appender.append_zero(padding_len);
-    buffer.splice(buffer.length() - padding_len, padding_len, &bl);
-    // Deep copy the tail here. This allows to avoid costlier copy on
-    // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
-    // of memory allocations.
-    // The alternative approach would be to place the entire tail and
-    // padding on a dedicated, 4 KB long memory chunk. This shouldn't
-    // trigger the rebuild while still being less expensive.
+    buffer_appender.append_zero(super_block_size - tail);
+    // now aligned
+    buffer.splice(0, io_size, &bl_to_disk);
     if (file->envelope_mode() &&
       buffer.get_append_buffer_unused_tail_length() < tail + File::envelope_t::head_size()) {
       // Envelope mode header must completely fit in single buffer::ptr,
@@ -3862,12 +3849,14 @@ ceph::bufferlist BlueFS::FileWriter::flush_buffer(
       // pages. The size is min 2 * super.block_size so header will fit.
       buffer.clear();
     }
-    buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
-    buffer.splice(buffer.length() - tail, tail, &tail_block);
-  } else {
-    tail_block.clear();
+    // append (duplicate) tail for next flush
+    buffer_appender.substr_of(bl_to_disk, (data_end - tail) - buffer_pos, tail);
+
+    pos = want_end;
+    buffer_pos += io_size - super_block_size;
+    ceph_assert(buffer.length() < super_block_size);
   }
-  return bl;
+  return bl_to_disk;
 }
 
 int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
@@ -4009,19 +3998,8 @@ int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool bu
     ceph_assert(ceph_mutex_is_locked(h->lock));
     ceph_assert(ceph_mutex_is_locked(h->file->lock));
   }
-  uint64_t x_off = 0;
-  auto p = h->file->fnode.seek(offset, &x_off);
-  ceph_assert(p != h->file->fnode.extents.end());
-  dout(20) << __func__ << " in " << *p << " x_off 0x"
-           << std::hex << x_off << std::dec << dendl;
 
-  unsigned partial = x_off & ~super.block_mask();
-  if (partial) {
-    dout(20) << __func__ << " using partial tail 0x"
-             << std::hex << partial << std::dec << dendl;
-    x_off -= partial;
-    offset -= partial;
-    length += partial;
+  if (h->has_flushed_data()) {
     dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
     for (auto p : h->iocv) {
       if (p) {
@@ -4029,12 +4007,16 @@ int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool bu
       }
     }
   }
-
-  auto bl = h->flush_buffer(cct, partial, length, super);
-  ceph_assert(bl.length() >= length);
-  h->set_pos(offset + length);
+  uint64_t write_offset = h->get_flush_offset();
+  bufferlist bl = h->get_flush_buffer(cct, offset + length);
   length = bl.length();
 
+  uint64_t x_off = 0;
+  auto p = h->file->fnode.seek(write_offset, &x_off);
+  ceph_assert(p != h->file->fnode.extents.end());
+  dout(20) << __func__ << " in " << *p << " x_off 0x"
+           << std::hex << x_off << std::dec << dendl;
+
   logger->inc(l_bluefs_write_count, 1);
   logger->inc(l_bluefs_write_bytes, length);
 
index 16a2555e7e2e1595bc34d70e320e86c1440fe9aa..eb21c5134cb861d7875e8fcebd2338534b5af449 100644 (file)
@@ -413,21 +413,34 @@ public:
       return pos;
     }
     void set_pos(uint64_t new_pos) {
+      ceph_assert(buffer.length() == 0);
+      ceph_assert(p2aligned<uint32_t>(new_pos, super_block_size));
       pos = new_pos;
+      buffer_pos = pos;
     }
   private:
-    uint64_t pos = 0;        ///< offset of data in file
-    ceph::buffer::list buffer;      ///< new data to write (at end of file)
-    ceph::buffer::list tail_block;  ///< existing partial block at end of file, if any
+    uint64_t pos = 0;          ///< offset of unflushed data
+    ceph::buffer::list buffer; ///< new data to write (at end of file)
+                               ///  includes unaligned tail from last flush
+    uint32_t super_block_size;
+    uint64_t buffer_pos = 0;   ///< offset of buffer in file
   public:
+    // does not take into account part of `buffer` already flushed to disk
     unsigned get_buffer_length() const {
-      return buffer.length();
+      return buffer.length() - (pos - buffer_pos);
     }
-    ceph::bufferlist flush_buffer(
+    // indicates that a part of buffer has already been flushed to disk
+    bool has_flushed_data() {
+      return pos != buffer_pos;
+    }
+    // offset (in file) of `buffer` to flush
+    uint64_t get_flush_offset() {
+      return buffer_pos;
+    }
+    // create bufferlist to write to disk, modify buffer
+    ceph::bufferlist get_flush_buffer(
       CephContext* cct,
-      const bool partial,
-      const unsigned length,
-      const bluefs_super_t& super);
+      uint64_t flush_end);
     ceph::buffer::list::page_aligned_appender buffer_appender;  //< for const char* only
     bufferlist::contiguous_filler envelope_head_filler;
   public:
@@ -440,6 +453,7 @@ public:
 
     FileWriter(FileRef f, unsigned super_block_size)
       : file(std::move(f))
+      , super_block_size(super_block_size)
       , buffer_appender(buffer.get_page_aligned_appender(
         std::max<uint64_t>(g_conf()->bluefs_alloc_size, 2 * super_block_size) / CEPH_PAGE_SIZE))
       , envelope_head_filler() {
@@ -494,7 +508,7 @@ public:
     }
 
     uint64_t get_effective_write_pos() {
-      return pos + buffer.length();
+      return buffer_pos + buffer.length();
     }
 
   };