]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/os/seastore/zbd: Split iovs in case of bigger buffer sizes. 54062/head
author: Aravind Ramesh <aravind.ramesh@wdc.com>
Tue, 11 Jul 2023 04:09:36 +0000 (06:09 +0200)
committer: Matan Breizman <mbreizma@redhat.com>
Tue, 17 Oct 2023 16:21:29 +0000 (16:21 +0000)
In Seastore's ZBDSegmentManager, during long-running write workloads,
if the write buffer size exceeds the max buffer size (IOV_MAX, 16MB),
the write fails with an assertion:
ceph/src/include/buffer.h:1189 : In function 'void ceph::buffer::v15_2_0::list::prepare_iov(VectorT*) const [with VectorT = std::vector<iovec>]', ceph_assert(%s)

This case is already handled in BlockSegmentManager, which splits the
IO vectors into appropriately sized chunks so that the assert does not trigger.

Update ZBDSegmentManager to split the IO vectors into appropriately
sized chunks in the same way.

Signed-off-by: Aravind Ramesh <Aravind.Ramesh@wdc.com>
(cherry picked from commit c15c0de4886a839ffebfd22d8204e7d1ca8a65b2)

src/crimson/os/seastore/segment_manager/zbd.cc

index 4f8103d785f859a993b8a35e4e4160879db1a2e1..88521a947f86463287b136c122e8bfe53e9c6afb 100644 (file)
@@ -9,6 +9,7 @@
 #include "crimson/os/seastore/segment_manager/zbd.h"
 #include "crimson/common/config_proxy.h"
 #include "crimson/os/seastore/logging.h"
+#include "crimson/common/errorator-loop.h"
 #include "include/buffer.h"
 
 SET_SUBSYS(seastore_device);
@@ -269,36 +270,53 @@ static write_ertr::future<> do_write(
 }
 
 static write_ertr::future<> do_writev(
+  device_id_t device_id,
   seastar::file &device,
   uint64_t offset,
   bufferlist&& bl,
   size_t block_size)
 {
   LOG_PREFIX(ZBDSegmentManager::do_writev);
-  DEBUG("offset {} len {}",
-    offset,
-    bl.length());
+  DEBUG("{} offset {} len {}",
+    device_id_printer_t{device_id}, offset, bl.length());
   // writev requires each buffer to be aligned to the disks' block
   // size, we need to rebuild here
   bl.rebuild_aligned(block_size);
   
-  std::vector<iovec> iov;
-  bl.prepare_iov(&iov);
-  return device.dma_write(
-    offset,
-    std::move(iov)
-  ).handle_exception(
-    [FNAME](auto e) -> write_ertr::future<size_t> {
-      ERROR("dma_write got error {}",
-       e);
-      return crimson::ct_error::input_output_error::make();
-    }
-  ).then([bl=std::move(bl)/* hold the buf until the end of io */](size_t written)
-        -> write_ertr::future<> {
-    if (written != bl.length()) {
-      return crimson::ct_error::input_output_error::make();
-    }
-    return write_ertr::now();
+  return seastar::do_with(
+    bl.prepare_iovs(),
+    std::move(bl),
+    [&device, device_id, offset, FNAME](auto& iovs, auto& bl)
+  {
+    return write_ertr::parallel_for_each(
+      iovs,
+      [&device, device_id, offset, FNAME](auto& p)
+    {
+      auto off = offset + p.offset;
+      auto len = p.length;
+      auto& iov = p.iov;
+      DEBUG("{} poffset={}~{} dma_write ...",
+           device_id_printer_t{device_id},
+            off, len);
+      return device.dma_write(off, std::move(iov)
+      ).handle_exception(
+        [FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
+      {
+        ERROR("{} poffset={}~{} dma_write got error -- {}",
+             device_id_printer_t{device_id}, off, len, e);
+        return crimson::ct_error::input_output_error::make();
+      }).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
+        if (written != len) {
+          ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+               device_id_printer_t{device_id}, off, len, written);
+          return crimson::ct_error::input_output_error::make();
+        }
+        DEBUG("{} poffset={}~{} dma_write done",
+             device_id_printer_t{device_id},
+              off, len);
+        return write_ertr::now();
+      });
+    });
   });
 }
 
@@ -692,6 +710,7 @@ Segment::write_ertr::future<> ZBDSegmentManager::segment_write(
     bl.length());
   stats.data_write.increment(bl.length());
   return do_writev(
+    get_device_id(),
     device, 
     get_offset(addr), 
     std::move(bl),