]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/zns: advance write pointer before writing tail-info.
authorAravind Ramesh <Aravind.Ramesh@wdc.com>
Tue, 26 Jul 2022 09:52:17 +0000 (15:22 +0530)
committerAravind <aravind.ramesh@wdc.com>
Fri, 2 Sep 2022 05:06:56 +0000 (10:36 +0530)
SegmentAllocator::close_segment() writes tail information to a
segment before closing the segment, and this is written at the
end of segment. However, for ZNS SSDs, the writes have to always happen
at write pointer, so writing tail info at the end of a zone fails if
the WP is not at the offset requested by close_segment().

If the write pointer is not at lba where the tail information is written,
then advance write pointer by writing zeroes to the zone from it's current
write pointer. Then write the tail information at the end of zone.

Added advance_wp() function which advances the write pointer and then write
tail information, in case of ZNS devices but for a regular device it
continues to write at the end of segment.

Do close_segment() call after writing tail information, closing a segment
first and then writing tail information can cause potential race conditions
on a zns backed segment.

Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
src/crimson/os/seastore/journal/segment_allocator.cc
src/crimson/os/seastore/segment_manager.h
src/crimson/os/seastore/segment_manager/block.cc
src/crimson/os/seastore/segment_manager/block.h
src/crimson/os/seastore/segment_manager/ephemeral.cc
src/crimson/os/seastore/segment_manager/ephemeral.h
src/crimson/os/seastore/segment_manager/zns.cc
src/crimson/os/seastore/segment_manager/zns.h

index 28f7a30cfe556551c0432de07335608d9785ab2e..4607d056586f2a702aa21b87516ecda2a93673e1 100644 (file)
@@ -219,7 +219,6 @@ SegmentAllocator::close_segment()
   // Note: make sure no one can access the current segment once closing
   auto seg_to_close = std::move(current_segment);
   auto close_segment_id = seg_to_close->get_segment_id();
-  segment_provider.close_segment(close_segment_id);
   auto close_seg_info = segment_provider.get_seg_info(close_segment_id);
   ceph_assert((close_seg_info.modify_time == NULL_TIME &&
                close_seg_info.num_extents == 0) ||
@@ -247,17 +246,25 @@ SegmentAllocator::close_segment()
   bl.append(bp);
 
   assert(bl.length() == sm_group.get_rounded_tail_length());
-  return seg_to_close->write(
-    sm_group.get_segment_size() - sm_group.get_rounded_tail_length(),
-    bl
-  ).safe_then([seg_to_close=std::move(seg_to_close)] {
-    return seg_to_close->close();
+
+  auto p_seg_to_close = seg_to_close.get();
+  return p_seg_to_close->advance_wp(
+    sm_group.get_segment_size() - sm_group.get_rounded_tail_length()
+  ).safe_then([this, FNAME, bl=std::move(bl), p_seg_to_close]() mutable {
+    DEBUG("Writing tail info to segment {}", p_seg_to_close->get_segment_id());
+    return p_seg_to_close->write(
+      sm_group.get_segment_size() - sm_group.get_rounded_tail_length(),
+      std::move(bl));
+  }).safe_then([p_seg_to_close] {
+    return p_seg_to_close->close();
+  }).safe_then([this, seg_to_close=std::move(seg_to_close)] {
+    segment_provider.close_segment(seg_to_close->get_segment_id());
   }).handle_error(
     close_segment_ertr::pass_further{},
-    crimson::ct_error::assert_all{
-      "Invalid error in SegmentAllocator::close_segment"
-    }
-  );
+    crimson::ct_error::assert_all {
+    "Invalid error in SegmentAllocator::close_segment"
+  });
+
 }
 
 RecordBatch::add_pending_ret
index b565125c50bb7b27ba9e1aaf6ce5011f3554ff2b..e14ef0d8d07f0a809ef4e02a7ef8605e0eeec001 100644 (file)
@@ -128,6 +128,16 @@ public:
   virtual write_ertr::future<> write(
     seastore_off_t offset, ceph::bufferlist bl) = 0;
 
+  /**
+   * advance_wp
+   *
+   * advance the segment write pointer,
+   * needed when writing at wp is strictly implemented. ex: ZNS backed segments
+   * @param offset: advance write pointer till the given offset
+   */
+  virtual write_ertr::future<> advance_wp(
+    seastore_off_t offset) = 0;
+
   virtual ~Segment() {}
 };
 using SegmentRef = boost::intrusive_ptr<Segment>;
index e25feadc72bbdeb462ef8a47a78a2d90235f7781..7b4b004dd95e70171b4d77684ea5069ff8c40ddf 100644 (file)
@@ -421,6 +421,11 @@ Segment::write_ertr::future<> BlockSegment::write(
   return manager.segment_write(paddr, bl);
 }
 
+Segment::write_ertr::future<> BlockSegment::advance_wp(
+  seastore_off_t offset) {
+  return write_ertr::now();
+}
+
 Segment::close_ertr::future<> BlockSegmentManager::segment_close(
     segment_id_t id, seastore_off_t write_pointer)
 {
index 6150f8c177dd9490d737ed2490aef958b5d88e12..765d78e881fa8b95b8c62f1dc2cfb8c1b05d2f92 100644 (file)
@@ -97,6 +97,7 @@ public:
   seastore_off_t get_write_ptr() const final { return write_pointer; }
   close_ertr::future<> close() final;
   write_ertr::future<> write(seastore_off_t offset, ceph::bufferlist bl) final;
+  write_ertr::future<> advance_wp(seastore_off_t offset) final;
 
   ~BlockSegment() {}
 };
index 919155bebd1658c2a604ec4e3a59a2bb2c716be5..6a97a5130336933bc32c826eb22d85d4bd30d274 100644 (file)
@@ -87,6 +87,12 @@ Segment::write_ertr::future<> EphemeralSegment::write(
   return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
 }
 
+Segment::write_ertr::future<> EphemeralSegment::advance_wp(
+  seastore_off_t offset)
+{
+  return write_ertr::now();
+}
+
 Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id)
 {
   auto s_id = id.device_segment_id();
index e282943626f1f7918809df08a3dc9cc6d8d3f438..7f69e60b40e10ae6e5a9d9e59ff6441eb8961c86 100644 (file)
@@ -48,6 +48,7 @@ public:
   seastore_off_t get_write_ptr() const final { return write_pointer; }
   close_ertr::future<> close() final;
   write_ertr::future<> write(seastore_off_t offset, ceph::bufferlist bl) final;
+  write_ertr::future<> advance_wp(seastore_off_t offset) final;
 
   ~EphemeralSegment() {}
 };
index 95c35b727317737610eacb9d451b75d0a9a1de2e..99b56d09c0948a23dc19930587be794022097929 100644 (file)
@@ -14,6 +14,8 @@ SET_SUBSYS(seastore_device);
 
 #define SECT_SHIFT     9
 #define RESERVED_ZONES         1
+// limit the max padding buf size to 1MB
+#define MAX_PADDING_SIZE 1048576
 
 namespace crimson::os::seastore::segment_manager::zns {
 
@@ -428,7 +430,7 @@ ZNSSegmentManager::open_ertr::future<SegmentRef> ZNSSegmentManager::open(
       );
     }
   ).safe_then([=] {
-    DEBUG("segment open successful");
+    DEBUG("segment {}, open successful", id);
     return open_ertr::future<SegmentRef>(
       open_ertr::ready_future_marker{},
       SegmentRef(new ZNSSegment(*this, id))
@@ -591,9 +593,9 @@ Segment::write_ertr::future<> ZNSSegment::write(
 {
   LOG_PREFIX(ZNSSegment::write);
   if (offset != write_pointer || offset % manager.metadata.block_size != 0) {
-    ERROR("Invalid segment write on segment {} to offset {}",
-      id,
-      offset);
+    ERROR("Segment offset and zone write pointer mismatch. "
+          "segment {} segment-offset {} write pointer {}",
+          id, offset, write_pointer);
     return crimson::ct_error::invarg::make();
   }
   if (offset + bl.length() > manager.metadata.segment_size)
@@ -603,4 +605,56 @@ Segment::write_ertr::future<> ZNSSegment::write(
   return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl);
 }
 
+Segment::write_ertr::future<> ZNSSegment::write_padding_bytes(
+  size_t padding_bytes)
+{
+  LOG_PREFIX(ZNSSegment::write_padding_bytes);
+  DEBUG("Writing {} padding bytes to segment {} at wp {}",
+        padding_bytes, id, write_pointer);
+
+  return crimson::repeat([FNAME, padding_bytes, this] () mutable {
+    size_t bufsize = 0;
+    if (padding_bytes >= MAX_PADDING_SIZE) {
+      bufsize = MAX_PADDING_SIZE;
+    } else {
+      bufsize = padding_bytes;
+    }
+
+    padding_bytes -= bufsize;
+    bufferptr bp(ceph::buffer::create_page_aligned(bufsize));
+    bp.zero();
+    bufferlist padd_bl;
+    padd_bl.append(bp);
+    return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() {
+      if (padding_bytes == 0) {
+        return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::yes);
+      } else {
+        return write_ertr::make_ready_future<seastar::stop_iteration>(seastar::stop_iteration::no);
+      }
+    });
+  });
+}
+
+// Advance write pointer, to given offset.
+Segment::write_ertr::future<> ZNSSegment::advance_wp(
+  seastore_off_t offset)
+{
+  LOG_PREFIX(ZNSSegment::advance_wp);
+
+  DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
+  if (offset < write_pointer) {
+    return crimson::ct_error::invarg::make();
+  }
+
+  size_t padding_bytes = offset - write_pointer;
+
+  if (padding_bytes == 0) {
+    return write_ertr::now();
+  }
+
+  assert(padding_bytes % manager.metadata.block_size == 0);
+
+  return write_padding_bytes(padding_bytes);
+}
+
 }
index 6eb13ac2a957aa7737b714ff99a1a8d93be234a3..6b18f9506038f4ed8f72e13e63c4e7c2322e7ad6 100644 (file)
@@ -73,6 +73,7 @@ namespace crimson::os::seastore::segment_manager::zns {
     seastore_off_t get_write_ptr() const final { return write_pointer; }
     close_ertr::future<> close() final;
     write_ertr::future<> write(seastore_off_t offset, ceph::bufferlist bl) final;
+    write_ertr::future<> advance_wp(seastore_off_t offset) final;
 
     ~ZNSSegment() {}
   private:
@@ -80,6 +81,7 @@ namespace crimson::os::seastore::segment_manager::zns {
     ZNSSegmentManager &manager;
     const segment_id_t id;
     seastore_off_t write_pointer = 0;
+    write_ertr::future<> write_padding_bytes(size_t padding_bytes);
   };
 
   class ZNSSegmentManager final : public SegmentManager{