]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
Revert "crimson/os/seastore: wait ool writes in DeviceSubmission phase"
authorYingxin Cheng <yingxin.cheng@intel.com>
Mon, 19 Aug 2024 01:48:28 +0000 (09:48 +0800)
committerYingxin Cheng <yingxin.cheng@intel.com>
Mon, 19 Aug 2024 01:53:31 +0000 (09:53 +0800)
This reverts commit c9e423facea79d42f0496264f267adee5d911b87.

The commit starts to submit OOL writes before submitting the journal
write, true, but it cannot guarantee that OOL writes finish before the
journal write.

Thus it is possible that during SeaStore restart, a journal record
appears valid but its dependent OOL records are partial written, which
leads to corruption.

Signed-off-by: Yingxin Cheng <yingxin.cheng@intel.com>
src/crimson/os/seastore/extent_placement_manager.cc
src/crimson/os/seastore/extent_placement_manager.h
src/crimson/os/seastore/journal/circular_bounded_journal.cc
src/crimson/os/seastore/journal/segmented_journal.cc
src/crimson/os/seastore/ordering_handle.h

index a958fbdfd028a0f9aca7b43fe6311fbf61db496e..1ac7c68484b1761ea02798fc58575202213e5df5 100644 (file)
@@ -28,7 +28,8 @@ SegmentedOolWriter::SegmentedOolWriter(
 {
 }
 
-void SegmentedOolWriter::write_record(
+SegmentedOolWriter::alloc_write_ertr::future<>
+SegmentedOolWriter::write_record(
   Transaction& t,
   record_t&& record,
   std::list<LogicalCachedExtentRef>&& extents,
@@ -62,20 +63,15 @@ void SegmentedOolWriter::write_record(
     extent_addr = extent_addr.as_seg_paddr().add_offset(
         extent->get_length());
   }
-  // t might be destructed inside write_future
-  auto write_future = seastar::with_gate(write_guard,
-    [this, FNAME, tid=t.get_trans_id(),
-     record_base=ret.record_base_regardless_md,
-     submit_fut=std::move(ret.future)]() mutable {
-    return std::move(submit_fut
-    ).safe_then([this, FNAME, tid, record_base](record_locator_t ret) {
-      TRACE("trans.{} {} finish {}=={}",
-            tid, segment_allocator.get_name(), ret, record_base);
-      // ool won't write metadata, so the paddrs must be equal
-      assert(ret.record_block_base == record_base.offset);
-    });
+  return std::move(ret.future
+  ).safe_then([this, FNAME, &t,
+               record_base=ret.record_base_regardless_md
+              ](record_locator_t ret) {
+    TRACET("{} finish {}=={}",
+           t, segment_allocator.get_name(), ret, record_base);
+    // ool won't write metadata, so the paddrs must be equal
+    assert(ret.record_block_base == record_base.offset);
   });
-  t.get_handle().add_write_future(std::move(write_future));
 }
 
 SegmentedOolWriter::alloc_write_iertr::future<>
@@ -112,15 +108,19 @@ SegmentedOolWriter::do_write(
       DEBUGT("{} extents={} submit {} extents and roll, unavailable ...",
              t, segment_allocator.get_name(),
              extents.size(), num_extents);
+      auto fut_write = alloc_write_ertr::now();
       if (num_extents > 0) {
         assert(record_submitter.check_action(record.size) !=
                action_t::ROLL);
-        write_record(
+        fut_write = write_record(
             t, std::move(record), std::move(pending_extents),
             true/* with_atomic_roll_segment */);
       }
       return trans_intr::make_interruptible(
-        record_submitter.roll_segment()
+        record_submitter.roll_segment(
+        ).safe_then([fut_write=std::move(fut_write)]() mutable {
+          return std::move(fut_write);
+        })
       ).si_then([this, &t, &extents] {
         return do_write(t, extents);
       });
@@ -151,12 +151,15 @@ SegmentedOolWriter::do_write(
       DEBUGT("{} extents={} submit {} extents ...",
              t, segment_allocator.get_name(),
              extents.size(), pending_extents.size());
-      write_record(t, std::move(record), std::move(pending_extents));
-      if (!extents.empty()) {
-        return do_write(t, extents);
-      } else {
-        return alloc_write_iertr::now();
-      }
+      return trans_intr::make_interruptible(
+        write_record(t, std::move(record), std::move(pending_extents))
+      ).si_then([this, &t, &extents] {
+        if (!extents.empty()) {
+          return do_write(t, extents);
+        } else {
+          return alloc_write_iertr::now();
+        }
+      });
     }
     // SUBMIT_NOT_FULL: evaluate the next extent
   }
@@ -166,8 +169,8 @@ SegmentedOolWriter::do_write(
          t, segment_allocator.get_name(),
          num_extents);
   assert(num_extents > 0);
-  write_record(t, std::move(record), std::move(pending_extents));
-  return alloc_write_iertr::now();
+  return trans_intr::make_interruptible(
+    write_record(t, std::move(record), std::move(pending_extents)));
 }
 
 SegmentedOolWriter::alloc_write_iertr::future<>
@@ -991,11 +994,13 @@ RandomBlockOolWriter::alloc_write_ool_extents(
   if (extents.empty()) {
     return alloc_write_iertr::now();
   }
-  do_write(t, extents);
-  return alloc_write_iertr::now();
+  return seastar::with_gate(write_guard, [this, &t, &extents] {
+    return do_write(t, extents);
+  });
 }
 
-void RandomBlockOolWriter::do_write(
+RandomBlockOolWriter::alloc_write_iertr::future<>
+RandomBlockOolWriter::do_write(
   Transaction& t,
   std::list<CachedExtentRef>& extents)
 {
@@ -1048,10 +1053,8 @@ void RandomBlockOolWriter::do_write(
     }
   }
 
-  // t might be destructed inside write_future
-  auto write_future = seastar::with_gate(write_guard,
-    [writes=std::move(writes)]() mutable {
-    return seastar::do_with(std::move(writes),
+  return trans_intr::make_interruptible(
+    seastar::do_with(std::move(writes),
       [](auto& writes) {
       return crimson::do_for_each(writes,
         [](auto& info) {
@@ -1062,9 +1065,8 @@ void RandomBlockOolWriter::do_write(
             "Invalid error when writing record"}
         );
       });
-    });
-  });
-  t.get_handle().add_write_future(std::move(write_future));
+    })
+  );
 }
 
 }
index 0f2d55ef04aeb5d99ee829aae1fb7dcf28de8b60..7c4110c053ef5060d1860c73e4ae4198997928e0 100644 (file)
@@ -115,7 +115,7 @@ private:
     Transaction& t,
     std::list<CachedExtentRef> &extent);
 
-  void write_record(
+  alloc_write_ertr::future<> write_record(
     Transaction& t,
     record_t&& record,
     std::list<LogicalCachedExtentRef> &&extents,
@@ -195,7 +195,7 @@ private:
     ceph::bufferptr bp;
     RandomBlockManager* rbm;
   };
-  void do_write(
+  alloc_write_iertr::future<> do_write(
     Transaction& t,
     std::list<CachedExtentRef> &extent);
 
index 4da70f72c4cd334edf120d8fbfa0d2ac5abdfaa6..9ee8b1b997f0ae39118a19e1c91802eec6504984 100644 (file)
@@ -97,9 +97,7 @@ CircularBoundedJournal::do_submit_record(
   auto submit_ret = record_submitter.submit(std::move(record));
   // submit_ret.record_base_regardless_md is wrong for journaling
   return handle.enter(write_pipeline->device_submission
-  ).then([&handle] {
-    return handle.take_write_future();
-  }).safe_then([submit_fut=std::move(submit_ret.future)]() mutable {
+  ).then([submit_fut=std::move(submit_ret.future)]() mutable {
     return std::move(submit_fut);
   }).safe_then([FNAME, this, &handle](record_locator_t result) {
     return handle.enter(write_pipeline->finalize
index 81e8c5a62c781f3e2bb3a211dd51260fc3efbc06..eca45f113c25c2f18c4f9c445346d8246d965964 100644 (file)
@@ -396,9 +396,7 @@ SegmentedJournal::do_submit_record(
     auto submit_ret = record_submitter.submit(std::move(record));
     // submit_ret.record_base_regardless_md is wrong for journaling
     return handle.enter(write_pipeline->device_submission
-    ).then([&handle] {
-      return handle.take_write_future();
-    }).safe_then([submit_fut=std::move(submit_ret.future)]() mutable {
+    ).then([submit_fut=std::move(submit_ret.future)]() mutable {
       return std::move(submit_fut);
     }).safe_then([FNAME, this, &handle](record_locator_t result) {
       return handle.enter(write_pipeline->finalize
index cfa8620587552104013a5bf3559fcc918b0e7f8c..8ab8442acd9f7acc6b8073beb6bc5ce556fa47b3 100644 (file)
@@ -122,11 +122,6 @@ struct OrderingHandle {
   std::unique_ptr<OperationProxy> op;
   seastar::shared_mutex *collection_ordering_lock = nullptr;
 
-  using write_ertr = crimson::errorator<
-      crimson::ct_error::input_output_error>;
-  // the pending writes that should complete at DeviceSubmission phase
-  write_ertr::future<> write_future = write_ertr::now();
-
   // in the future we might add further constructors / template to type
   // erasure while extracting the location of tracking events.
   OrderingHandle(std::unique_ptr<OperationProxy> op) : op(std::move(op)) {}
@@ -149,20 +144,6 @@ struct OrderingHandle {
     }
   }
 
-  void add_write_future(write_ertr::future<>&& fut) {
-    auto appended = std::move(write_future
-    ).safe_then([fut=std::move(fut)]() mutable {
-      return std::move(fut);
-    });
-    write_future = std::move(appended);
-  }
-
-  write_ertr::future<> take_write_future() {
-    auto ret = std::move(write_future);
-    write_future = write_ertr::now();
-    return ret;
-  }
-
   template <typename T>
   seastar::future<> enter(T &t) {
     return op->enter(t);
@@ -170,10 +151,6 @@ struct OrderingHandle {
 
   void exit() {
     op->exit();
-
-    auto ignore_writes = std::move(write_future);
-    std::ignore = ignore_writes;
-    write_future = write_ertr::now();
   }
 
   seastar::future<> complete() {
@@ -182,9 +159,6 @@ struct OrderingHandle {
 
   ~OrderingHandle() {
     maybe_release_collection_lock();
-
-    assert(write_future.available());
-    assert(!write_future.failed());
   }
 };