From da6f3c4523d8cbb14718fef9f7c994d9253d7cd0 Mon Sep 17 00:00:00 2001 From: Yingxin Cheng Date: Mon, 19 Aug 2024 09:48:28 +0800 Subject: [PATCH] Revert "crimson/os/seastore: wait ool writes in DeviceSubmission phase" This reverts commit c9e423facea79d42f0496264f267adee5d911b87. The commit starts to submit OOL writes before submitting the journal write, true, but it cannot guarantee that OOL writes finish before the journal write. Thus it is possible that during SeaStore restart, a journal record appears valid but its dependent OOL records are partial written, which leads to corruption. Signed-off-by: Yingxin Cheng --- .../os/seastore/extent_placement_manager.cc | 70 ++++++++++--------- .../os/seastore/extent_placement_manager.h | 4 +- .../journal/circular_bounded_journal.cc | 4 +- .../os/seastore/journal/segmented_journal.cc | 4 +- src/crimson/os/seastore/ordering_handle.h | 26 ------- 5 files changed, 40 insertions(+), 68 deletions(-) diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc index a958fbdfd02..1ac7c68484b 100644 --- a/src/crimson/os/seastore/extent_placement_manager.cc +++ b/src/crimson/os/seastore/extent_placement_manager.cc @@ -28,7 +28,8 @@ SegmentedOolWriter::SegmentedOolWriter( { } -void SegmentedOolWriter::write_record( +SegmentedOolWriter::alloc_write_ertr::future<> +SegmentedOolWriter::write_record( Transaction& t, record_t&& record, std::list&& extents, @@ -62,20 +63,15 @@ void SegmentedOolWriter::write_record( extent_addr = extent_addr.as_seg_paddr().add_offset( extent->get_length()); } - // t might be destructed inside write_future - auto write_future = seastar::with_gate(write_guard, - [this, FNAME, tid=t.get_trans_id(), - record_base=ret.record_base_regardless_md, - submit_fut=std::move(ret.future)]() mutable { - return std::move(submit_fut - ).safe_then([this, FNAME, tid, record_base](record_locator_t ret) { - TRACE("trans.{} {} finish {}=={}", - tid, segment_allocator.get_name(), ret, record_base); - // ool won't write metadata, so the paddrs must be equal - assert(ret.record_block_base == record_base.offset); - }); + return std::move(ret.future + ).safe_then([this, FNAME, &t, + record_base=ret.record_base_regardless_md + ](record_locator_t ret) { + TRACET("{} finish {}=={}", + t, segment_allocator.get_name(), ret, record_base); + // ool won't write metadata, so the paddrs must be equal + assert(ret.record_block_base == record_base.offset); }); - t.get_handle().add_write_future(std::move(write_future)); } SegmentedOolWriter::alloc_write_iertr::future<> @@ -112,15 +108,19 @@ SegmentedOolWriter::do_write( DEBUGT("{} extents={} submit {} extents and roll, unavailable ...", t, segment_allocator.get_name(), extents.size(), num_extents); + auto fut_write = alloc_write_ertr::now(); if (num_extents > 0) { assert(record_submitter.check_action(record.size) != action_t::ROLL); - write_record( + fut_write = write_record( t, std::move(record), std::move(pending_extents), true/* with_atomic_roll_segment */); } return trans_intr::make_interruptible( - record_submitter.roll_segment() + record_submitter.roll_segment( + ).safe_then([fut_write=std::move(fut_write)]() mutable { + return std::move(fut_write); + }) ).si_then([this, &t, &extents] { return do_write(t, extents); }); @@ -151,12 +151,15 @@ SegmentedOolWriter::do_write( DEBUGT("{} extents={} submit {} extents ...", t, segment_allocator.get_name(), extents.size(), pending_extents.size()); - write_record(t, std::move(record), std::move(pending_extents)); - if (!extents.empty()) { - return do_write(t, extents); - } else { - return alloc_write_iertr::now(); - } + return trans_intr::make_interruptible( + write_record(t, std::move(record), std::move(pending_extents)) + ).si_then([this, &t, &extents] { + if (!extents.empty()) { + return do_write(t, extents); + } else { + return alloc_write_iertr::now(); + } + }); } // SUBMIT_NOT_FULL: evaluate the next extent } @@ -166,8 +169,8 @@ SegmentedOolWriter::do_write( t, segment_allocator.get_name(), num_extents); assert(num_extents > 0); - write_record(t, std::move(record), std::move(pending_extents)); - return alloc_write_iertr::now(); + return trans_intr::make_interruptible( + write_record(t, std::move(record), std::move(pending_extents))); } SegmentedOolWriter::alloc_write_iertr::future<> @@ -991,11 +994,13 @@ RandomBlockOolWriter::alloc_write_ool_extents( if (extents.empty()) { return alloc_write_iertr::now(); } - do_write(t, extents); - return alloc_write_iertr::now(); + return seastar::with_gate(write_guard, [this, &t, &extents] { + return do_write(t, extents); + }); } -void RandomBlockOolWriter::do_write( +RandomBlockOolWriter::alloc_write_iertr::future<> +RandomBlockOolWriter::do_write( Transaction& t, std::list& extents) { @@ -1048,10 +1053,8 @@ void RandomBlockOolWriter::do_write( } } - // t might be destructed inside write_future - auto write_future = seastar::with_gate(write_guard, - [writes=std::move(writes)]() mutable { - return seastar::do_with(std::move(writes), + return trans_intr::make_interruptible( + seastar::do_with(std::move(writes), [](auto& writes) { return crimson::do_for_each(writes, [](auto& info) { @@ -1062,9 +1065,8 @@ void RandomBlockOolWriter::do_write( "Invalid error when writing record"} ); }); - }); - }); - t.get_handle().add_write_future(std::move(write_future)); + }) + ); } } diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h index 0f2d55ef04a..7c4110c053e 100644 --- a/src/crimson/os/seastore/extent_placement_manager.h +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -115,7 +115,7 @@ private: Transaction& t, std::list &extent); - void write_record( + alloc_write_ertr::future<> write_record( Transaction& t, record_t&& record, std::list &&extents, @@ -195,7 +195,7 @@ private: ceph::bufferptr bp; RandomBlockManager* rbm; }; - void do_write( + alloc_write_iertr::future<> do_write( Transaction& t, std::list &extent); diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc index 4da70f72c4c..9ee8b1b997f 100644 --- a/src/crimson/os/seastore/journal/circular_bounded_journal.cc +++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc @@ -97,9 +97,7 @@ CircularBoundedJournal::do_submit_record( auto submit_ret = record_submitter.submit(std::move(record)); // submit_ret.record_base_regardless_md is wrong for journaling return handle.enter(write_pipeline->device_submission - ).then([&handle] { - return handle.take_write_future(); - }).safe_then([submit_fut=std::move(submit_ret.future)]() mutable { + ).then([submit_fut=std::move(submit_ret.future)]() mutable { return std::move(submit_fut); }).safe_then([FNAME, this, &handle](record_locator_t result) { return handle.enter(write_pipeline->finalize diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc index 81e8c5a62c7..eca45f113c2 100644 --- a/src/crimson/os/seastore/journal/segmented_journal.cc +++ b/src/crimson/os/seastore/journal/segmented_journal.cc @@ -396,9 +396,7 @@ SegmentedJournal::do_submit_record( auto submit_ret = record_submitter.submit(std::move(record)); // submit_ret.record_base_regardless_md is wrong for journaling return handle.enter(write_pipeline->device_submission - ).then([&handle] { - return handle.take_write_future(); - }).safe_then([submit_fut=std::move(submit_ret.future)]() mutable { + ).then([submit_fut=std::move(submit_ret.future)]() mutable { return std::move(submit_fut); }).safe_then([FNAME, this, &handle](record_locator_t result) { return handle.enter(write_pipeline->finalize diff --git a/src/crimson/os/seastore/ordering_handle.h b/src/crimson/os/seastore/ordering_handle.h index cfa86205875..8ab8442acd9 100644 --- a/src/crimson/os/seastore/ordering_handle.h +++ b/src/crimson/os/seastore/ordering_handle.h @@ -122,11 +122,6 @@ struct OrderingHandle { std::unique_ptr op; seastar::shared_mutex *collection_ordering_lock = nullptr; - using write_ertr = crimson::errorator< - crimson::ct_error::input_output_error>; - // the pending writes that should complete at DeviceSubmission phase - write_ertr::future<> write_future = write_ertr::now(); - // in the future we might add further constructors / template to type // erasure while extracting the location of tracking events. OrderingHandle(std::unique_ptr op) : op(std::move(op)) {} @@ -149,20 +144,6 @@ struct OrderingHandle { } } - void add_write_future(write_ertr::future<>&& fut) { - auto appended = std::move(write_future - ).safe_then([fut=std::move(fut)]() mutable { - return std::move(fut); - }); - write_future = std::move(appended); - } - - write_ertr::future<> take_write_future() { - auto ret = std::move(write_future); - write_future = write_ertr::now(); - return ret; - } - template seastar::future<> enter(T &t) { return op->enter(t); @@ -170,10 +151,6 @@ struct OrderingHandle { void exit() { op->exit(); - - auto ignore_writes = std::move(write_future); - std::ignore = ignore_writes; - write_future = write_ertr::now(); } seastar::future<> complete() { @@ -182,9 +159,6 @@ struct OrderingHandle { ~OrderingHandle() { maybe_release_collection_lock(); - - assert(write_future.available()); - assert(!write_future.failed()); } }; -- 2.39.5