From 4a8fa2da72fe64109073fddca0d4cfd99aeb9c77 Mon Sep 17 00:00:00 2001 From: Joshua Baergen Date: Thu, 9 Nov 2023 09:43:22 -0700 Subject: [PATCH] librbd: Append one journal event per image request In the case where an image request is split across multiple object extents and journaling is enabled, multiple journal events are appended. Prior to this change, all object requests would wait for the last journal event to complete, since journal events complete in order and thus the last one completing implies that all prior journal events were safe at that point. The issue with this is that there's nothing stopping that last journal event from being cleaned up before all object requests have stopped referring to it. Thus, it's entirely possible for the following sequence to occur: 1. An image request gets split into two image extents and two object requests. Journal events are appended (one per image extent). 2. The first object request gets delayed due to an overlap, but the second object request gets submitted and starts waiting on the last journal event (which also causes a C_CommitIOEvent to be instantiated against that journal event). 3. Journaling completes, and the C_CommitIOEvent fires. The C_CommitIOEvent covers the entire range of data that was journaled in this event, and so the event is cleaned up. 4. The first object request from above is allowed to make progress; it tries to wait for the journal event that was just cleaned up which causes the assert in wait_event() to fire. As far as I can tell, this is only possible on the discard path today, and only recently. Up until 21a26a752843295ff946d1543c2f5f9fac764593 (librbd: Fix local rbd mirror journals growing forever), m_image_extents always contained a single extent for all I/O types; this commit changed the discard path so that if discard granularity changed the discard request, m_image_extents would be repopulated, and if the request happened to cross objects then there would be multiple m_image_extents. It appears that the intent here was that there should be one journal event per image request and the pending_extents kept track of what had completed thus far. This commit restores that 1:1 relationship. Fixes: https://tracker.ceph.com/issues/63422 Signed-off-by: Joshua Baergen --- src/librbd/Journal.cc | 69 +++++++++++++++++--- src/librbd/Journal.h | 12 +++- src/librbd/io/ImageRequest.cc | 41 ++---------- src/test/librbd/io/test_mock_ImageRequest.cc | 31 +++++---- src/test/librbd/journal/test_Entries.cc | 63 ++++++++++++++++++ src/test/librbd/test_mock_Journal.cc | 2 +- 6 files changed, 158 insertions(+), 60 deletions(-) diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc index 121701c70d254..1b37a30c17c0d 100644 --- a/src/librbd/Journal.cc +++ b/src/librbd/Journal.cc @@ -761,36 +761,87 @@ void Journal::user_flushed() { } template -uint64_t Journal::append_write_event(uint64_t offset, size_t length, - const bufferlist &bl, - bool flush_entry) { +void Journal::add_write_event_entries(uint64_t offset, size_t length, + const bufferlist &bl, + uint64_t buffer_offset, + Bufferlists *bufferlists) { ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size()); - uint64_t max_write_data_size = + const uint64_t max_write_data_size = m_max_append_size - journal::AioWriteEvent::get_fixed_size(); // ensure that the write event fits within the journal entry - Bufferlists bufferlists; uint64_t bytes_remaining = length; uint64_t event_offset = 0; do { uint64_t event_length = std::min(bytes_remaining, max_write_data_size); bufferlist event_bl; - event_bl.substr_of(bl, event_offset, event_length); + event_bl.substr_of(bl, buffer_offset + event_offset, event_length); journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset, event_length, event_bl), ceph_clock_now()); - bufferlists.emplace_back(); - encode(event_entry, bufferlists.back()); + bufferlists->emplace_back(); + encode(event_entry, bufferlists->back()); event_offset += event_length; bytes_remaining -= event_length; } while (bytes_remaining > 0); +} + +template +uint64_t Journal::append_write_event(const Extents &image_extents, + const bufferlist &bl, + bool flush_entry) { + Bufferlists bufferlists; + uint64_t buffer_offset = 0; + for (auto &extent : image_extents) { + add_write_event_entries(extent.first, extent.second, bl, buffer_offset, + &bufferlists); + + buffer_offset += extent.second; + } return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists, - {{offset, length}}, flush_entry, 0); + image_extents, flush_entry, 0); +} + +template +uint64_t Journal::append_write_same_event(const Extents &image_extents, + const bufferlist &bl, + bool flush_entry) { + Bufferlists bufferlists; + for (auto &extent : image_extents) { + journal::EventEntry event_entry( + journal::AioWriteSameEvent(extent.first, extent.second, bl), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + } + + return append_io_events(journal::EVENT_TYPE_AIO_WRITESAME, bufferlists, + image_extents, flush_entry, 0); +} + +template +uint64_t Journal::append_discard_event(const Extents &image_extents, + uint32_t discard_granularity_bytes, + bool flush_entry) { + Bufferlists bufferlists; + for (auto &extent : image_extents) { + journal::EventEntry event_entry( + journal::AioDiscardEvent(extent.first, extent.second, + discard_granularity_bytes), + ceph_clock_now()); + + bufferlists.emplace_back(); + encode(event_entry, bufferlists.back()); + } + + return append_io_events(journal::EVENT_TYPE_AIO_DISCARD, bufferlists, + image_extents, flush_entry, 0); } template diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h index e33256bd575d2..5327adac7192b 100644 --- a/src/librbd/Journal.h +++ b/src/librbd/Journal.h @@ -134,14 +134,20 @@ public: void user_flushed(); - uint64_t append_write_event(uint64_t offset, size_t length, + uint64_t append_write_event(const io::Extents &image_extents, const bufferlist &bl, bool flush_entry); + uint64_t append_write_same_event(const io::Extents &image_extents, + const bufferlist &bl, + bool flush_entry); uint64_t append_compare_and_write_event(uint64_t offset, size_t length, const bufferlist &cmp_bl, const bufferlist &write_bl, bool flush_entry); + uint64_t append_discard_event(const io::Extents &image_extents, + uint32_t discard_granularity_bytes, + bool flush_entry); uint64_t append_io_event(journal::EventEntry &&event_entry, uint64_t offset, size_t length, bool flush_entry, int filter_ret_val); @@ -325,6 +331,10 @@ private: bool is_journal_replaying(const ceph::mutex &) const; bool is_tag_owner(const ceph::mutex &) const; + void add_write_event_entries(uint64_t offset, size_t length, + const bufferlist &bl, + uint64_t buffer_offset, + Bufferlists *bufferlists); uint64_t append_io_events(journal::EventType event_type, const Bufferlists &bufferlists, const io::Extents &extents, bool flush_entry, diff --git a/src/librbd/io/ImageRequest.cc b/src/librbd/io/ImageRequest.cc index 95bac7b245cb3..fb9f8944ed849 100644 --- a/src/librbd/io/ImageRequest.cc +++ b/src/librbd/io/ImageRequest.cc @@ -521,19 +521,9 @@ template uint64_t ImageWriteRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; - uint64_t tid = 0; - uint64_t buffer_offset = 0; ceph_assert(!this->m_image_extents.empty()); - for (auto &extent : this->m_image_extents) { - bufferlist sub_bl; - sub_bl.substr_of(m_bl, buffer_offset, extent.second); - buffer_offset += extent.second; - - tid = image_ctx.journal->append_write_event(extent.first, extent.second, - sub_bl, false); - } - - return tid; + return image_ctx.journal->append_write_event( + this->m_image_extents, m_bl, false); } template @@ -569,19 +559,9 @@ template uint64_t ImageDiscardRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; - uint64_t tid = 0; ceph_assert(!this->m_image_extents.empty()); - for (auto &extent : this->m_image_extents) { - journal::EventEntry event_entry( - journal::AioDiscardEvent(extent.first, - extent.second, - this->m_discard_granularity_bytes)); - tid = image_ctx.journal->append_io_event(std::move(event_entry), - extent.first, extent.second, - false, 0); - } - - return tid; + return image_ctx.journal->append_discard_event( + this->m_image_extents, m_discard_granularity_bytes, false); } template @@ -720,18 +700,9 @@ template uint64_t ImageWriteSameRequest::append_journal_event() { I &image_ctx = this->m_image_ctx; - uint64_t tid = 0; ceph_assert(!this->m_image_extents.empty()); - for (auto &extent : this->m_image_extents) { - journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first, - extent.second, - m_data_bl)); - tid = image_ctx.journal->append_io_event(std::move(event_entry), - extent.first, extent.second, - false, 0); - } - - return tid; + return image_ctx.journal->append_write_same_event( + this->m_image_extents, m_data_bl, false); } template diff --git a/src/test/librbd/io/test_mock_ImageRequest.cc b/src/test/librbd/io/test_mock_ImageRequest.cc index 9d6423d66c4b8..6ee67fe5f1c3e 100644 --- a/src/test/librbd/io/test_mock_ImageRequest.cc +++ b/src/test/librbd/io/test_mock_ImageRequest.cc @@ -16,12 +16,15 @@ namespace { struct MockTestImageCtx; struct MockTestJournal : public MockJournal { - MOCK_METHOD4(append_write_event, uint64_t(uint64_t, size_t, + MOCK_METHOD3(append_write_event, uint64_t(const io::Extents&, const bufferlist &, bool)); + MOCK_METHOD3(append_write_same_event, uint64_t(const io::Extents&, + const bufferlist &, bool)); MOCK_METHOD5(append_compare_and_write_event, uint64_t(uint64_t, size_t, const bufferlist &, const bufferlist &, bool)); + MOCK_METHOD3(append_discard_event, uint64_t(const io::Extents&, uint32_t, bool)); MOCK_METHOD5(append_io_event_mock, uint64_t(const journal::EventEntry&, uint64_t, size_t, bool, int)); uint64_t append_io_event(journal::EventEntry &&event_entry, @@ -119,9 +122,10 @@ struct TestMockIoImageRequest : public TestMockFixture { } } - void expect_journal_append_io_event(MockTestJournal &mock_journal, uint64_t journal_tid, - uint64_t offset, size_t length) { - EXPECT_CALL(mock_journal, append_io_event_mock(_, offset, length, _, _)) + void expect_journal_append_discard_event(MockTestJournal &mock_journal, + uint64_t journal_tid, + const io::Extents& extents) { + EXPECT_CALL(mock_journal, append_discard_event(extents, _, _)) .WillOnce(Return(journal_tid)); } @@ -386,8 +390,8 @@ TEST_F(TestMockIoImageRequest, PartialDiscardJournalAppendEnabled) { InSequence seq; expect_get_modify_timestamp(mock_image_ctx, false); expect_is_journal_appending(mock_journal, true); - expect_journal_append_io_event(mock_journal, 0, 16, 63); - expect_journal_append_io_event(mock_journal, 1, 84, 100); + expect_journal_append_discard_event(mock_journal, 0, + {{16, 63}, {84, 100}}); expect_object_discard_request(mock_image_ctx, 0, 16, 63, 0); expect_object_discard_request(mock_image_ctx, 0, 84, 100, 0); @@ -419,8 +423,8 @@ TEST_F(TestMockIoImageRequest, TailDiscardJournalAppendEnabled) { InSequence seq; expect_get_modify_timestamp(mock_image_ctx, false); expect_is_journal_appending(mock_journal, true); - expect_journal_append_io_event( - mock_journal, 0, ictx->layout.object_size - 1024, 1024); + expect_journal_append_discard_event( + mock_journal, 0, {{ictx->layout.object_size - 1024, 1024}}); expect_object_discard_request( mock_image_ctx, 0, ictx->layout.object_size - 1024, 1024, 0); @@ -452,7 +456,7 @@ TEST_F(TestMockIoImageRequest, PruneRequiredDiscardJournalAppendEnabled) { InSequence seq; expect_get_modify_timestamp(mock_image_ctx, false); expect_is_journal_appending(mock_journal, true); - EXPECT_CALL(mock_journal, append_io_event_mock(_, _, _, _, _)).Times(0); + EXPECT_CALL(mock_journal, append_discard_event(_, _, _)).Times(0); EXPECT_CALL(*mock_image_ctx.io_object_dispatcher, send(_)).Times(0); C_SaferCond aio_comp_ctx; @@ -482,7 +486,7 @@ TEST_F(TestMockIoImageRequest, LengthModifiedDiscardJournalAppendEnabled) { InSequence seq; expect_get_modify_timestamp(mock_image_ctx, false); expect_is_journal_appending(mock_journal, true); - expect_journal_append_io_event(mock_journal, 0, 32, 32); + expect_journal_append_discard_event(mock_journal, 0, {{32, 32}}); expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0); C_SaferCond aio_comp_ctx; @@ -513,10 +517,9 @@ TEST_F(TestMockIoImageRequest, DiscardGranularityJournalAppendEnabled) { InSequence seq; expect_get_modify_timestamp(mock_image_ctx, false); expect_is_journal_appending(mock_journal, true); - expect_journal_append_io_event(mock_journal, 0, 32, 32); - expect_journal_append_io_event(mock_journal, 1, 96, 64); - expect_journal_append_io_event( - mock_journal, 2, ictx->layout.object_size - 32, 32); + expect_journal_append_discard_event( + mock_journal, 0, + {{32, 32}, {96, 64}, {ictx->layout.object_size - 32, 32}}); expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0); expect_object_discard_request(mock_image_ctx, 0, 96, 64, 0); expect_object_discard_request( diff --git a/src/test/librbd/journal/test_Entries.cc b/src/test/librbd/journal/test_Entries.cc index c392fb9f88a88..bb4b06c0368a1 100644 --- a/src/test/librbd/journal/test_Entries.cc +++ b/src/test/librbd/journal/test_Entries.cc @@ -196,6 +196,69 @@ TEST_F(TestJournalEntries, AioDiscard) { ASSERT_EQ(234U, aio_discard_event.length); } +TEST_F(TestJournalEntries, AioDiscardWithPrune) { + REQUIRE_FEATURE(RBD_FEATURE_JOURNALING); + + // The discard path can create multiple image extents (ImageRequest.cc) in the + // case where the discard request needs to be pruned and multiple objects are + // involved in the request. This test ensures that journal event entries are + // queued up for each image extent. + + // Create an image that is multiple objects so that we can force multiple + // image extents on the discard path. + CephContext* cct = reinterpret_cast(_rados.cct()); + auto object_size = 1ull << cct->_conf.get_val("rbd_default_order"); + auto image_size = 4 * object_size; + + auto image_name = get_temp_image_name(); + ASSERT_EQ(0, create_image_pp(m_rbd, m_ioctx, image_name, image_size)); + + librbd::ImageCtx *ictx; + ASSERT_EQ(0, open_image(image_name, &ictx)); + + ::journal::Journaler *journaler = create_journaler(ictx); + ASSERT_TRUE(journaler != NULL); + + C_SaferCond cond_ctx; + auto c = librbd::io::AioCompletion::create(&cond_ctx); + c->get(); + // We offset the discard by -4096 bytes and set discard granularity to 8192; + // this should cause two image extents to be formed in + // AbstractImageWriteRequest::send_request(). + api::Io<>::aio_discard(*ictx, c, object_size - 4096, 2 * object_size, 8192, + true); + ASSERT_EQ(0, c->wait_for_complete()); + c->put(); + + for (uint64_t chunk = 0; chunk < 2; chunk++) { + auto offset = object_size; + auto size = object_size; + if (chunk == 1) { + offset = object_size * 2; + size = object_size - 8192; + } + + ::journal::ReplayEntry replay_entry; + if (!journaler->try_pop_front(&replay_entry)) { + ASSERT_TRUE(wait_for_entries_available(ictx)); + ASSERT_TRUE(journaler->try_pop_front(&replay_entry)); + } + + librbd::journal::EventEntry event_entry; + ASSERT_TRUE(get_event_entry(replay_entry, &event_entry)); + + ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_DISCARD, + event_entry.get_event_type()); + + librbd::journal::AioDiscardEvent aio_discard_event = + boost::get(event_entry.event); + ASSERT_EQ(offset, aio_discard_event.offset); + ASSERT_EQ(size, aio_discard_event.length); + + journaler->committed(replay_entry); + } +} + TEST_F(TestJournalEntries, AioFlush) { REQUIRE_FEATURE(RBD_FEATURE_JOURNALING); diff --git a/src/test/librbd/test_mock_Journal.cc b/src/test/librbd/test_mock_Journal.cc index 2fe74d2fe4661..589695c50b39d 100644 --- a/src/test/librbd/test_mock_Journal.cc +++ b/src/test/librbd/test_mock_Journal.cc @@ -460,7 +460,7 @@ public: bl.append_zero(length); std::shared_lock owner_locker{mock_image_ctx.owner_lock}; - return mock_journal->append_write_event(0, length, bl, false); + return mock_journal->append_write_event({{0, length}}, bl, false); } uint64_t when_append_compare_and_write_event( -- 2.39.5