]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: Append one journal event per image request
authorJoshua Baergen <jbaergen@digitalocean.com>
Thu, 9 Nov 2023 16:43:22 +0000 (09:43 -0700)
committerJoshua Baergen <jbaergen@digitalocean.com>
Tue, 5 Dec 2023 21:25:46 +0000 (14:25 -0700)
In the case where an image request is split across multiple object
extents and journaling is enabled, multiple journal events are appended.
Prior to this change, all object requests would wait for the last
journal event to complete, since journal events complete in order and
thus the last one completing implies that all prior journal events were
safe at that point.

The issue with this is that there's nothing stopping that last journal
event from being cleaned up before all object requests have stopped
referring to it. Thus, it's entirely possible for the following sequence
to occur:
1. An image request gets split into two image extents and two object
   requests. Journal events are appended (one per image extent).
2. The first object request gets delayed due to an overlap, but the
   second object request gets submitted and starts waiting on the last
   journal event (which also causes a C_CommitIOEvent to be instantiated
   against that journal event).
3. Journaling completes, and the C_CommitIOEvent fires. The
   C_CommitIOEvent covers the entire range of data that was journaled in
   this event, and so the event is cleaned up.
4. The first object request from above is allowed to make progress; it
   tries to wait for the journal event that was just cleaned up which
   causes the assert in wait_event() to fire.

As far as I can tell, this is only possible on the discard path today,
and only recently. Up until 21a26a752843295ff946d1543c2f5f9fac764593
(librbd: Fix local rbd mirror journals growing forever), m_image_extents
always contained a single extent for all I/O types; this commit changed
the discard path so that if discard granularity changed the discard
request, m_image_extents would be repopulated, and if the request
happened to cross objects then there would be multiple m_image_extents.

It appears that the intent here was that there should be one journal
event per image request and the pending_extents kept track of what had
completed thus far. This commit restores that 1:1 relationship.

Fixes: https://tracker.ceph.com/issues/63422
Signed-off-by: Joshua Baergen <jbaergen@digitalocean.com>
src/librbd/Journal.cc
src/librbd/Journal.h
src/librbd/io/ImageRequest.cc
src/test/librbd/io/test_mock_ImageRequest.cc
src/test/librbd/journal/test_Entries.cc
src/test/librbd/test_mock_Journal.cc

index 121701c70d254ff6f4705be404d0fa0013fb46aa..1b37a30c17c0d9da3640a852700c66e0ce23b032 100644 (file)
@@ -761,36 +761,87 @@ void Journal<I>::user_flushed() {
 }
 
 template <typename I>
-uint64_t Journal<I>::append_write_event(uint64_t offset, size_t length,
-                                        const bufferlist &bl,
-                                        bool flush_entry) {
+void Journal<I>::add_write_event_entries(uint64_t offset, size_t length,
+                                         const bufferlist &bl,
+                                         uint64_t buffer_offset,
+                                         Bufferlists *bufferlists) {
   ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size());
-  uint64_t max_write_data_size =
+  const uint64_t max_write_data_size =
     m_max_append_size - journal::AioWriteEvent::get_fixed_size();
 
   // ensure that the write event fits within the journal entry
-  Bufferlists bufferlists;
   uint64_t bytes_remaining = length;
   uint64_t event_offset = 0;
   do {
     uint64_t event_length = std::min(bytes_remaining, max_write_data_size);
 
     bufferlist event_bl;
-    event_bl.substr_of(bl, event_offset, event_length);
+    event_bl.substr_of(bl, buffer_offset + event_offset, event_length);
     journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset,
                                                            event_length,
                                                            event_bl),
                                     ceph_clock_now());
 
-    bufferlists.emplace_back();
-    encode(event_entry, bufferlists.back());
+    bufferlists->emplace_back();
+    encode(event_entry, bufferlists->back());
 
     event_offset += event_length;
     bytes_remaining -= event_length;
   } while (bytes_remaining > 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_event(const Extents &image_extents,
+                                        const bufferlist &bl,
+                                        bool flush_entry) {
+  Bufferlists bufferlists;
+  uint64_t buffer_offset = 0;
+  for (auto &extent : image_extents) {
+    add_write_event_entries(extent.first, extent.second, bl, buffer_offset,
+                            &bufferlists);
+
+    buffer_offset += extent.second;
+  }
 
   return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists,
-                          {{offset, length}}, flush_entry, 0);
+                          image_extents, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_same_event(const Extents &image_extents,
+                                             const bufferlist &bl,
+                                             bool flush_entry) {
+  Bufferlists bufferlists;
+  for (auto &extent : image_extents) {
+    journal::EventEntry event_entry(
+      journal::AioWriteSameEvent(extent.first, extent.second, bl),
+      ceph_clock_now());
+
+    bufferlists.emplace_back();
+    encode(event_entry, bufferlists.back());
+  }
+
+  return append_io_events(journal::EVENT_TYPE_AIO_WRITESAME, bufferlists,
+                          image_extents, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_discard_event(const Extents &image_extents,
+                                          uint32_t discard_granularity_bytes,
+                                          bool flush_entry) {
+  Bufferlists bufferlists;
+  for (auto &extent : image_extents) {
+    journal::EventEntry event_entry(
+      journal::AioDiscardEvent(extent.first, extent.second,
+                               discard_granularity_bytes),
+      ceph_clock_now());
+
+    bufferlists.emplace_back();
+    encode(event_entry, bufferlists.back());
+  }
+
+  return append_io_events(journal::EVENT_TYPE_AIO_DISCARD, bufferlists,
+                          image_extents, flush_entry, 0);
 }
 
 template <typename I>
index e33256bd575d2ee74ee3cb403e710e2e9eee1628..5327adac7192be3ab1c314076bed298efa5bae0c 100644 (file)
@@ -134,14 +134,20 @@ public:
 
   void user_flushed();
 
-  uint64_t append_write_event(uint64_t offset, size_t length,
+  uint64_t append_write_event(const io::Extents &image_extents,
                               const bufferlist &bl,
                               bool flush_entry);
+  uint64_t append_write_same_event(const io::Extents &image_extents,
+                                   const bufferlist &bl,
+                                   bool flush_entry);
   uint64_t append_compare_and_write_event(uint64_t offset,
                                           size_t length,
                                           const bufferlist &cmp_bl,
                                           const bufferlist &write_bl,
                                           bool flush_entry);
+  uint64_t append_discard_event(const io::Extents &image_extents,
+                                uint32_t discard_granularity_bytes,
+                                bool flush_entry);
   uint64_t append_io_event(journal::EventEntry &&event_entry,
                            uint64_t offset, size_t length,
                            bool flush_entry, int filter_ret_val);
@@ -325,6 +331,10 @@ private:
   bool is_journal_replaying(const ceph::mutex &) const;
   bool is_tag_owner(const ceph::mutex &) const;
 
+  void add_write_event_entries(uint64_t offset, size_t length,
+                               const bufferlist &bl,
+                               uint64_t buffer_offset,
+                               Bufferlists *bufferlists);
   uint64_t append_io_events(journal::EventType event_type,
                             const Bufferlists &bufferlists,
                             const io::Extents &extents, bool flush_entry,
index 95bac7b245cb3ee3b9cf973b839199e9310fd0de..fb9f8944ed849bf837222b77ab34307b0537686d 100644 (file)
@@ -521,19 +521,9 @@ template <typename I>
 uint64_t ImageWriteRequest<I>::append_journal_event() {
   I &image_ctx = this->m_image_ctx;
 
-  uint64_t tid = 0;
-  uint64_t buffer_offset = 0;
   ceph_assert(!this->m_image_extents.empty());
-  for (auto &extent : this->m_image_extents) {
-    bufferlist sub_bl;
-    sub_bl.substr_of(m_bl, buffer_offset, extent.second);
-    buffer_offset += extent.second;
-
-    tid = image_ctx.journal->append_write_event(extent.first, extent.second,
-                                                sub_bl, false);
-  }
-
-  return tid;
+  return image_ctx.journal->append_write_event(
+    this->m_image_extents, m_bl, false);
 }
 
 template <typename I>
@@ -569,19 +559,9 @@ template <typename I>
 uint64_t ImageDiscardRequest<I>::append_journal_event() {
   I &image_ctx = this->m_image_ctx;
 
-  uint64_t tid = 0;
   ceph_assert(!this->m_image_extents.empty());
-  for (auto &extent : this->m_image_extents) {
-    journal::EventEntry event_entry(
-      journal::AioDiscardEvent(extent.first,
-                               extent.second,
-                               this->m_discard_granularity_bytes));
-    tid = image_ctx.journal->append_io_event(std::move(event_entry),
-                                             extent.first, extent.second,
-                                             false, 0);
-  }
-
-  return tid;
+  return image_ctx.journal->append_discard_event(
+    this->m_image_extents, m_discard_granularity_bytes, false);
 }
 
 template <typename I>
@@ -720,18 +700,9 @@ template <typename I>
 uint64_t ImageWriteSameRequest<I>::append_journal_event() {
   I &image_ctx = this->m_image_ctx;
 
-  uint64_t tid = 0;
   ceph_assert(!this->m_image_extents.empty());
-  for (auto &extent : this->m_image_extents) {
-    journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first,
-                                                               extent.second,
-                                                               m_data_bl));
-    tid = image_ctx.journal->append_io_event(std::move(event_entry),
-                                             extent.first, extent.second,
-                                             false, 0);
-  }
-
-  return tid;
+  return image_ctx.journal->append_write_same_event(
+    this->m_image_extents, m_data_bl, false);
 }
 
 template <typename I>
index 9d6423d66c4b8c4369be38e34e73256ee3b50a75..6ee67fe5f1c3ed9f4057c64e0a459bb33c1efb71 100644 (file)
@@ -16,12 +16,15 @@ namespace {
 struct MockTestImageCtx;
 
 struct MockTestJournal : public MockJournal {
-  MOCK_METHOD4(append_write_event, uint64_t(uint64_t, size_t,
+  MOCK_METHOD3(append_write_event, uint64_t(const io::Extents&,
                                             const bufferlist &, bool));
+  MOCK_METHOD3(append_write_same_event, uint64_t(const io::Extents&,
+                                                 const bufferlist &, bool));
   MOCK_METHOD5(append_compare_and_write_event, uint64_t(uint64_t, size_t,
                                                         const bufferlist &,
                                                         const bufferlist &,
                                                         bool));
+  MOCK_METHOD3(append_discard_event, uint64_t(const io::Extents&, uint32_t, bool));
   MOCK_METHOD5(append_io_event_mock, uint64_t(const journal::EventEntry&,
                                               uint64_t, size_t, bool, int));
   uint64_t append_io_event(journal::EventEntry &&event_entry,
@@ -119,9 +122,10 @@ struct TestMockIoImageRequest : public TestMockFixture {
     }
   }
 
-  void expect_journal_append_io_event(MockTestJournal &mock_journal, uint64_t journal_tid,
-                                      uint64_t offset, size_t length) {
-    EXPECT_CALL(mock_journal, append_io_event_mock(_, offset, length, _, _))
+  void expect_journal_append_discard_event(MockTestJournal &mock_journal,
+                                           uint64_t journal_tid,
+                                           const io::Extents& extents) {
+    EXPECT_CALL(mock_journal, append_discard_event(extents, _, _))
       .WillOnce(Return(journal_tid));
   }
 
@@ -386,8 +390,8 @@ TEST_F(TestMockIoImageRequest, PartialDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(mock_journal, 0, 16, 63);
-  expect_journal_append_io_event(mock_journal, 1, 84, 100);
+  expect_journal_append_discard_event(mock_journal, 0,
+                                      {{16, 63}, {84, 100}});
   expect_object_discard_request(mock_image_ctx, 0, 16, 63, 0);
   expect_object_discard_request(mock_image_ctx, 0, 84, 100, 0);
 
@@ -419,8 +423,8 @@ TEST_F(TestMockIoImageRequest, TailDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(
-    mock_journal, 0, ictx->layout.object_size - 1024, 1024);
+  expect_journal_append_discard_event(
+    mock_journal, 0, {{ictx->layout.object_size - 1024, 1024}});
   expect_object_discard_request(
     mock_image_ctx, 0, ictx->layout.object_size - 1024, 1024, 0);
 
@@ -452,7 +456,7 @@ TEST_F(TestMockIoImageRequest, PruneRequiredDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  EXPECT_CALL(mock_journal, append_io_event_mock(_, _, _, _, _)).Times(0);
+  EXPECT_CALL(mock_journal, append_discard_event(_, _, _)).Times(0);
   EXPECT_CALL(*mock_image_ctx.io_object_dispatcher, send(_)).Times(0);
 
   C_SaferCond aio_comp_ctx;
@@ -482,7 +486,7 @@ TEST_F(TestMockIoImageRequest, LengthModifiedDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(mock_journal, 0, 32, 32);
+  expect_journal_append_discard_event(mock_journal, 0, {{32, 32}});
   expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0);
 
   C_SaferCond aio_comp_ctx;
@@ -513,10 +517,9 @@ TEST_F(TestMockIoImageRequest, DiscardGranularityJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(mock_journal, 0, 32, 32);
-  expect_journal_append_io_event(mock_journal, 1, 96, 64);
-  expect_journal_append_io_event(
-    mock_journal, 2, ictx->layout.object_size - 32, 32);
+  expect_journal_append_discard_event(
+    mock_journal, 0,
+    {{32, 32}, {96, 64}, {ictx->layout.object_size - 32, 32}});
   expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0);
   expect_object_discard_request(mock_image_ctx, 0, 96, 64, 0);
   expect_object_discard_request(
index c392fb9f88a88702fd8e0c8bbff2699fa447c751..bb4b06c0368a1ebcbd63c0001ec4458a63a225f1 100644 (file)
@@ -196,6 +196,69 @@ TEST_F(TestJournalEntries, AioDiscard) {
   ASSERT_EQ(234U, aio_discard_event.length);
 }
 
+TEST_F(TestJournalEntries, AioDiscardWithPrune) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  // The discard path can create multiple image extents (ImageRequest.cc) in the
+  // case where the discard request needs to be pruned and multiple objects are
+  // involved in the request. This test ensures that journal event entries are
+  // queued up for each image extent.
+
+  // Create an image that is multiple objects so that we can force multiple
+  // image extents on the discard path.
+  CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
+  auto object_size = 1ull << cct->_conf.get_val<uint64_t>("rbd_default_order");
+  auto image_size = 4 * object_size;
+
+  auto image_name = get_temp_image_name();
+  ASSERT_EQ(0, create_image_pp(m_rbd, m_ioctx, image_name, image_size));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(image_name, &ictx));
+
+  ::journal::Journaler *journaler = create_journaler(ictx);
+  ASSERT_TRUE(journaler != NULL);
+
+  C_SaferCond cond_ctx;
+  auto c = librbd::io::AioCompletion::create(&cond_ctx);
+  c->get();
+  // We offset the discard by -4096 bytes and set discard granularity to 8192;
+  // this should cause two image extents to be formed in
+  // AbstractImageWriteRequest<I>::send_request().
+  api::Io<>::aio_discard(*ictx, c, object_size - 4096, 2 * object_size, 8192,
+                         true);
+  ASSERT_EQ(0, c->wait_for_complete());
+  c->put();
+
+  for (uint64_t chunk = 0; chunk < 2; chunk++) {
+    auto offset = object_size;
+    auto size = object_size;
+    if (chunk == 1) {
+      offset = object_size * 2;
+      size = object_size - 8192;
+    }
+
+    ::journal::ReplayEntry replay_entry;
+    if (!journaler->try_pop_front(&replay_entry)) {
+      ASSERT_TRUE(wait_for_entries_available(ictx));
+      ASSERT_TRUE(journaler->try_pop_front(&replay_entry));
+    }
+
+    librbd::journal::EventEntry event_entry;
+    ASSERT_TRUE(get_event_entry(replay_entry, &event_entry));
+
+    ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_DISCARD,
+              event_entry.get_event_type());
+
+    librbd::journal::AioDiscardEvent aio_discard_event =
+      boost::get<librbd::journal::AioDiscardEvent>(event_entry.event);
+    ASSERT_EQ(offset, aio_discard_event.offset);
+    ASSERT_EQ(size, aio_discard_event.length);
+
+    journaler->committed(replay_entry);
+  }
+}
+
 TEST_F(TestJournalEntries, AioFlush) {
   REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
 
index 2fe74d2fe4661ae4bb796fafef44798440d4740d..589695c50b39d255c8d2ebee1cd24df26f359c24 100644 (file)
@@ -460,7 +460,7 @@ public:
     bl.append_zero(length);
 
     std::shared_lock owner_locker{mock_image_ctx.owner_lock};
-    return mock_journal->append_write_event(0, length, bl, false);
+    return mock_journal->append_write_event({{0, length}}, bl, false);
   }
 
   uint64_t when_append_compare_and_write_event(