git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: Append one journal event per image request
author: Joshua Baergen <jbaergen@digitalocean.com>
Thu, 9 Nov 2023 16:43:22 +0000 (09:43 -0700)
committer: Joshua Baergen <jbaergen@digitalocean.com>
Wed, 6 Dec 2023 21:26:47 +0000 (14:26 -0700)
In the case where an image request is split across multiple object
extents and journaling is enabled, multiple journal events are appended.
Prior to this change, all object requests would wait for the last
journal event to complete, since journal events complete in order and
thus the last one completing implies that all prior journal events were
safe at that point.

The issue with this is that there's nothing stopping that last journal
event from being cleaned up before all object requests have stopped
referring to it. Thus, it's entirely possible for the following sequence
to occur:
1. An image request gets split into two image extents and two object
   requests. Journal events are appended (one per image extent).
2. The first object request gets delayed due to an overlap, but the
   second object request gets submitted and starts waiting on the last
   journal event (which also causes a C_CommitIOEvent to be instantiated
   against that journal event).
3. Journaling completes, and the C_CommitIOEvent fires. The
   C_CommitIOEvent covers the entire range of data that was journaled in
   this event, and so the event is cleaned up.
4. The first object request from above is allowed to make progress; it
   tries to wait for the journal event that was just cleaned up which
   causes the assert in wait_event() to fire.

As far as I can tell, this is only possible on the discard path today,
and only recently. Up until 21a26a752843295ff946d1543c2f5f9fac764593
(librbd: Fix local rbd mirror journals growing forever), m_image_extents
always contained a single extent for all I/O types; this commit changed
the discard path so that if discard granularity changed the discard
request, m_image_extents would be repopulated, and if the request
happened to cross objects then there would be multiple m_image_extents.

It appears that the intent here was for there to be one journal event
per image request, with pending_extents keeping track of what had
completed thus far. This commit restores that 1:1 relationship.

Fixes: https://tracker.ceph.com/issues/63422
Signed-off-by: Joshua Baergen <jbaergen@digitalocean.com>
(cherry picked from commit 4a8fa2da72fe64109073fddca0d4cfd99aeb9c77)

src/librbd/Journal.cc
src/librbd/Journal.h
src/librbd/io/ImageRequest.cc
src/test/librbd/io/test_mock_ImageRequest.cc
src/test/librbd/journal/test_Entries.cc
src/test/librbd/test_mock_Journal.cc

index d35f910f203f2ccd61fdb7225dbcfa4eb46c0aea..1d4339c164b166072e441a963c63e46a27e323de 100644 (file)
@@ -761,36 +761,87 @@ void Journal<I>::user_flushed() {
 }
 
 template <typename I>
-uint64_t Journal<I>::append_write_event(uint64_t offset, size_t length,
-                                        const bufferlist &bl,
-                                        bool flush_entry) {
+void Journal<I>::add_write_event_entries(uint64_t offset, size_t length,
+                                         const bufferlist &bl,
+                                         uint64_t buffer_offset,
+                                         Bufferlists *bufferlists) {
   ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size());
-  uint64_t max_write_data_size =
+  const uint64_t max_write_data_size =
     m_max_append_size - journal::AioWriteEvent::get_fixed_size();
 
   // ensure that the write event fits within the journal entry
-  Bufferlists bufferlists;
   uint64_t bytes_remaining = length;
   uint64_t event_offset = 0;
   do {
     uint64_t event_length = std::min(bytes_remaining, max_write_data_size);
 
     bufferlist event_bl;
-    event_bl.substr_of(bl, event_offset, event_length);
+    event_bl.substr_of(bl, buffer_offset + event_offset, event_length);
     journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset,
                                                            event_length,
                                                            event_bl),
                                     ceph_clock_now());
 
-    bufferlists.emplace_back();
-    encode(event_entry, bufferlists.back());
+    bufferlists->emplace_back();
+    encode(event_entry, bufferlists->back());
 
     event_offset += event_length;
     bytes_remaining -= event_length;
   } while (bytes_remaining > 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_event(const Extents &image_extents,
+                                        const bufferlist &bl,
+                                        bool flush_entry) {
+  Bufferlists bufferlists;
+  uint64_t buffer_offset = 0;
+  for (auto &extent : image_extents) {
+    add_write_event_entries(extent.first, extent.second, bl, buffer_offset,
+                            &bufferlists);
+
+    buffer_offset += extent.second;
+  }
 
   return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists,
-                          {{offset, length}}, flush_entry, 0);
+                          image_extents, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_same_event(const Extents &image_extents,
+                                             const bufferlist &bl,
+                                             bool flush_entry) {
+  Bufferlists bufferlists;
+  for (auto &extent : image_extents) {
+    journal::EventEntry event_entry(
+      journal::AioWriteSameEvent(extent.first, extent.second, bl),
+      ceph_clock_now());
+
+    bufferlists.emplace_back();
+    encode(event_entry, bufferlists.back());
+  }
+
+  return append_io_events(journal::EVENT_TYPE_AIO_WRITESAME, bufferlists,
+                          image_extents, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_discard_event(const Extents &image_extents,
+                                          uint32_t discard_granularity_bytes,
+                                          bool flush_entry) {
+  Bufferlists bufferlists;
+  for (auto &extent : image_extents) {
+    journal::EventEntry event_entry(
+      journal::AioDiscardEvent(extent.first, extent.second,
+                               discard_granularity_bytes),
+      ceph_clock_now());
+
+    bufferlists.emplace_back();
+    encode(event_entry, bufferlists.back());
+  }
+
+  return append_io_events(journal::EVENT_TYPE_AIO_DISCARD, bufferlists,
+                          image_extents, flush_entry, 0);
 }
 
 template <typename I>
index ed2d020378ca7c1fb747cfd64dd6b788b35d4bfe..a671eeee38e358c6a4e46d72f0d035e81f79db13 100644 (file)
@@ -134,9 +134,15 @@ public:
 
   void user_flushed();
 
-  uint64_t append_write_event(uint64_t offset, size_t length,
+  uint64_t append_write_event(const io::Extents &image_extents,
                               const bufferlist &bl,
                               bool flush_entry);
+  uint64_t append_write_same_event(const io::Extents &image_extents,
+                                   const bufferlist &bl,
+                                   bool flush_entry);
+  uint64_t append_discard_event(const io::Extents &image_extents,
+                                uint32_t discard_granularity_bytes,
+                                bool flush_entry);
   uint64_t append_io_event(journal::EventEntry &&event_entry,
                            uint64_t offset, size_t length,
                            bool flush_entry, int filter_ret_val);
@@ -320,6 +326,10 @@ private:
   bool is_journal_replaying(const ceph::mutex &) const;
   bool is_tag_owner(const ceph::mutex &) const;
 
+  void add_write_event_entries(uint64_t offset, size_t length,
+                               const bufferlist &bl,
+                               uint64_t buffer_offset,
+                               Bufferlists *bufferlists);
   uint64_t append_io_events(journal::EventType event_type,
                             const Bufferlists &bufferlists,
                             const io::Extents &extents, bool flush_entry,
index 4b78bbbe1612d191cf30394007996c0f16ff4922..82b5254ec43e3ffaca8c619456c3d45dde5cc303 100644 (file)
@@ -517,19 +517,9 @@ template <typename I>
 uint64_t ImageWriteRequest<I>::append_journal_event() {
   I &image_ctx = this->m_image_ctx;
 
-  uint64_t tid = 0;
-  uint64_t buffer_offset = 0;
   ceph_assert(!this->m_image_extents.empty());
-  for (auto &extent : this->m_image_extents) {
-    bufferlist sub_bl;
-    sub_bl.substr_of(m_bl, buffer_offset, extent.second);
-    buffer_offset += extent.second;
-
-    tid = image_ctx.journal->append_write_event(extent.first, extent.second,
-                                                sub_bl, false);
-  }
-
-  return tid;
+  return image_ctx.journal->append_write_event(
+    this->m_image_extents, m_bl, false);
 }
 
 template <typename I>
@@ -565,19 +555,9 @@ template <typename I>
 uint64_t ImageDiscardRequest<I>::append_journal_event() {
   I &image_ctx = this->m_image_ctx;
 
-  uint64_t tid = 0;
   ceph_assert(!this->m_image_extents.empty());
-  for (auto &extent : this->m_image_extents) {
-    journal::EventEntry event_entry(
-      journal::AioDiscardEvent(extent.first,
-                               extent.second,
-                               this->m_discard_granularity_bytes));
-    tid = image_ctx.journal->append_io_event(std::move(event_entry),
-                                             extent.first, extent.second,
-                                             false, 0);
-  }
-
-  return tid;
+  return image_ctx.journal->append_discard_event(
+    this->m_image_extents, m_discard_granularity_bytes, false);
 }
 
 template <typename I>
@@ -716,18 +696,9 @@ template <typename I>
 uint64_t ImageWriteSameRequest<I>::append_journal_event() {
   I &image_ctx = this->m_image_ctx;
 
-  uint64_t tid = 0;
   ceph_assert(!this->m_image_extents.empty());
-  for (auto &extent : this->m_image_extents) {
-    journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first,
-                                                               extent.second,
-                                                               m_data_bl));
-    tid = image_ctx.journal->append_io_event(std::move(event_entry),
-                                             extent.first, extent.second,
-                                             false, 0);
-  }
-
-  return tid;
+  return image_ctx.journal->append_write_same_event(
+    this->m_image_extents, m_data_bl, false);
 }
 
 template <typename I>
index 48a4b2a397fcd96f165f9ee6e85fe7ee169678d8..c2d74364d3ad3e0358c1b677779a5ea460ed4adc 100644 (file)
@@ -16,8 +16,11 @@ namespace {
 struct MockTestImageCtx;
 
 struct MockTestJournal : public MockJournal {
-  MOCK_METHOD4(append_write_event, uint64_t(uint64_t, size_t,
+  MOCK_METHOD3(append_write_event, uint64_t(const io::Extents&,
                                             const bufferlist &, bool));
+  MOCK_METHOD3(append_write_same_event, uint64_t(const io::Extents&,
+                                                 const bufferlist &, bool));
+  MOCK_METHOD3(append_discard_event, uint64_t(const io::Extents&, uint32_t, bool));
   MOCK_METHOD5(append_io_event_mock, uint64_t(const journal::EventEntry&,
                                               uint64_t, size_t, bool, int));
   uint64_t append_io_event(journal::EventEntry &&event_entry,
@@ -111,9 +114,10 @@ struct TestMockIoImageRequest : public TestMockFixture {
     }
   }
 
-  void expect_journal_append_io_event(MockTestJournal &mock_journal, uint64_t journal_tid,
-                                      uint64_t offset, size_t length) {
-    EXPECT_CALL(mock_journal, append_io_event_mock(_, offset, length, _, _))
+  void expect_journal_append_discard_event(MockTestJournal &mock_journal,
+                                           uint64_t journal_tid,
+                                           const io::Extents& extents) {
+    EXPECT_CALL(mock_journal, append_discard_event(extents, _, _))
       .WillOnce(Return(journal_tid));
   }
 
@@ -378,8 +382,8 @@ TEST_F(TestMockIoImageRequest, PartialDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(mock_journal, 0, 16, 63);
-  expect_journal_append_io_event(mock_journal, 1, 84, 100);
+  expect_journal_append_discard_event(mock_journal, 0,
+                                      {{16, 63}, {84, 100}});
   expect_object_discard_request(mock_image_ctx, 0, 16, 63, 0);
   expect_object_discard_request(mock_image_ctx, 0, 84, 100, 0);
 
@@ -411,8 +415,8 @@ TEST_F(TestMockIoImageRequest, TailDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(
-    mock_journal, 0, ictx->layout.object_size - 1024, 1024);
+  expect_journal_append_discard_event(
+    mock_journal, 0, {{ictx->layout.object_size - 1024, 1024}});
   expect_object_discard_request(
     mock_image_ctx, 0, ictx->layout.object_size - 1024, 1024, 0);
 
@@ -444,7 +448,7 @@ TEST_F(TestMockIoImageRequest, PruneRequiredDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  EXPECT_CALL(mock_journal, append_io_event_mock(_, _, _, _, _)).Times(0);
+  EXPECT_CALL(mock_journal, append_discard_event(_, _, _)).Times(0);
   EXPECT_CALL(*mock_image_ctx.io_object_dispatcher, send(_)).Times(0);
 
   C_SaferCond aio_comp_ctx;
@@ -474,7 +478,7 @@ TEST_F(TestMockIoImageRequest, LengthModifiedDiscardJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(mock_journal, 0, 32, 32);
+  expect_journal_append_discard_event(mock_journal, 0, {{32, 32}});
   expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0);
 
   C_SaferCond aio_comp_ctx;
@@ -505,10 +509,9 @@ TEST_F(TestMockIoImageRequest, DiscardGranularityJournalAppendEnabled) {
   InSequence seq;
   expect_get_modify_timestamp(mock_image_ctx, false);
   expect_is_journal_appending(mock_journal, true);
-  expect_journal_append_io_event(mock_journal, 0, 32, 32);
-  expect_journal_append_io_event(mock_journal, 1, 96, 64);
-  expect_journal_append_io_event(
-    mock_journal, 2, ictx->layout.object_size - 32, 32);
+  expect_journal_append_discard_event(
+    mock_journal, 0,
+    {{32, 32}, {96, 64}, {ictx->layout.object_size - 32, 32}});
   expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0);
   expect_object_discard_request(mock_image_ctx, 0, 96, 64, 0);
   expect_object_discard_request(
index d89c4f8e423d225e2415382e396551fa92a2b3ee..e7033a848f9d786f4fbd45ece7d5992af55aba45 100644 (file)
@@ -195,6 +195,69 @@ TEST_F(TestJournalEntries, AioDiscard) {
   ASSERT_EQ(234U, aio_discard_event.length);
 }
 
+TEST_F(TestJournalEntries, AioDiscardWithPrune) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  // The discard path can create multiple image extents (ImageRequest.cc) in the
+  // case where the discard request needs to be pruned and multiple objects are
+  // involved in the request. This test ensures that journal event entries are
+  // queued up for each image extent.
+
+  // Create an image that is multiple objects so that we can force multiple
+  // image extents on the discard path.
+  CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
+  auto object_size = 1ull << cct->_conf.get_val<uint64_t>("rbd_default_order");
+  auto image_size = 4 * object_size;
+
+  auto image_name = get_temp_image_name();
+  ASSERT_EQ(0, create_image_pp(m_rbd, m_ioctx, image_name, image_size));
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(image_name, &ictx));
+
+  ::journal::Journaler *journaler = create_journaler(ictx);
+  ASSERT_TRUE(journaler != NULL);
+
+  C_SaferCond cond_ctx;
+  auto c = librbd::io::AioCompletion::create(&cond_ctx);
+  c->get();
+  // We offset the discard by -4096 bytes and set discard granularity to 8192;
+  // this should cause two image extents to be formed in
+  // AbstractImageWriteRequest<I>::send_request().
+  api::Io<>::aio_discard(*ictx, c, object_size - 4096, 2 * object_size, 8192,
+                         true);
+  ASSERT_EQ(0, c->wait_for_complete());
+  c->put();
+
+  for (uint64_t chunk = 0; chunk < 2; chunk++) {
+    auto offset = object_size;
+    auto size = object_size;
+    if (chunk == 1) {
+      offset = object_size * 2;
+      size = object_size - 8192;
+    }
+
+    ::journal::ReplayEntry replay_entry;
+    if (!journaler->try_pop_front(&replay_entry)) {
+      ASSERT_TRUE(wait_for_entries_available(ictx));
+      ASSERT_TRUE(journaler->try_pop_front(&replay_entry));
+    }
+
+    librbd::journal::EventEntry event_entry;
+    ASSERT_TRUE(get_event_entry(replay_entry, &event_entry));
+
+    ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_DISCARD,
+              event_entry.get_event_type());
+
+    librbd::journal::AioDiscardEvent aio_discard_event =
+      boost::get<librbd::journal::AioDiscardEvent>(event_entry.event);
+    ASSERT_EQ(offset, aio_discard_event.offset);
+    ASSERT_EQ(size, aio_discard_event.length);
+
+    journaler->committed(replay_entry);
+  }
+}
+
 TEST_F(TestJournalEntries, AioFlush) {
   REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
 
index b57c66e8907cdfc43f003f46248cc5c3a814a4d0..e717bce1aa627393fd49d65e3b5b06390978eb2b 100644 (file)
@@ -460,7 +460,7 @@ public:
     bl.append_zero(length);
 
     std::shared_lock owner_locker{mock_image_ctx.owner_lock};
-    return mock_journal->append_write_event(0, length, bl, false);
+    return mock_journal->append_write_event({{0, length}}, bl, false);
   }
 
   uint64_t when_append_io_event(MockJournalImageCtx &mock_image_ctx,