When an image request is split across multiple object extents and
journaling is enabled, multiple journal events are appended. Prior to
this change, all object requests would wait for the last journal event
to complete: journal events complete in order, so the last one
completing implies that all prior journal events were safe at that
point.
The issue is that nothing stops that last journal event from being
cleaned up before all object requests have stopped referring to it,
so the following sequence is entirely possible:
1. An image request gets split into two image extents and two object
requests. Journal events are appended (one per image extent).
2. The first object request gets delayed due to an overlap, but the
second object request gets submitted and starts waiting on the last
journal event (which also causes a C_CommitIOEvent to be instantiated
against that journal event).
3. Journaling completes, and the C_CommitIOEvent fires. The
C_CommitIOEvent covers the entire range of data that was journaled in
this event, and so the event is cleaned up.
4. The first object request from above is allowed to make progress; it
tries to wait for the journal event that was just cleaned up, which
causes the assert in wait_event() to fire.
As far as I can tell, this is only possible on the discard path today,
and only recently. Up until
21a26a752843295ff946d1543c2f5f9fac764593
(librbd: Fix local rbd mirror journals growing forever), m_image_extents
always contained a single extent for all I/O types; that commit changed
the discard path so that m_image_extents is repopulated whenever the
discard granularity alters the request, and if the request happens to
cross object boundaries, m_image_extents ends up holding multiple
extents.
It appears that the intent was for there to be one journal event per
image request, with pending_extents tracking what had completed thus
far. This commit restores that 1:1 relationship.
Fixes: https://tracker.ceph.com/issues/63422
Signed-off-by: Joshua Baergen <jbaergen@digitalocean.com>
}
template <typename I>
-uint64_t Journal<I>::append_write_event(uint64_t offset, size_t length,
- const bufferlist &bl,
- bool flush_entry) {
+void Journal<I>::add_write_event_entries(uint64_t offset, size_t length,
+ const bufferlist &bl,
+ uint64_t buffer_offset,
+ Bufferlists *bufferlists) {
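+  // Split this extent's data into however many AioWriteEvent entries are
+  // needed for each encoded entry to fit within the journal's maximum
+  // append size; buffer_offset locates the extent's data within the
+  // caller's combined bufferlist.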
ceph_assert(m_max_append_size > journal::AioWriteEvent::get_fixed_size());
- uint64_t max_write_data_size =
+ const uint64_t max_write_data_size =
m_max_append_size - journal::AioWriteEvent::get_fixed_size();
// ensure that the write event fits within the journal entry
- Bufferlists bufferlists;
uint64_t bytes_remaining = length;
uint64_t event_offset = 0;
do {
uint64_t event_length = std::min(bytes_remaining, max_write_data_size);
bufferlist event_bl;
- event_bl.substr_of(bl, event_offset, event_length);
+ event_bl.substr_of(bl, buffer_offset + event_offset, event_length);
journal::EventEntry event_entry(journal::AioWriteEvent(offset + event_offset,
event_length,
event_bl),
ceph_clock_now());
- bufferlists.emplace_back();
- encode(event_entry, bufferlists.back());
+ bufferlists->emplace_back();
+ encode(event_entry, bufferlists->back());
event_offset += event_length;
bytes_remaining -= event_length;
} while (bytes_remaining > 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_event(const Extents &image_extents,
+ const bufferlist &bl,
+ bool flush_entry) {
+ Bufferlists bufferlists;
+ uint64_t buffer_offset = 0;
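+  // Encode one batch of write event entries per image extent. All of the
+  // bufferlists are then appended as a single journal event, so every
+  // object request spawned from this image request waits on, and holds a
+  // reference to, the same event.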
+ for (auto &extent : image_extents) {
+ add_write_event_entries(extent.first, extent.second, bl, buffer_offset,
+ &bufferlists);
+
+ buffer_offset += extent.second;
+ }
return append_io_events(journal::EVENT_TYPE_AIO_WRITE, bufferlists,
- {{offset, length}}, flush_entry, 0);
+ image_extents, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_write_same_event(const Extents &image_extents,
+ const bufferlist &bl,
+ bool flush_entry) {
+ Bufferlists bufferlists;
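+  // A writesame repeats one data pattern across each extent, so the full
+  // pattern bufferlist is encoded into every event entry.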
+ for (auto &extent : image_extents) {
+ journal::EventEntry event_entry(
+ journal::AioWriteSameEvent(extent.first, extent.second, bl),
+ ceph_clock_now());
+
+ bufferlists.emplace_back();
+ encode(event_entry, bufferlists.back());
+ }
+
+ return append_io_events(journal::EVENT_TYPE_AIO_WRITESAME, bufferlists,
+ image_extents, flush_entry, 0);
+}
+
+template <typename I>
+uint64_t Journal<I>::append_discard_event(const Extents &image_extents,
+ uint32_t discard_granularity_bytes,
+ bool flush_entry) {
+ Bufferlists bufferlists;
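+  // Encode one discard event entry per image extent; appending them as a
+  // single journal event below preserves the 1:1 event-to-image-request
+  // relationship.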
+ for (auto &extent : image_extents) {
+ journal::EventEntry event_entry(
+ journal::AioDiscardEvent(extent.first, extent.second,
+ discard_granularity_bytes),
+ ceph_clock_now());
+
+ bufferlists.emplace_back();
+ encode(event_entry, bufferlists.back());
+ }
+
+ return append_io_events(journal::EVENT_TYPE_AIO_DISCARD, bufferlists,
+ image_extents, flush_entry, 0);
}
template <typename I>
void user_flushed();
- uint64_t append_write_event(uint64_t offset, size_t length,
+ uint64_t append_write_event(const io::Extents &image_extents,
const bufferlist &bl,
bool flush_entry);
+ uint64_t append_write_same_event(const io::Extents &image_extents,
+ const bufferlist &bl,
+ bool flush_entry);
uint64_t append_compare_and_write_event(uint64_t offset,
size_t length,
const bufferlist &cmp_bl,
const bufferlist &write_bl,
bool flush_entry);
+ uint64_t append_discard_event(const io::Extents &image_extents,
+ uint32_t discard_granularity_bytes,
+ bool flush_entry);
uint64_t append_io_event(journal::EventEntry &&event_entry,
uint64_t offset, size_t length,
bool flush_entry, int filter_ret_val);
bool is_journal_replaying(const ceph::mutex &) const;
bool is_tag_owner(const ceph::mutex &) const;
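+  // Helper for append_write_event() that encodes a single image extent's
+  // data into one or more journal event entries.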
+ void add_write_event_entries(uint64_t offset, size_t length,
+ const bufferlist &bl,
+ uint64_t buffer_offset,
+ Bufferlists *bufferlists);
uint64_t append_io_events(journal::EventType event_type,
const Bufferlists &bufferlists,
const io::Extents &extents, bool flush_entry,
uint64_t ImageWriteRequest<I>::append_journal_event() {
I &image_ctx = this->m_image_ctx;
- uint64_t tid = 0;
- uint64_t buffer_offset = 0;
ceph_assert(!this->m_image_extents.empty());
- for (auto &extent : this->m_image_extents) {
- bufferlist sub_bl;
- sub_bl.substr_of(m_bl, buffer_offset, extent.second);
- buffer_offset += extent.second;
-
- tid = image_ctx.journal->append_write_event(extent.first, extent.second,
- sub_bl, false);
- }
-
- return tid;
+ return image_ctx.journal->append_write_event(
+ this->m_image_extents, m_bl, false);
}
template <typename I>
uint64_t ImageDiscardRequest<I>::append_journal_event() {
I &image_ctx = this->m_image_ctx;
- uint64_t tid = 0;
ceph_assert(!this->m_image_extents.empty());
- for (auto &extent : this->m_image_extents) {
- journal::EventEntry event_entry(
- journal::AioDiscardEvent(extent.first,
- extent.second,
- this->m_discard_granularity_bytes));
- tid = image_ctx.journal->append_io_event(std::move(event_entry),
- extent.first, extent.second,
- false, 0);
- }
-
- return tid;
+ return image_ctx.journal->append_discard_event(
+ this->m_image_extents, m_discard_granularity_bytes, false);
}
template <typename I>
uint64_t ImageWriteSameRequest<I>::append_journal_event() {
I &image_ctx = this->m_image_ctx;
- uint64_t tid = 0;
ceph_assert(!this->m_image_extents.empty());
- for (auto &extent : this->m_image_extents) {
- journal::EventEntry event_entry(journal::AioWriteSameEvent(extent.first,
- extent.second,
- m_data_bl));
- tid = image_ctx.journal->append_io_event(std::move(event_entry),
- extent.first, extent.second,
- false, 0);
- }
-
- return tid;
+ return image_ctx.journal->append_write_same_event(
+ this->m_image_extents, m_data_bl, false);
}
template <typename I>
struct MockTestImageCtx;
struct MockTestJournal : public MockJournal {
- MOCK_METHOD4(append_write_event, uint64_t(uint64_t, size_t,
+ MOCK_METHOD3(append_write_event, uint64_t(const io::Extents&,
const bufferlist &, bool));
+ MOCK_METHOD3(append_write_same_event, uint64_t(const io::Extents&,
+ const bufferlist &, bool));
MOCK_METHOD5(append_compare_and_write_event, uint64_t(uint64_t, size_t,
const bufferlist &,
const bufferlist &,
bool));
+ MOCK_METHOD3(append_discard_event, uint64_t(const io::Extents&, uint32_t, bool));
MOCK_METHOD5(append_io_event_mock, uint64_t(const journal::EventEntry&,
uint64_t, size_t, bool, int));
uint64_t append_io_event(journal::EventEntry &&event_entry,
}
}
- void expect_journal_append_io_event(MockTestJournal &mock_journal, uint64_t journal_tid,
- uint64_t offset, size_t length) {
- EXPECT_CALL(mock_journal, append_io_event_mock(_, offset, length, _, _))
+ void expect_journal_append_discard_event(MockTestJournal &mock_journal,
+ uint64_t journal_tid,
+ const io::Extents& extents) {
+ EXPECT_CALL(mock_journal, append_discard_event(extents, _, _))
.WillOnce(Return(journal_tid));
}
InSequence seq;
expect_get_modify_timestamp(mock_image_ctx, false);
expect_is_journal_appending(mock_journal, true);
- expect_journal_append_io_event(mock_journal, 0, 16, 63);
- expect_journal_append_io_event(mock_journal, 1, 84, 100);
+ expect_journal_append_discard_event(mock_journal, 0,
+ {{16, 63}, {84, 100}});
expect_object_discard_request(mock_image_ctx, 0, 16, 63, 0);
expect_object_discard_request(mock_image_ctx, 0, 84, 100, 0);
InSequence seq;
expect_get_modify_timestamp(mock_image_ctx, false);
expect_is_journal_appending(mock_journal, true);
- expect_journal_append_io_event(
- mock_journal, 0, ictx->layout.object_size - 1024, 1024);
+ expect_journal_append_discard_event(
+ mock_journal, 0, {{ictx->layout.object_size - 1024, 1024}});
expect_object_discard_request(
mock_image_ctx, 0, ictx->layout.object_size - 1024, 1024, 0);
InSequence seq;
expect_get_modify_timestamp(mock_image_ctx, false);
expect_is_journal_appending(mock_journal, true);
- EXPECT_CALL(mock_journal, append_io_event_mock(_, _, _, _, _)).Times(0);
+ EXPECT_CALL(mock_journal, append_discard_event(_, _, _)).Times(0);
EXPECT_CALL(*mock_image_ctx.io_object_dispatcher, send(_)).Times(0);
C_SaferCond aio_comp_ctx;
InSequence seq;
expect_get_modify_timestamp(mock_image_ctx, false);
expect_is_journal_appending(mock_journal, true);
- expect_journal_append_io_event(mock_journal, 0, 32, 32);
+ expect_journal_append_discard_event(mock_journal, 0, {{32, 32}});
expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0);
C_SaferCond aio_comp_ctx;
InSequence seq;
expect_get_modify_timestamp(mock_image_ctx, false);
expect_is_journal_appending(mock_journal, true);
- expect_journal_append_io_event(mock_journal, 0, 32, 32);
- expect_journal_append_io_event(mock_journal, 1, 96, 64);
- expect_journal_append_io_event(
- mock_journal, 2, ictx->layout.object_size - 32, 32);
+ expect_journal_append_discard_event(
+ mock_journal, 0,
+ {{32, 32}, {96, 64}, {ictx->layout.object_size - 32, 32}});
expect_object_discard_request(mock_image_ctx, 0, 32, 32, 0);
expect_object_discard_request(mock_image_ctx, 0, 96, 64, 0);
expect_object_discard_request(
ASSERT_EQ(234U, aio_discard_event.length);
}
+TEST_F(TestJournalEntries, AioDiscardWithPrune) {
+ REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+ // The discard path can create multiple image extents (ImageRequest.cc) in the
+ // case where the discard request needs to be pruned and multiple objects are
+ // involved in the request. This test ensures that journal event entries are
+ // queued up for each image extent.
+
+ // Create an image that is multiple objects so that we can force multiple
+ // image extents on the discard path.
+ CephContext* cct = reinterpret_cast<CephContext*>(_rados.cct());
+ auto object_size = 1ull << cct->_conf.get_val<uint64_t>("rbd_default_order");
+ auto image_size = 4 * object_size;
+
+ auto image_name = get_temp_image_name();
+ ASSERT_EQ(0, create_image_pp(m_rbd, m_ioctx, image_name, image_size));
+
+ librbd::ImageCtx *ictx;
+ ASSERT_EQ(0, open_image(image_name, &ictx));
+
+ ::journal::Journaler *journaler = create_journaler(ictx);
+ ASSERT_TRUE(journaler != NULL);
+
+ C_SaferCond cond_ctx;
+ auto c = librbd::io::AioCompletion::create(&cond_ctx);
+ c->get();
+ // We offset the discard by -4096 bytes and set discard granularity to 8192;
+ // this should cause two image extents to be formed in
+ // AbstractImageWriteRequest<I>::send_request().
+ api::Io<>::aio_discard(*ictx, c, object_size - 4096, 2 * object_size, 8192,
+ true);
+ ASSERT_EQ(0, c->wait_for_complete());
+ c->put();
+
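+  // With a discard granularity of 8192, the request above is pruned to
+  // [object_size, object_size * 2) and
+  // [object_size * 2, object_size * 3 - 8192), so we expect one
+  // AIO_DISCARD event per image extent.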
+ for (uint64_t chunk = 0; chunk < 2; chunk++) {
+ auto offset = object_size;
+ auto size = object_size;
+ if (chunk == 1) {
+ offset = object_size * 2;
+ size = object_size - 8192;
+ }
+
+ ::journal::ReplayEntry replay_entry;
+ if (!journaler->try_pop_front(&replay_entry)) {
+ ASSERT_TRUE(wait_for_entries_available(ictx));
+ ASSERT_TRUE(journaler->try_pop_front(&replay_entry));
+ }
+
+ librbd::journal::EventEntry event_entry;
+ ASSERT_TRUE(get_event_entry(replay_entry, &event_entry));
+
+ ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_DISCARD,
+ event_entry.get_event_type());
+
+ librbd::journal::AioDiscardEvent aio_discard_event =
+ boost::get<librbd::journal::AioDiscardEvent>(event_entry.event);
+ ASSERT_EQ(offset, aio_discard_event.offset);
+ ASSERT_EQ(size, aio_discard_event.length);
+
+ journaler->committed(replay_entry);
+ }
+}
+
TEST_F(TestJournalEntries, AioFlush) {
REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
bl.append_zero(length);
std::shared_lock owner_locker{mock_image_ctx.owner_lock};
- return mock_journal->append_write_event(0, length, bl, false);
+ return mock_journal->append_write_event({{0, length}}, bl, false);
}
uint64_t when_append_compare_and_write_event(