From: Jason Dillaman Date: Mon, 6 Jun 2016 23:00:47 +0000 (-0400) Subject: librbd: resize and snap create can hang on journal replay failure X-Git-Tag: v10.2.2~19^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ef12536830082194da52df3a0dc8e22a3f69cb54;p=ceph.git librbd: resize and snap create can hang on journal replay failure Signed-off-by: Jason Dillaman (cherry picked from commit 1032f19b3c9d9c7916669d04af909131574b3cab) --- diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc index abd1e1f331d3..8e59356afbf9 100644 --- a/src/librbd/Journal.cc +++ b/src/librbd/Journal.cc @@ -1346,6 +1346,10 @@ void Journal::handle_replay_process_safe(ReplayEntry replay_entry, int r) { CephContext *cct = m_image_ctx.cct; m_lock.Lock(); + assert(m_state == STATE_REPLAYING || + m_state == STATE_FLUSHING_RESTART || + m_state == STATE_FLUSHING_REPLAY); + ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl; if (r < 0) { lderr(cct) << "failed to commit journal event to disk: " << cpp_strerror(r) @@ -1379,8 +1383,8 @@ void Journal::handle_replay_process_safe(ReplayEntry replay_entry, int r) { } else { // only commit the entry if written successfully m_journaler->committed(replay_entry); - m_lock.Unlock(); } + m_lock.Unlock(); } template diff --git a/src/librbd/journal/Replay.cc b/src/librbd/journal/Replay.cc index c6c4f55b602f..c57202a0ba31 100644 --- a/src/librbd/journal/Replay.cc +++ b/src/librbd/journal/Replay.cc @@ -87,6 +87,14 @@ struct ExecuteOp : public Context { } virtual void finish(int r) override { + CephContext *cct = image_ctx.cct; + if (r < 0) { + lderr(cct) << "ExecuteOp: " << __func__ << ": r=" << r << dendl; + on_op_complete->complete(r); + return; + } + + ldout(cct, 20) << "ExecuteOp: " << __func__ << dendl; RWLock::RLocker owner_locker(image_ctx.owner_lock); execute(event); } @@ -102,7 +110,17 @@ struct C_RefreshIfRequired : public Context { } virtual void finish(int r) override { + CephContext *cct = image_ctx.cct; + + if (r < 0) { + lderr(cct) << "C_RefreshIfRequired: " << __func__ << ": r=" << r << dendl; + image_ctx.op_work_queue->queue(on_finish, r); + return; + } + if (image_ctx.state->is_refresh_required()) { + ldout(cct, 20) << "C_RefreshIfRequired: " << __func__ << ": " + << "refresh required" << dendl; image_ctx.state->refresh(on_finish); return; } @@ -156,8 +174,6 @@ void Replay::shut_down(bool cancel_ops, Context *on_finish) { ldout(cct, 20) << this << " " << __func__ << dendl; AioCompletion *flush_comp = nullptr; - OpTids cancel_op_tids; - Contexts op_finish_events; on_finish = util::create_async_context_callback( m_image_ctx, on_finish); @@ -176,7 +192,9 @@ void Replay::shut_down(bool cancel_ops, Context *on_finish) { // OpFinishEvent or waiting for ready) if (op_event.on_start_ready == nullptr && op_event.on_op_finish_event != nullptr) { - cancel_op_tids.push_back(op_event_pair.first); + Context *on_op_finish_event = nullptr; + std::swap(on_op_finish_event, op_event.on_op_finish_event); + m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART); } } else if (op_event.on_op_finish_event != nullptr) { // start ops waiting for OpFinishEvent @@ -200,9 +218,6 @@ void Replay::shut_down(bool cancel_ops, Context *on_finish) { RWLock::RLocker owner_locker(m_image_ctx.owner_lock); AioImageRequest::aio_flush(&m_image_ctx, flush_comp); } - for (auto op_tid : cancel_op_tids) { - handle_op_complete(op_tid, -ERESTART); - } if (on_finish != nullptr) { on_finish->complete(0); } @@ -743,10 +758,8 @@ void Replay::handle_op_complete(uint64_t op_tid, int r) { op_event.on_finish_safe != nullptr) || shutting_down); } - // skipped upon error -- so clean up if non-null - delete op_event.on_op_finish_event; - if (r == -ERESTART) { - delete op_event.on_op_complete; + if (op_event.on_op_finish_event != nullptr) { + op_event.on_op_finish_event->complete(r); } if (op_event.on_finish_ready != nullptr) { diff --git a/src/test/librbd/journal/test_mock_Replay.cc b/src/test/librbd/journal/test_mock_Replay.cc index 936622c6df0a..43f2909b2bb1 100644 --- a/src/test/librbd/journal/test_mock_Replay.cc +++ b/src/test/librbd/journal/test_mock_Replay.cc @@ -615,9 +615,15 @@ TEST_F(TestMockJournalReplay, MissingOpFinishEventCancelOps) { when_replay_op_ready(mock_journal_replay, 123, &on_resume); ASSERT_EQ(0, on_snap_create_ready.wait()); - ASSERT_EQ(0, when_shut_down(mock_journal_replay, true)); - ASSERT_EQ(-ERESTART, on_snap_remove_safe.wait()); + C_SaferCond on_shut_down; + mock_journal_replay.shut_down(true, &on_shut_down); + + ASSERT_EQ(-ERESTART, on_resume.wait()); + on_snap_create_finish->complete(-ERESTART); ASSERT_EQ(-ERESTART, on_snap_create_safe.wait()); + + ASSERT_EQ(-ERESTART, on_snap_remove_safe.wait()); + ASSERT_EQ(0, on_shut_down.wait()); } TEST_F(TestMockJournalReplay, UnknownOpFinishEvent) {