]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
librbd: resize and snap create can hang on journal replay failure
authorJason Dillaman <dillaman@redhat.com>
Mon, 6 Jun 2016 23:00:47 +0000 (19:00 -0400)
committerJason Dillaman <dillaman@redhat.com>
Thu, 9 Jun 2016 18:09:48 +0000 (14:09 -0400)
Signed-off-by: Jason Dillaman <dillaman@redhat.com>
(cherry picked from commit 1032f19b3c9d9c7916669d04af909131574b3cab)

src/librbd/Journal.cc
src/librbd/journal/Replay.cc
src/test/librbd/journal/test_mock_Replay.cc

index abd1e1f331d32145ceed7fdb643ec27306c7c8ec..8e59356afbf956f6a9e5a36e037b171f790d39a9 100644 (file)
@@ -1346,6 +1346,10 @@ void Journal<I>::handle_replay_process_safe(ReplayEntry replay_entry, int r) {
   CephContext *cct = m_image_ctx.cct;
 
   m_lock.Lock();
+  assert(m_state == STATE_REPLAYING ||
+         m_state == STATE_FLUSHING_RESTART ||
+         m_state == STATE_FLUSHING_REPLAY);
+
   ldout(cct, 20) << this << " " << __func__ << ": r=" << r << dendl;
   if (r < 0) {
     lderr(cct) << "failed to commit journal event to disk: " << cpp_strerror(r)
@@ -1379,8 +1383,8 @@ void Journal<I>::handle_replay_process_safe(ReplayEntry replay_entry, int r) {
   } else {
     // only commit the entry if written successfully
     m_journaler->committed(replay_entry);
-    m_lock.Unlock();
   }
+  m_lock.Unlock();
 }
 
 template <typename I>
index c6c4f55b602f1a25e0ab6c4e1391ff37c3462fc8..c57202a0ba31b1dad9331a75900d58a496471606 100644 (file)
@@ -87,6 +87,14 @@ struct ExecuteOp : public Context {
   }
 
   virtual void finish(int r) override {
+    CephContext *cct = image_ctx.cct;
+    if (r < 0) {
+      lderr(cct) << "ExecuteOp: " << __func__ << ": r=" << r << dendl;
+      on_op_complete->complete(r);
+      return;
+    }
+
+    ldout(cct, 20) << "ExecuteOp: " << __func__ << dendl;
     RWLock::RLocker owner_locker(image_ctx.owner_lock);
     execute(event);
   }
@@ -102,7 +110,17 @@ struct C_RefreshIfRequired : public Context {
   }
 
   virtual void finish(int r) override {
+    CephContext *cct = image_ctx.cct;
+
+    if (r < 0) {
+      lderr(cct) << "C_RefreshIfRequired: " << __func__ << ": r=" << r << dendl;
+      image_ctx.op_work_queue->queue(on_finish, r);
+      return;
+    }
+
     if (image_ctx.state->is_refresh_required()) {
+      ldout(cct, 20) << "C_RefreshIfRequired: " << __func__ << ": "
+                     << "refresh required" << dendl;
       image_ctx.state->refresh(on_finish);
       return;
     }
@@ -156,8 +174,6 @@ void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) {
   ldout(cct, 20) << this << " " << __func__ << dendl;
 
   AioCompletion *flush_comp = nullptr;
-  OpTids cancel_op_tids;
-  Contexts op_finish_events;
   on_finish = util::create_async_context_callback(
     m_image_ctx, on_finish);
 
@@ -176,7 +192,9 @@ void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) {
         // OpFinishEvent or waiting for ready)
         if (op_event.on_start_ready == nullptr &&
             op_event.on_op_finish_event != nullptr) {
-          cancel_op_tids.push_back(op_event_pair.first);
+          Context *on_op_finish_event = nullptr;
+          std::swap(on_op_finish_event, op_event.on_op_finish_event);
+          m_image_ctx.op_work_queue->queue(on_op_finish_event, -ERESTART);
         }
       } else if (op_event.on_op_finish_event != nullptr) {
         // start ops waiting for OpFinishEvent
@@ -200,9 +218,6 @@ void Replay<I>::shut_down(bool cancel_ops, Context *on_finish) {
     RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
     AioImageRequest<I>::aio_flush(&m_image_ctx, flush_comp);
   }
-  for (auto op_tid : cancel_op_tids) {
-    handle_op_complete(op_tid, -ERESTART);
-  }
   if (on_finish != nullptr) {
     on_finish->complete(0);
   }
@@ -743,10 +758,8 @@ void Replay<I>::handle_op_complete(uint64_t op_tid, int r) {
             op_event.on_finish_safe != nullptr) || shutting_down);
   }
 
-  // skipped upon error -- so clean up if non-null
-  delete op_event.on_op_finish_event;
-  if (r == -ERESTART) {
-    delete op_event.on_op_complete;
+  if (op_event.on_op_finish_event != nullptr) {
+    op_event.on_op_finish_event->complete(r);
   }
 
   if (op_event.on_finish_ready != nullptr) {
index 936622c6df0a81b69036b4460c6832c470448ea1..43f2909b2bb1ce47aa67994c9fed3aef0a53ac0d 100644 (file)
@@ -615,9 +615,15 @@ TEST_F(TestMockJournalReplay, MissingOpFinishEventCancelOps) {
   when_replay_op_ready(mock_journal_replay, 123, &on_resume);
   ASSERT_EQ(0, on_snap_create_ready.wait());
 
-  ASSERT_EQ(0, when_shut_down(mock_journal_replay, true));
-  ASSERT_EQ(-ERESTART, on_snap_remove_safe.wait());
+  C_SaferCond on_shut_down;
+  mock_journal_replay.shut_down(true, &on_shut_down);
+
+  ASSERT_EQ(-ERESTART, on_resume.wait());
+  on_snap_create_finish->complete(-ERESTART);
   ASSERT_EQ(-ERESTART, on_snap_create_safe.wait());
+
+  ASSERT_EQ(-ERESTART, on_snap_remove_safe.wait());
+  ASSERT_EQ(0, on_shut_down.wait());
 }
 
 TEST_F(TestMockJournalReplay, UnknownOpFinishEvent) {