]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Fix Paxos shutdown handling for commit_finish race
authorDavid Zafman <dzafman@redhat.com>
Mon, 7 Aug 2017 19:48:27 +0000 (12:48 -0700)
committerDavid Zafman <dzafman@redhat.com>
Thu, 10 Aug 2017 19:37:04 +0000 (12:37 -0700)
Fixes: http://tracker.ceph.com/issues/20921
Signed-off-by: David Zafman <dzafman@redhat.com>
src/mon/Paxos.cc
src/mon/Paxos.h

index 0109643b52044b35bf9edba0a3397ec4cdca1498..3555d60dba5d728e45330f19ade22cef21b9ab78 100644 (file)
@@ -817,10 +817,22 @@ struct C_Committed : public Context {
   void finish(int r) override {
     assert(r >= 0);
     Mutex::Locker l(paxos->mon->lock);
+    if (paxos->is_shutdown()) {
+      paxos->abort_commit();
+      return;
+    }
     paxos->commit_finish();
   }
 };
 
+void Paxos::abort_commit()
+{
+  assert(commits_started > 0);
+  --commits_started;
+  if (commits_started == 0)
+    shutdown_cond.Signal();
+}
+
 void Paxos::commit_start()
 {
   dout(10) << __func__ << " " << (last_committed+1) << dendl;
@@ -855,6 +867,7 @@ void Paxos::commit_start()
     state = STATE_WRITING;
   else
     ceph_abort();
+  ++commits_started;
 
   if (mon->get_quorum().size() > 1) {
     // cancel timeout event
@@ -910,6 +923,8 @@ void Paxos::commit_finish()
   // it doesn't need to flush the store queue
   assert(is_writing() || is_writing_previous());
   state = STATE_REFRESH;
+  assert(commits_started > 0);
+  --commits_started;
 
   if (do_refresh()) {
     commit_proposal();
@@ -1301,9 +1316,17 @@ void Paxos::shutdown()
 {
   dout(10) << __func__ << " cancel all contexts" << dendl;
 
+  state = STATE_SHUTDOWN;
+
   // discard pending transaction
   pending_proposal.reset();
 
+  // Let store finish commits in progress
+  // XXX: I assume I can't use finish_contexts() because the store
+  // is going to trigger
+  while(commits_started > 0)
+    shutdown_cond.Wait(mon->lock);
+
   finish_contexts(g_ceph_context, waiting_for_writeable, -ECANCELED);
   finish_contexts(g_ceph_context, waiting_for_commit, -ECANCELED);
   finish_contexts(g_ceph_context, waiting_for_readable, -ECANCELED);
index 4ba84e6cc45b57be74910b148b0f5efcbd537d61..896aafa07eaf1a714c8adac57ae2300399278aa8 100644 (file)
@@ -230,6 +230,8 @@ public:
     STATE_WRITING_PREVIOUS,
     // leader: refresh following a commit
     STATE_REFRESH,
+    // Shutdown after WRITING or WRITING_PREVIOUS
+    STATE_SHUTDOWN
   };
 
   /**
@@ -257,6 +259,8 @@ public:
       return "writing-previous";
     case STATE_REFRESH:
       return "refresh";
+    case STATE_SHUTDOWN:
+      return "shutdown";
     default:
       return "UNKNOWN";
     }
@@ -270,6 +274,9 @@ private:
   /**
    * @}
    */
+  int commits_started = 0;
+
+  Cond shutdown_cond;
 
 public:
   /**
@@ -306,6 +313,9 @@ public:
   /// @return 'true' if we are refreshing an update just committed
   bool is_refresh() const { return state == STATE_REFRESH; }
 
+  /// @return 'true' if we are in the process of shutting down
+  bool is_shutdown() const { return state == STATE_SHUTDOWN; }
+
 private:
   /**
    * @defgroup Paxos_h_recovery_vars Common recovery-related member variables
@@ -880,6 +890,7 @@ private:
    */
   void commit_start();
   void commit_finish();   ///< finish a commit after txn becomes durable
+  void abort_commit();    ///< Handle commit finish after shutdown started
   /**
    * Commit the new value to stable storage as being the latest available
    * version.