mds: on suicide(), send a DNE beacon to MDSMonitor

author John Spray <john.spray@redhat.com>

Tue, 12 May 2015 13:56:44 +0000 (14:56 +0100)

committer John Spray <john.spray@redhat.com>

Thu, 14 May 2015 12:58:13 +0000 (13:58 +0100)
author John Spray <john.spray@redhat.com>
Tue, 12 May 2015 13:56:44 +0000 (14:56 +0100)
committer John Spray <john.spray@redhat.com>
Thu, 14 May 2015 12:58:13 +0000 (13:58 +0100)
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc

index 96745fecf0321b714827a500034d6b2b98fa7ef2..61f977ea3ae7a3ebc748f129a0bcac82af5dc02e 100644 (file)
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1073,7 +1073,7 @@ int MDS::init(MDSMap::DaemonState wanted_state)
      standby_for_rank = MDSMap::MDS_MATCHED_ACTIVE;
  
    beacon.init(mdsmap, want_state, standby_for_rank, standby_for_name);
-  whoami = -1;
+  whoami = MDS_RANK_NONE;
    messenger->set_myname(entity_name_t::MDS(whoami));
    
    // schedule tick
@@ -1627,6 +1627,9 @@ void MDS::handle_mds_map(MMDSMap *m)
               dout(1) << "handle_mds_map i (" << addr
                       << ") dne in the mdsmap, new instance has larger gid " << i.global_id
                       << ", suicide" << dendl;
+              // Call suicide() rather than respawn() because if someone else
+              // has taken our ID, we don't want to keep restarting and
+              // fighting them for the ID.
               suicide();
               goto out;
             }
@@ -2385,6 +2388,8 @@ void MDS::handle_signal(int signum)
  
  void MDS::damaged()
  {
+  assert(whoami != MDS_RANK_NONE);
+
    set_want_state(MDSMap::STATE_DAMAGED);
    monc->flush_log();  // Flush any clog error from before we were called
    beacon.notify_health(this);  // Include latest status in our swan song
@@ -2397,11 +2402,18 @@ void MDS::damaged()
    respawn();  // Respawn into standby in case mon has other work for us
  }
  
-void MDS::suicide()
+void MDS::suicide(bool fast)
  {
    assert(mds_lock.is_locked());
    set_want_state(MDSMap::STATE_DNE); // whatever.
  
+  if (!fast && !mdsmap->is_dne_gid(mds_gid_t(monc->get_global_id()))) {
+    // Notify the MDSMonitor that we're dying, so that it doesn't have to
+    // wait for us to go laggy.  Only do this if we're actually in the
+    // MDSMap, because otherwise the MDSMonitor will drop our message.
+    beacon.send_and_wait(1);
+  }
+
    dout(1) << "suicide.  wanted " << ceph_mds_state_name(want_state)
           << ", now " << ceph_mds_state_name(state) << dendl;
  
@@ -2482,7 +2494,7 @@ void MDS::respawn()
  
    dout(0) << "respawn execv " << orig_argv[0]
           << " failed with " << cpp_strerror(errno) << dendl;
-  suicide();
+  suicide(true);
  }
  
  void MDS::handle_write_error(int err)
diff --git a/src/mds/MDS.h b/src/mds/MDS.h

index 4820299d7bf0439259046616cddee187946f3914..f7335e46bca13cc851c9db52ffc4a3a0442b3646 100644 (file)
--- a/src/mds/MDS.h
+++ b/src/mds/MDS.h
@@ -451,7 +451,19 @@ private:
     * through cleaner scrub/repair mechanisms.
     */
    void damaged();
-  void suicide();
+
+  /**
+   * Terminate this daemon process.
+   *
+   * @param fast: if true, do not send a message to the mon before shutting
+   *              down
+   */
+  void suicide(bool fast = false);
+
+  /**
+   * Start a new daemon process with the same command line parameters that
+   * this process was run with, then terminate this process
+   */
    void respawn();
    void handle_write_error(int err);
author	John Spray <john.spray@redhat.com>
	Tue, 12 May 2015 13:56:44 +0000 (14:56 +0100)
committer	John Spray <john.spray@redhat.com>
	Thu, 14 May 2015 12:58:13 +0000 (13:58 +0100)
src/mds/MDS.cc		patch | blob | history
src/mds/MDS.h		patch | blob | history