qa/tasks/cephfs: add test for discontinuous mdsmap
author     Yan, Zheng <zyan@redhat.com>
           Thu, 19 Jul 2018 04:36:31 +0000 (12:36 +0800)
committer  Yan, Zheng <zyan@redhat.com>
           Sun, 22 Jul 2018 10:22:35 +0000 (18:22 +0800)
This commit is not cherry-picked from 5e843162 because MDS internals
have changed since mimic; there is a major difference in how the
problem is reproduced.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
qa/tasks/cephfs/filesystem.py
qa/tasks/cephfs/test_failover.py
qa/tasks/vstart_runner.py
src/common/options.cc
src/mds/MDSDaemon.cc
src/mds/Migrator.cc
src/mds/Migrator.h

index b3950441990866a76458a78e9dfc9a20a4068a32..393d69e53790a60179f4b03577c314d641685bb2 100644 (file)
@@ -268,6 +268,12 @@ class MDSCluster(CephCluster):
 
         self._one_or_all(mds_id, _fail_restart)
 
+    def mds_signal(self, mds_id, sig, silent=False):
+        """
+        signal a MDS daemon
+        """
+        self.mds_daemons[mds_id].signal(sig, silent);
+
     def newfs(self, name='cephfs', create=True):
         return Filesystem(self._ctx, name=name, create=create)
 
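The new mds_signal() helper is exercised by the test below via SIGSTOP and
SIGCONT. A minimal usage sketch, assuming a populated MDSCluster instance
mds_cluster and an MDS id mds_b as provided by the test harness:

    import signal

    # Freeze the daemon: it keeps its connections but stops sending
    # beacons, so the monitor eventually marks its rank laggy.
    mds_cluster.mds_signal(mds_b, signal.SIGSTOP)

    # Thaw it later; it resumes beacons and catches up on missed mdsmaps.
    mds_cluster.mds_signal(mds_b, signal.SIGCONT)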
index 9d3392c6953834c286c0ad010db0d85a027e76b3..dd8416136d3c66038330de27f41ad0017c48a3ed 100644 (file)
@@ -1,3 +1,5 @@
+import time
+import signal
 import json
 import logging
 from unittest import case, SkipTest
@@ -133,8 +135,59 @@ class TestFailover(CephFSTestCase):
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
         self.wait_for_health_clear(timeout=30)
 
+    def test_discontinuous_mdsmap(self):
+        """
+        That a discontinuous mdsmap does not affect failover.
+        See http://tracker.ceph.com/issues/24856.
+        """
+        mds_ids = sorted(self.mds_cluster.mds_ids)
+        mds_a, mds_b = mds_ids[0:2]
+        # Assign each mds a fixed rank, to prevent a standby mds from replacing the frozen mds
+        rank = 0
+        for mds_id in mds_ids:
+            self.set_conf("mds.{0}".format(mds_id), "mds_standby_for_rank", str(rank))
+            rank += 1
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_state('up:active', rank=1)
+
+        # Drop the 'export prep' message, so the import stays in the 'discovered' state
+        self.fs.mds_asok(['config', 'set', 'mds_inject_migrator_message_loss', '82'], mds_id=mds_b)
+
+        self.mount_a.run_shell(["mkdir", "a"])
+        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+        self.mount_a.umount_wait()
+
+        # Should be long enough for the export to start
+        time.sleep(30)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+        monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
 
+        # Freeze mds_b
+        self.mds_cluster.mds_signal(mds_b, signal.SIGSTOP)
+        self.wait_until_true(
+            lambda: "laggy_since" in self.fs.mon_manager.get_mds_status(mds_b),
+            timeout=grace * 2
+        )
+
+        self.mds_cluster.mds_restart(mds_a)
+        self.fs.wait_for_state('up:resolve', rank=0, timeout=30)
+
+        # Make sure mds_b's monitor connection gets reset
+        time.sleep(monc_timeout * 2)
+
+        # Unfreeze mds_b; it will receive a discontinuous mdsmap
+        self.mds_cluster.mds_signal(mds_b, signal.SIGCONT)
+        self.wait_until_true(
+            lambda: "laggy_since" not in self.fs.mon_manager.get_mds_status(mds_b),
+            timeout=grace * 2
+        )
 
+        # Check that mds_b sends a 'resolve' message to mds_a; otherwise mds_a can't become active
+        self.fs.wait_for_state('up:active', rank=0, timeout=30)
 
 class TestStandbyReplay(CephFSTestCase):
     MDSS_REQUIRED = 4
index c3988214abf5c11303ef7ebc2231a686bb9f2e1a..e7f7f68f34570557db146eb87d7b6b63d3e15748 100644 (file)
@@ -373,6 +373,14 @@ class LocalDaemon(object):
 
         self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id])
 
+    def signal(self, sig, silent=False):
+        if not self.running():
+            raise RuntimeError("Can't send signal to non-running daemon")
+
+        os.kill(self._get_pid(), sig)
+        if not silent:
+            log.info("Sent signal {0} to {1}.{2}".format(sig, self.daemon_type, self.daemon_id))
+
 
 def safe_kill(pid):
     """
index 5c83f9527773e566eeb6d20e1d51f6656872ca77..5c1198cc7be989ab714ac86ffb2570cccc1be290 100644 (file)
@@ -6423,6 +6423,11 @@ std::vector<Option> get_mds_options() {
 
     Option("mds_inject_migrator_session_race", Option::TYPE_BOOL, Option::LEVEL_DEV)
      .set_default(false),
+
+    Option("mds_inject_migrator_message_loss", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
   });
 }
 
index 452d5bbf6602be0125b1b629a084790380251b82..9d9428b79628b9029a06a07b3a9dedc244120e87 100644 (file)
@@ -363,6 +363,7 @@ const char** MDSDaemon::get_tracked_conf_keys() const
     "mds_max_purge_ops_per_pg",
     "mds_max_purge_files",
     "mds_inject_migrator_session_race",
+    "mds_inject_migrator_message_loss",
     "clog_to_graylog",
     "clog_to_graylog_host",
     "clog_to_graylog_port",
index 40a89626bc798710342ca357424d873370d4ecf8..cb9404779335c6de32ba168deb079260977c436b 100644 (file)
@@ -110,6 +110,14 @@ public:
 /* This function DOES put the passed message before returning*/
 void Migrator::dispatch(Message *m)
 {
+  if (unlikely(inject_message_loss)) {
+    if (inject_message_loss == m->get_type() - MDS_PORT_MIGRATOR) {
+      dout(0) << "inject message loss " << *m << dendl;
+      m->put();
+      return;
+    }
+  }
+
   switch (m->get_type()) {
     // import
   case MSG_MDS_EXPORTDIRDISCOVER:
@@ -3403,4 +3411,9 @@ void Migrator::handle_conf_change(const struct md_config_t *conf,
     inject_session_race = conf->get_val<bool>("mds_inject_migrator_session_race");
     dout(0) << "mds_inject_migrator_session_race is " << inject_session_race << dendl;
   }
+
+  if (changed.count("mds_inject_migrator_message_loss")) {
+    inject_message_loss = g_conf->get_val<int64_t>("mds_inject_migrator_message_loss");
+    dout(0) << "mds_inject_migrator_message_loss is " << inject_message_loss << dendl;
+  }
 }
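The option's value selects which migrator message to drop: it is matched
against the message type minus MDS_PORT_MIGRATOR, and the test's value of 82
corresponds to the 'export prep' message per the test comment above. A toy
Python model of the guard (the constant is illustrative; real message type
values live in src/msg/Message.h):

    MDS_PORT_MIGRATOR = 0x300   # illustrative base value

    def dispatch(msg_type, payload, inject_message_loss, handlers):
        # Drop the one message whose type offset matches the injected
        # value, simulating loss on the wire; dispatch everything else.
        if inject_message_loss and inject_message_loss == msg_type - MDS_PORT_MIGRATOR:
            return None
        return handlers[msg_type](payload)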
index 148b2fb4fd2c01422109e9f36455929c5269a291..d9f0b518af945ba147df430fb413aeb8692a40c8 100644 (file)
@@ -104,6 +104,7 @@ public:
   // -- cons --
   Migrator(MDSRank *m, MDCache *c) : mds(m), cache(c) {
     inject_session_race = g_conf->get_val<bool>("mds_inject_migrator_session_race");
+    inject_message_loss = g_conf->get_val<int64_t>("mds_inject_migrator_message_loss");
   }
 
   void handle_conf_change(const struct md_config_t *conf,
@@ -352,6 +353,7 @@ private:
   MDSRank *mds;
   MDCache *cache;
   bool inject_session_race = false;
+  int inject_message_loss = 0;
 };
 
 #endif