qa: add mds deactivation procedure for upgrades
author     Patrick Donnelly <pdonnell@redhat.com>
           Thu, 3 May 2018 20:12:54 +0000 (13:12 -0700)
committer  Patrick Donnelly <pdonnell@redhat.com>
           Thu, 3 May 2018 23:51:43 +0000 (16:51 -0700)
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
qa/tasks/cephfs/filesystem.py
qa/tasks/mds_pre_upgrade.py

qa/tasks/cephfs/filesystem.py
index d22126a9baab844edeecda751662f7dc0cbabf31..4ff3cc01413f12aa35842b9175114d687ef9c245 100644 (file)
@@ -9,6 +9,7 @@ import datetime
 import re
 import errno
 import random
+import traceback
 
 from teuthology.exceptions import CommandFailedError
 from teuthology import misc
@@ -432,6 +433,42 @@ class Filesystem(MDSCluster):
             raise RuntimeError("cannot specify fscid when configuring overlay")
         self.metadata_overlay = overlay
 
+    def deactivate(self, rank):
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        elif rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+    def reach_max_mds(self):
+        # Try to reach rank count == max_mds, up or down (UPGRADE SENSITIVE!)
+        status = self.getinfo()
+        mds_map = self.get_mds_map(status=status)
+        max_mds = mds_map['max_mds']
+
+        count = len(list(self.get_ranks(status=status)))
+        if count > max_mds:
+            try:
+                # deactivate mds in descending order
+                status = self.wait_for_daemons(status=status, skip_max_mds_check=True)
+                while count > max_mds:
+                    targets = sorted(self.get_ranks(status=status), key=lambda r: r['rank'], reverse=True)
+                    target = targets[0]
+                    log.info("deactivating rank %d" % target['rank'])
+                    self.deactivate(target['rank'])
+                    status = self.wait_for_daemons(skip_max_mds_check=True)
+                    count = len(list(self.get_ranks(status=status)))
+            except:
+                # In Mimic, deactivation is done automatically:
+                log.info("Error:\n{}".format(traceback.format_exc()))
+                status = self.wait_for_daemons()
+        else:
+            status = self.wait_for_daemons()
+
+        mds_map = self.get_mds_map(status=status)
+        assert(mds_map['max_mds'] == max_mds)
+        assert(mds_map['in'] == list(range(0, max_mds)))
+
     def set_var(self, var, *args):
         a = map(str, args)
         self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
@@ -631,7 +668,7 @@ class Filesystem(MDSCluster):
     def get_usage(self):
         return self._df()['stats']['total_used_bytes']
 
-    def are_daemons_healthy(self, status=None):
+    def are_daemons_healthy(self, status=None, skip_max_mds_check=False):
         """
         Return true if all daemons are in one of active, standby, standby-replay, and
         at least max_mds daemons are in 'active'.
@@ -671,30 +708,34 @@ class Filesystem(MDSCluster):
             active_count, mds_map['max_mds']
         ))
 
-        if active_count > mds_map['max_mds']:
-            log.info("are_daemons_healthy: number of actives is grater than max_mds: {0}".format(mds_map))
-            return False
-        elif active_count == mds_map['max_mds']:
-            # The MDSMap says these guys are active, but let's check they really are
-            for mds_id, mds_status in mds_map['info'].items():
-                if mds_status['state'] == 'up:active':
-                    try:
-                        daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
-                    except CommandFailedError as cfe:
-                        if cfe.exitstatus == errno.EINVAL:
-                            # Old version, can't do this check
-                            continue
-                        else:
-                            # MDS not even running
+        if not skip_max_mds_check:
+            if active_count > mds_map['max_mds']:
+                log.info("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map))
+                return False
+            elif active_count == mds_map['max_mds']:
+                # The MDSMap says these guys are active, but let's check they really are
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        try:
+                            daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+                        except CommandFailedError as cfe:
+                            if cfe.exitstatus == errno.EINVAL:
+                                # Old version, can't do this check
+                                continue
+                            else:
+                                # MDS not even running
+                                return False
+
+                        if daemon_status['state'] != 'up:active':
+                            # MDS hasn't taken the latest map yet
                             return False
 
-                    if daemon_status['state'] != 'up:active':
-                        # MDS hasn't taken the latest map yet
-                        return False
-
-            return True
+                return True
+            else:
+                return False
         else:
-            return False
+            log.info("are_daemons_healthy: skipping max_mds check")
+            return True
 
     def get_daemon_names(self, state=None, status=None):
         """
@@ -753,7 +794,7 @@ class Filesystem(MDSCluster):
 
         return result
 
-    def wait_for_daemons(self, timeout=None):
+    def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None):
         """
         Wait until all daemons are healthy
         :return:
@@ -762,10 +803,12 @@ class Filesystem(MDSCluster):
         if timeout is None:
             timeout = DAEMON_WAIT_TIMEOUT
 
+        if status is None:
+            status = self.status()
+
         elapsed = 0
         while True:
-            status = self.status()
-            if self.are_daemons_healthy(status=status):
+            if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check):
                 return status
             else:
                 time.sleep(1)
@@ -775,6 +818,8 @@ class Filesystem(MDSCluster):
                 log.info("status = {0}".format(status))
                 raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
 
+            status = self.status()
+
     def get_lone_mds_id(self):
         """
         Get a single MDS ID: the only one if there is only one
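wait_for_daemons now accepts a starting status and refreshes it on every retry instead of fetching it up front each iteration. A self-contained sketch of that polling pattern (assumed simplification; the real method uses DAEMON_WAIT_TIMEOUT and cluster status objects):

import time

def wait_until(predicate, get_status, status=None, timeout=300):
    # Only fetch an initial status if the caller did not pass one in.
    if status is None:
        status = get_status()
    elapsed = 0
    while not predicate(status):
        time.sleep(1)
        elapsed += 1
        if elapsed > timeout:
            raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
        status = get_status()  # re-poll on every iteration
    return status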
qa/tasks/mds_pre_upgrade.py
index 5193f92eefa9b6ed22c74c69aaeb09d89739ae21..0856d48337c436c20d03311161727a4a552f16cc 100644 (file)
@@ -25,20 +25,7 @@ def task(ctx, config):
     status = fs.getinfo()
 
     fs.set_max_mds(1)
-    status = fs.getinfo()
-    targets = filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status))
-    if len(targets) > 0:
-        # deactivate mds in decending order
-        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
-        for target in targets:
-            self.log("deactivating rank %d" % target['rank'])
-            self.fs.deactivate(target['rank'])
-            status = self.wait_for_stable()[0]
-        else:
-            status = self.wait_for_stable()[0]
-
-    assert(fs.get_mds_map(status=status)['max_mds'] == 1)
-    assert(fs.get_mds_map(status=status)['in'] == [0])
+    fs.reach_max_mds()
 
     # Stop standbys now to minimize time rank 0 is down in subsequent:
     # tasks:
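With the helper in place, the pre-upgrade task reduces to the sequence sketched below (assumed outline; the standby-stopping step is only referenced by the trailing comment above, not shown in this hunk):

# Assumed outline of qa/tasks/mds_pre_upgrade.py after this commit:
status = fs.getinfo()
fs.set_max_mds(1)
fs.reach_max_mds()     # leaves only rank 0 active
# ...then stop standby MDS daemons so that rank 0 is down for as little
# time as possible during the upgrade tasks that follow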