From: Patrick Donnelly
Date: Thu, 3 May 2018 20:12:54 +0000 (-0700)
Subject: qa: add mds deactivation procedure for upgrades
X-Git-Tag: v13.1.1~50^2~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=34f395512a0e2ddcf219f65406a748f564513c50;p=ceph-ci.git

qa: add mds deactivation procedure for upgrades

Signed-off-by: Patrick Donnelly
(cherry picked from commit 6a788bf203dc07d32f299ce488b054addaae4f75)
---

diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index d22126a9baa..4ff3cc01413 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -9,6 +9,7 @@ import datetime
 import re
 import errno
 import random
+import traceback
 
 from teuthology.exceptions import CommandFailedError
 from teuthology import misc
@@ -432,6 +433,42 @@ class Filesystem(MDSCluster):
             raise RuntimeError("cannot specify fscid when configuring overlay")
         self.metadata_overlay = overlay
 
+    def deactivate(self, rank):
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        elif rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+    def reach_max_mds(self):
+        # Try to reach rank count == max_mds, up or down (UPGRADE SENSITIVE!)
+        status = self.getinfo()
+        mds_map = self.get_mds_map(status=status)
+        max_mds = mds_map['max_mds']
+
+        count = len(list(self.get_ranks(status=status)))
+        if count > max_mds:
+            try:
+                # deactivate mds in descending order
+                status = self.wait_for_daemons(status=status, skip_max_mds_check=True)
+                while count > max_mds:
+                    targets = sorted(self.get_ranks(status=status), key=lambda r: r['rank'], reverse=True)
+                    target = targets[0]
+                    log.info("deactivating rank %d" % target['rank'])
+                    self.deactivate(target['rank'])
+                    status = self.wait_for_daemons(skip_max_mds_check=True)
+                    count = len(list(self.get_ranks(status=status)))
+            except:
+                # In Mimic, deactivation is done automatically:
+                log.info("Error:\n{}".format(traceback.format_exc()))
+                status = self.wait_for_daemons()
+        else:
+            status = self.wait_for_daemons()
+
+        mds_map = self.get_mds_map(status=status)
+        assert(mds_map['max_mds'] == max_mds)
+        assert(mds_map['in'] == list(range(0, max_mds)))
+
     def set_var(self, var, *args):
         a = map(str, args)
         self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
@@ -631,7 +668,7 @@ class Filesystem(MDSCluster):
     def get_usage(self):
         return self._df()['stats']['total_used_bytes']
 
-    def are_daemons_healthy(self, status=None):
+    def are_daemons_healthy(self, status=None, skip_max_mds_check=False):
         """
         Return true if all daemons are in one of active, standby, standby-replay, and
         at least max_mds daemons are in 'active'.
@@ -671,30 +708,34 @@ class Filesystem(MDSCluster):
             active_count, mds_map['max_mds']
         ))
 
-        if active_count > mds_map['max_mds']:
-            log.info("are_daemons_healthy: number of actives is grater than max_mds: {0}".format(mds_map))
-            return False
-        elif active_count == mds_map['max_mds']:
-            # The MDSMap says these guys are active, but let's check they really are
-            for mds_id, mds_status in mds_map['info'].items():
-                if mds_status['state'] == 'up:active':
-                    try:
-                        daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
-                    except CommandFailedError as cfe:
-                        if cfe.exitstatus == errno.EINVAL:
-                            # Old version, can't do this check
-                            continue
-                        else:
-                            # MDS not even running
+        if not skip_max_mds_check:
+            if active_count > mds_map['max_mds']:
+                log.info("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map))
+                return False
+            elif active_count == mds_map['max_mds']:
+                # The MDSMap says these guys are active, but let's check they really are
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        try:
+                            daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+                        except CommandFailedError as cfe:
+                            if cfe.exitstatus == errno.EINVAL:
+                                # Old version, can't do this check
+                                continue
+                            else:
+                                # MDS not even running
+                                return False
+
+                        if daemon_status['state'] != 'up:active':
+                            # MDS hasn't taken the latest map yet
                             return False
 
-                    if daemon_status['state'] != 'up:active':
-                        # MDS hasn't taken the latest map yet
-                        return False
-
-            return True
+                return True
+            else:
+                return False
         else:
-            return False
+            log.info("are_daemons_healthy: skipping max_mds check")
+            return True
 
     def get_daemon_names(self, state=None, status=None):
         """
@@ -753,7 +794,7 @@ class Filesystem(MDSCluster):
         return result
 
-    def wait_for_daemons(self, timeout=None):
+    def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None):
         """
         Wait until all daemons are healthy
         :return:
@@ -762,10 +803,12 @@
         if timeout is None:
             timeout = DAEMON_WAIT_TIMEOUT
 
+        if status is None:
+            status = self.status()
+
         elapsed = 0
         while True:
-            status = self.status()
-            if self.are_daemons_healthy(status=status):
+            if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check):
                 return status
             else:
                 time.sleep(1)
@@ -775,6 +818,8 @@
                 log.info("status = {0}".format(status))
                 raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
 
+        status = self.status()
+
     def get_lone_mds_id(self):
         """
         Get a single MDS ID: the only one if there is only one
diff --git a/qa/tasks/mds_pre_upgrade.py b/qa/tasks/mds_pre_upgrade.py
index 5193f92eefa..0856d48337c 100644
--- a/qa/tasks/mds_pre_upgrade.py
+++ b/qa/tasks/mds_pre_upgrade.py
@@ -25,20 +25,7 @@ def task(ctx, config):
     status = fs.getinfo()
     fs.set_max_mds(1)
-    status = fs.getinfo()
-    targets = filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status))
-    if len(targets) > 0:
-        # deactivate mds in decending order
-        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
-        for target in targets:
-            self.log("deactivating rank %d" % target['rank'])
-            self.fs.deactivate(target['rank'])
-            status = self.wait_for_stable()[0]
-    else:
-        status = self.wait_for_stable()[0]
-
-    assert(fs.get_mds_map(status=status)['max_mds'] == 1)
-    assert(fs.get_mds_map(status=status)['in'] == [0])
+    fs.reach_max_mds()
 
     # Stop standbys now to minimize time rank 0 is down in subsequent:
     # tasks:
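
The pre-upgrade procedure this commit encodes, as used by mds_pre_upgrade.py above, is: lower max_mds to 1, then deactivate the remaining ranks from highest to lowest, waiting for the daemons to settle after each step. A minimal sketch of driving the new helpers from a QA task follows; the import path and the Filesystem(ctx) constructor are assumptions based on the surrounding qa code, not part of this commit:

    # Sketch only: assumes a teuthology ctx and the qa/tasks/cephfs Filesystem wrapper.
    from tasks.cephfs.filesystem import Filesystem

    def shrink_to_single_rank(ctx):
        fs = Filesystem(ctx)   # handle to the existing filesystem (assumed constructor)
        fs.set_max_mds(1)      # lower the rank target first
        fs.reach_max_mds()     # deactivate ranks N..1, waiting for daemons between steps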