From: Patrick Donnelly
Date: Thu, 3 May 2018 20:12:54 +0000 (-0700)
Subject: qa: add mds deactivation procedure for upgrades
X-Git-Tag: v13.1.1~50^2~2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=34f395512a0e2ddcf219f65406a748f564513c50;p=ceph-ci.git

qa: add mds deactivation procedure for upgrades

Signed-off-by: Patrick Donnelly
(cherry picked from commit 6a788bf203dc07d32f299ce488b054addaae4f75)
---

diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index d22126a9baa..4ff3cc01413 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -9,6 +9,7 @@ import datetime
 import re
 import errno
 import random
+import traceback
 
 from teuthology.exceptions import CommandFailedError
 from teuthology import misc
@@ -432,6 +433,42 @@ class Filesystem(MDSCluster):
             raise RuntimeError("cannot specify fscid when configuring overlay")
         self.metadata_overlay = overlay
 
+    def deactivate(self, rank):
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        elif rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+    def reach_max_mds(self):
+        # Try to reach rank count == max_mds, up or down (UPGRADE SENSITIVE!)
+        status = self.getinfo()
+        mds_map = self.get_mds_map(status=status)
+        max_mds = mds_map['max_mds']
+
+        count = len(list(self.get_ranks(status=status)))
+        if count > max_mds:
+            try:
+                # deactivate mds in descending order
+                status = self.wait_for_daemons(status=status, skip_max_mds_check=True)
+                while count > max_mds:
+                    targets = sorted(self.get_ranks(status=status), key=lambda r: r['rank'], reverse=True)
+                    target = targets[0]
+                    log.info("deactivating rank %d" % target['rank'])
+                    self.deactivate(target['rank'])
+                    status = self.wait_for_daemons(skip_max_mds_check=True)
+                    count = len(list(self.get_ranks(status=status)))
+            except:
+                # In Mimic, deactivation is done automatically:
+                log.info("Error:\n{}".format(traceback.format_exc()))
+                status = self.wait_for_daemons()
+        else:
+            status = self.wait_for_daemons()
+
+        mds_map = self.get_mds_map(status=status)
+        assert(mds_map['max_mds'] == max_mds)
+        assert(mds_map['in'] == list(range(0, max_mds)))
+
     def set_var(self, var, *args):
         a = map(str, args)
         self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
@@ -631,7 +668,7 @@ class Filesystem(MDSCluster):
     def get_usage(self):
         return self._df()['stats']['total_used_bytes']
 
-    def are_daemons_healthy(self, status=None):
+    def are_daemons_healthy(self, status=None, skip_max_mds_check=False):
         """
         Return true if all daemons are in one of active, standby, standby-replay, and
         at least max_mds daemons are in 'active'.
@@ -671,30 +708,34 @@ class Filesystem(MDSCluster):
             active_count, mds_map['max_mds']
         ))
 
-        if active_count > mds_map['max_mds']:
-            log.info("are_daemons_healthy: number of actives is grater than max_mds: {0}".format(mds_map))
-            return False
-        elif active_count == mds_map['max_mds']:
-            # The MDSMap says these guys are active, but let's check they really are
-            for mds_id, mds_status in mds_map['info'].items():
-                if mds_status['state'] == 'up:active':
-                    try:
-                        daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
-                    except CommandFailedError as cfe:
-                        if cfe.exitstatus == errno.EINVAL:
-                            # Old version, can't do this check
-                            continue
-                        else:
-                            # MDS not even running
+        if not skip_max_mds_check:
+            if active_count > mds_map['max_mds']:
+                log.info("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map))
+                return False
+            elif active_count == mds_map['max_mds']:
+                # The MDSMap says these guys are active, but let's check they really are
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        try:
+                            daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+                        except CommandFailedError as cfe:
+                            if cfe.exitstatus == errno.EINVAL:
+                                # Old version, can't do this check
+                                continue
+                            else:
+                                # MDS not even running
+                                return False
+
+                        if daemon_status['state'] != 'up:active':
+                            # MDS hasn't taken the latest map yet
                             return False
 
-                    if daemon_status['state'] != 'up:active':
-                        # MDS hasn't taken the latest map yet
-                        return False
-
-            return True
+                return True
+            else:
+                return False
         else:
-            return False
+            log.info("are_daemons_healthy: skipping max_mds check")
+            return True
 
     def get_daemon_names(self, state=None, status=None):
         """
@@ -753,7 +794,7 @@ class Filesystem(MDSCluster):
         return result
 
-    def wait_for_daemons(self, timeout=None):
+    def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None):
         """
         Wait until all daemons are healthy
         :return:
@@ -762,10 +803,12 @@
         if timeout is None:
             timeout = DAEMON_WAIT_TIMEOUT
 
+        if status is None:
+            status = self.status()
+
         elapsed = 0
         while True:
-            status = self.status()
-            if self.are_daemons_healthy(status=status):
+            if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check):
                 return status
             else:
                 time.sleep(1)
@@ -775,6 +818,8 @@
                 log.info("status = {0}".format(status))
                 raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
 
+        status = self.status()
+
     def get_lone_mds_id(self):
         """
         Get a single MDS ID: the only one if there is only one
diff --git a/qa/tasks/mds_pre_upgrade.py b/qa/tasks/mds_pre_upgrade.py
index 5193f92eefa..0856d48337c 100644
--- a/qa/tasks/mds_pre_upgrade.py
+++ b/qa/tasks/mds_pre_upgrade.py
@@ -25,20 +25,7 @@ def task(ctx, config):
     status = fs.getinfo()
     fs.set_max_mds(1)
-    status = fs.getinfo()
-    targets = filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status))
-    if len(targets) > 0:
-        # deactivate mds in decending order
-        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
-        for target in targets:
-            self.log("deactivating rank %d" % target['rank'])
-            self.fs.deactivate(target['rank'])
-            status = self.wait_for_stable()[0]
-    else:
-        status = self.wait_for_stable()[0]
-
-    assert(fs.get_mds_map(status=status)['max_mds'] == 1)
-    assert(fs.get_mds_map(status=status)['in'] == [0])
+    fs.reach_max_mds()
 
     # Stop standbys now to minimize time rank 0 is down in subsequent:
     # tasks:
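
The pre-upgrade procedure this commit encodes, as used by mds_pre_upgrade.py above, is: lower max_mds to 1, then deactivate the remaining ranks from highest to lowest, waiting for the daemons to settle after each step. A minimal sketch of driving the new helpers from a QA task follows; the import path and the Filesystem(ctx) constructor are assumptions based on the surrounding qa code, not part of this commit:

    # Sketch only: assumes a teuthology ctx and the qa/tasks/cephfs Filesystem wrapper.
    from tasks.cephfs.filesystem import Filesystem

    def shrink_to_single_rank(ctx):
        fs = Filesystem(ctx)   # handle to the existing filesystem (assumed constructor)
        fs.set_max_mds(1)      # lower the rank target first
        fs.reach_max_mds()     # deactivate ranks N..1, waiting for daemons between steps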