qa/tasks/rebuild_mondb: update to address ceph-mgr changes

author Kefu Chai <kchai@redhat.com>

Sat, 27 May 2017 08:58:12 +0000 (16:58 +0800)

committer Kefu Chai <kchai@redhat.com>

Sun, 28 May 2017 01:59:50 +0000 (09:59 +0800)
author Kefu Chai <kchai@redhat.com>
Sat, 27 May 2017 08:58:12 +0000 (16:58 +0800)
committer Kefu Chai <kchai@redhat.com>
Sun, 28 May 2017 01:59:50 +0000 (09:59 +0800)
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py

index 9b59622ffec7d25f1a40e15c95cacdfac8f6bda3..ab3df5773f41500ae681c3c8f179832bf12c4633 100644 (file)
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -2268,6 +2268,20 @@ class CephManager:
              self.make_admin_daemon_dir(remote)
          self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart()
  
+    def revive_mgr(self, mgr):
+        """
+        Restart by either power cycling (if the config says so),
+        or by doing a normal restart.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mgr', mgr)
+            self.log('revive_mgr on mgr.{m} doing powercycle of {s}'.
+                     format(m=mgr, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.make_admin_daemon_dir(remote)
+        self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()
+
      def get_mon_status(self, mon):
          """
          Extract all the monitor status information from the cluster
diff --git a/qa/tasks/rebuild_mondb.py b/qa/tasks/rebuild_mondb.py

index fe70c641d7aa07661627ebf79779cc0438062225..900bd16a41fbed11b0f1255af8ce2789a34416ef 100644 (file)
--- a/qa/tasks/rebuild_mondb.py
+++ b/qa/tasks/rebuild_mondb.py
@@ -14,7 +14,7 @@ from teuthology import misc as teuthology
  log = logging.getLogger(__name__)
  
  
-def push_directory(path, remote, remote_dir):
+def _push_directory(path, remote, remote_dir):
      """
      local_temp_path=`mktemp`
      tar czf $local_temp_path $path
@@ -46,45 +46,20 @@ def push_directory(path, remote, remote_dir):
      remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
  
  
-def task(ctx, config):
-    """
-    Test monitor recovery from OSD
-    """
-    if config is None:
-        config = {}
-    assert isinstance(config, dict), \
-        'task only accepts a dict for configuration'
-
-    first_mon = teuthology.get_first_mon(ctx, config)
-    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
-
-    manager = ceph_manager.CephManager(
-        mon,
-        ctx=ctx,
-        logger=log.getChild('ceph_manager'))
-
-    mons = ctx.cluster.only(teuthology.is_type('mon'))
+def _nuke_mons(manager, mons, mon_id):
      assert mons
-    # note down the first cluster_name and mon_id
-    # we will recover it later on
-    cluster_name = None
-    mon_id = None
+    is_mon = teuthology.is_type('mon')
      for remote, roles in mons.remotes.iteritems():
-        is_mon = teuthology.is_type('mon')
          for role in roles:
              if not is_mon(role):
                  continue
              cluster, _, m = teuthology.split_role(role)
-            if cluster_name is None:
-                cluster_name = cluster
-                mon_id = m
-            assert cluster_name == cluster
              log.info('killing {cluster}:mon.{mon}'.format(
                  cluster=cluster,
                  mon=m))
              manager.kill_mon(m)
              mon_data = os.path.join('/var/lib/ceph/mon/',
-                                    '{0}-{1}'.format(cluster_name, m))
+                                    '{0}-{1}'.format(cluster, m))
              if m == mon_id:
                  # so we will only need to recreate the store.db for the
                  # first mon, would be easier than mkfs on it then replace
@@ -94,13 +69,15 @@ def task(ctx, config):
              else:
                  remote.run(args=['sudo', 'rm', '-r', mon_data])
  
+
+def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
      local_mstore = tempfile.mkdtemp()
  
      # collect the maps from all OSDs
-    osds = ctx.cluster.only(teuthology.is_type('osd'))
+    is_osd = teuthology.is_type('osd')
+    osds = ctx.cluster.only(is_osd)
      assert osds
      for osd, roles in osds.remotes.iteritems():
-        is_osd = teuthology.is_type('osd')
          for role in roles:
              if not is_osd(role):
                  continue
@@ -113,7 +90,7 @@ def task(ctx, config):
              osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
              osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])
  
-            push_directory(local_mstore, osd, osd_mstore)
+            _push_directory(local_mstore, osd, osd_mstore)
              log.info('rm -rf {0}'.format(local_mstore))
              shutil.rmtree(local_mstore)
              # update leveldb with OSD data
@@ -135,12 +112,10 @@ def task(ctx, config):
      # pull from recovered leveldb from client
      mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                   '{0}-{1}'.format(cluster_name, mon_id))
-    push_directory(local_mstore, mon, mon_store_dir)
+    _push_directory(local_mstore, mon, mon_store_dir)
      mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
      shutil.rmtree(local_mstore)
-    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
-        cluster=cluster_name)
-    keyring_path = config.get('keyring_path', default_keyring)
+
      # fill up the caps in the keyring file
      mon.run(args=['sudo',
                    'ceph-authtool', keyring_path,
@@ -151,23 +126,25 @@ def task(ctx, config):
                    '-n', 'client.admin',
                    '--cap', 'mon', 'allow *',
                    '--cap', 'osd', 'allow *',
-                  '--cap', 'mds', 'allow *'])
+                  '--cap', 'mds', 'allow *',
+                  '--cap', 'mgr', 'allow *'])
      mon.run(args=['sudo', '-u', 'ceph',
                    'ceph-monstore-tool', mon_store_dir,
                    'rebuild', '--', '--keyring',
                    keyring_path])
  
+
+def _revive_mons(manager, mons, recovered, keyring_path):
      # revive monitors
      # the initial monmap is in the ceph.conf, so we are good.
      n_mons = 0
+    is_mon = teuthology.is_type('mon')
      for remote, roles in mons.remotes.iteritems():
-        is_mon = teuthology.is_type('mon')
          for role in roles:
              if not is_mon(role):
                  continue
              cluster, _, m = teuthology.split_role(role)
-            assert cluster_name == cluster
-            if mon_id != m:
+            if recovered != m:
                  log.info('running mkfs on {cluster}:mon.{mon}'.format(
                      cluster=cluster,
                      mon=m))
@@ -179,15 +156,61 @@ def task(ctx, config):
                          '--mkfs',
                          '-i', m,
                          '--keyring', keyring_path])
+            log.info('reviving mon.{0}'.format(m))
              manager.revive_mon(m)
              n_mons += 1
-
      manager.wait_for_mon_quorum_size(n_mons, timeout=30)
-    for osd, roles in osds.remotes.iteritems():
-        is_osd = teuthology.is_type('osd')
+
+
+def _revive_mgrs(ctx, manager):
+    is_mgr = teuthology.is_type('mgr')
+    mgrs = ctx.cluster.only(is_mgr)
+    for _, roles in mgrs.remotes.iteritems():
+        for role in roles:
+            if not is_mgr(role):
+                continue
+            _, _, mgr_id = teuthology.split_role(role)
+            log.info('reviving mgr.{0}'.format(mgr_id))
+            manager.revive_mgr(mgr_id)
+
+
+def _revive_osds(ctx, manager):
+    is_osd = teuthology.is_type('osd')
+    osds = ctx.cluster.only(is_osd)
+    for _, roles in osds.remotes.iteritems():
          for role in roles:
              if not is_osd(role):
                  continue
              _, _, osd_id = teuthology.split_role(role)
              log.info('reviving osd.{0}'.format(osd_id))
              manager.revive_osd(osd_id)
+
+
+def task(ctx, config):
+    """
+    Test monitor recovery from OSD
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'))
+
+    mons = ctx.cluster.only(teuthology.is_type('mon'))
+    # note down the first cluster_name and mon_id
+    # we will recover it later on
+    cluster_name, _, mon_id = teuthology.split_role(first_mon)
+    _nuke_mons(manager, mons, mon_id)
+    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
+        cluster=cluster_name)
+    keyring_path = config.get('keyring_path', default_keyring)
+    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
+    _revive_mons(manager, mons, mon_id, keyring_path)
+    _revive_mgrs(ctx, manager)
+    _revive_osds(ctx, manager)
author	Kefu Chai <kchai@redhat.com>
	Sat, 27 May 2017 08:58:12 +0000 (16:58 +0800)
committer	Kefu Chai <kchai@redhat.com>
	Sun, 28 May 2017 01:59:50 +0000 (09:59 +0800)
qa/tasks/ceph_manager.py		patch \| blob \| history
qa/tasks/rebuild_mondb.py		patch \| blob \| history