From: Sage Weil
Date: Mon, 11 Nov 2019 20:30:59 +0000 (+0000)
Subject: qa/tasks/ceph2: make it multicluster-aware
X-Git-Tag: v15.1.0~790^2~21
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2b4c81e62ba53bb47f6cad7f7946422cc441ec6d;p=ceph.git

qa/tasks/ceph2: make it multicluster-aware

Signed-off-by: Sage Weil
---

diff --git a/qa/tasks/ceph2.py b/qa/tasks/ceph2.py
index 5e34263bafe1..665089a4019c 100644
--- a/qa/tasks/ceph2.py
+++ b/qa/tasks/ceph2.py
@@ -36,7 +36,7 @@ CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
 
 log = logging.getLogger(__name__)
 
-def shell(ctx, remote, args, **kwargs):
+def shell(ctx, cluster_name, remote, args, **kwargs):
     testdir = teuthology.get_testdir(ctx)
     return remote.run(
         args=[
@@ -44,20 +44,22 @@ def shell(ctx, remote, args, **kwargs):
             '{}/ceph-daemon'.format(testdir),
             '--image', ctx.image,
             'shell',
-            '-c', '{}/ceph.conf'.format(testdir),
-            '-k', '{}/ceph.keyring'.format(testdir),
-            '--fsid', ctx.fsid,
+            '-c', '{}/{}.conf'.format(testdir, cluster_name),
+            '-k', '{}/{}.keyring'.format(testdir, cluster_name),
+            '--fsid', ctx.ceph[cluster_name].fsid,
             '--',
         ] + args,
         **kwargs
     )
 
 def build_initial_config(ctx, config):
+    cluster_name = config['cluster']
+
     #path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
     conf = configobj.ConfigObj() #path, file_error=True)
     conf.setdefault('global', {})
-    conf['global']['fsid'] = ctx.fsid
+    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
 
     # overrides
     for section, keys in config['conf'].items():
@@ -89,10 +91,11 @@ def normalize_hostnames(ctx):
 
 @contextlib.contextmanager
 def download_ceph_daemon(ctx, config):
-    log.info('Downloading ceph-daemon...')
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
     branch = config.get('ceph-daemon-branch', 'master')
 
+    log.info('Downloading ceph-daemon...')
     ctx.cluster.run(
         args=[
             'curl', '--silent',
@@ -116,7 +119,7 @@
             'sudo',
             '{}/ceph-daemon'.format(testdir),
             'rm-cluster',
-            '--fsid', ctx.fsid,
+            '--fsid', ctx.ceph[cluster_name].fsid,
             '--force',
         ])
 
@@ -130,7 +133,10 @@
     )
 
 @contextlib.contextmanager
-def ceph_log(ctx, config, fsid):
+def ceph_log(ctx, config):
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
     try:
         yield
@@ -177,10 +183,13 @@ def ceph_log(ctx, config):
                 os.path.join(sub, 'log'))
 
 @contextlib.contextmanager
-def ceph_crash(ctx, fsid):
+def ceph_crash(ctx, config):
     """
     Gather crash dumps from /var/lib/ceph/$fsid/crash
     """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
     try:
         yield
@@ -206,23 +215,27 @@ def ceph_crash(ctx, fsid):
             pass
 
 @contextlib.contextmanager
-def ceph_bootstrap(ctx, config, fsid):
+def ceph_bootstrap(ctx, config):
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
+    fsid = ctx.ceph[cluster_name].fsid
 
-    mons = ctx.mons
-    first_mon = sorted(mons.keys())[0]
-    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
-    log.info('First mon is %s on %s' % (first_mon, mon_remote.shortname))
-    ctx.first_mon = first_mon
+    mons = ctx.ceph[cluster_name].mons
+    first_mon_role = sorted(mons.keys())[0]
+    _, _, first_mon = teuthology.split_role(first_mon_role)
+    (mon_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
+    log.info('First mon is mon.%s on %s' % (first_mon, mon_remote.shortname))
+    ctx.ceph[cluster_name].first_mon = first_mon
 
     others = ctx.cluster.remotes[mon_remote]
     log.info('others %s' % others)
-    mgrs = sorted([r for r in others if r.startswith('mgr.')])
+    mgrs = sorted([r for r in others
+                   if teuthology.is_type('mgr', cluster_name)(r)])
     if not mgrs:
         raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
-    first_mgr = mgrs[0]
+    _, _, first_mgr = teuthology.split_role(mgrs[0])
     log.info('First mgr is %s' % (first_mgr))
-    ctx.first_mgr = first_mgr
+    ctx.ceph[cluster_name].first_mgr = first_mgr
 
     try:
         # write seed config
@@ -232,7 +245,7 @@ def ceph_bootstrap(ctx, config, fsid):
         seed_config.write(conf_fp)
         teuthology.write_file(
             remote=mon_remote,
-            path='{}/seed.ceph.conf'.format(testdir),
+            path='{}/seed.{}.conf'.format(testdir, cluster_name),
             data=conf_fp.getvalue())
 
         # bootstrap
@@ -243,46 +256,46 @@
             '--image', ctx.image,
             'bootstrap',
             '--fsid', fsid,
-            '--mon-id', first_mon[4:],
-            '--mgr-id', first_mgr[4:],
-            '--config', '{}/seed.ceph.conf'.format(testdir),
-            '--output-config', '{}/ceph.conf'.format(testdir),
-            '--output-keyring', '{}/ceph.keyring'.format(testdir),
-            '--output-pub-ssh-key', '{}/ceph.pub'.format(testdir),
+            '--mon-id', first_mon,
+            '--mgr-id', first_mgr,
+            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '--output-config', '{}/{}.conf'.format(testdir, cluster_name),
+            '--output-keyring', '{}/{}.keyring'.format(testdir, cluster_name),
+            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
         ]
-        if mons[first_mon].startswith('['):
-            cmd += ['--mon-addrv', mons[first_mon]]
+        if mons[first_mon_role].startswith('['):
+            cmd += ['--mon-addrv', mons[first_mon_role]]
         else:
-            cmd += ['--mon-ip', mons[first_mon]]
+            cmd += ['--mon-ip', mons[first_mon_role]]
         if config.get('skip_dashboard'):
             cmd += ['--skip-dashboard']
         # bootstrap makes the keyring root 0600, so +r it for our purposes
         cmd += [
             run.Raw('&&'),
-            'sudo', 'chmod', '+r', '{}/ceph.keyring'.format(testdir),
+            'sudo', 'chmod', '+r', '{}/{}.keyring'.format(testdir, cluster_name),
         ]
         mon_remote.run(args=cmd)
 
         # fetch keys and configs
         log.info('Fetching config...')
-        ctx.config_file = teuthology.get_file(
+        ctx.ceph[cluster_name].config_file = teuthology.get_file(
            remote=mon_remote,
-            path='{}/ceph.conf'.format(testdir))
+            path='{}/{}.conf'.format(testdir, cluster_name))
         log.info('Fetching client.admin keyring...')
-        ctx.admin_keyring = teuthology.get_file(
+        ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
            remote=mon_remote,
-            path='{}/ceph.keyring'.format(testdir))
+            path='{}/{}.keyring'.format(testdir, cluster_name))
         log.info('Fetching mon keyring...')
-        ctx.mon_keyring = teuthology.get_file(
+        ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
            remote=mon_remote,
-            path='/var/lib/ceph/%s/%s/keyring' % (fsid, first_mon),
+            path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
            sudo=True)
 
         # fetch ssh key, distribute to additional nodes
         log.info('Fetching pub ssh key...')
         ssh_pub_key = teuthology.get_file(
            remote=mon_remote,
-            path='{}/ceph.pub'.format(testdir)
+            path='{}/{}.pub'.format(testdir, cluster_name)
         ).strip()
 
         log.info('Installing pub ssh key for root users...')
@@ -303,15 +316,15 @@ def ceph_bootstrap(ctx, config, fsid):
             log.info('Writing conf and keyring to %s' % remote.shortname)
             teuthology.write_file(
                 remote=remote,
-                path='{}/ceph.conf'.format(testdir),
-                data=ctx.config_file)
+                path='{}/{}.conf'.format(testdir, cluster_name),
+                data=ctx.ceph[cluster_name].config_file)
             teuthology.write_file(
                 remote=remote,
-                path='{}/ceph.keyring'.format(testdir),
-                data=ctx.admin_keyring)
+                path='{}/{}.keyring'.format(testdir, cluster_name),
+                data=ctx.ceph[cluster_name].admin_keyring)
 
             log.info('Adding host %s to orchestrator...' % remote.shortname)
-            shell(ctx, remote, [
+            shell(ctx, cluster_name, remote, [
                 'ceph', 'orchestrator', 'host', 'add',
                 remote.shortname
             ])
@@ -322,40 +335,56 @@ def ceph_bootstrap(ctx, config, fsid):
         log.info('Cleaning up testdir ceph.* files...')
         ctx.cluster.run(args=[
             'rm', '-f',
-            '{}/seed.ceph.conf'.format(testdir),
-            '{}/ceph.pub'.format(testdir),
-            '{}/ceph.conf'.format(testdir),
-            '{}/ceph.keyring'.format(testdir),
+            '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '{}/{}.pub'.format(testdir, cluster_name),
+            '{}/{}.conf'.format(testdir, cluster_name),
+            '{}/{}.keyring'.format(testdir, cluster_name),
         ])
 
         log.info('Stopping all daemons...')
-        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # this doesn't block until they are all stopped...
+        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # so, stop them individually
+        for remote, roles in ctx.cluster.remotes.items():
+            for role in roles:
+                log.info('Stopping %s on %s...' % (role, remote.shortname))
+                remote.run(args=[
+                    'sudo', 'systemctl', 'stop',
+                    'ceph-%s@%s' % (fsid, role)
+                ])
+
 
 @contextlib.contextmanager
 def ceph_mons(ctx, config):
     """
     Deploy any additional mons
     """
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
     num_mons = 1
 
     try:
         for remote, roles in ctx.cluster.remotes.items():
-            for mon in [r for r in roles if r.startswith('mon.')]:
-                if mon == ctx.first_mon:
+            for mon in [r for r in roles
+                        if teuthology.is_type('mon', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mon)
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                     continue
-                log.info('Adding %s on %s' % (mon, remote.shortname))
+                log.info('Adding mon.%s on %s' % (mon, remote.shortname))
                 num_mons += 1
-                shell(ctx, remote, [
+                shell(ctx, cluster_name, remote, [
                     'ceph', 'orchestrator', 'mon', 'update',
                     str(num_mons),
-                    remote.shortname + ':' + ctx.mons[mon],
+                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + mon.split('.', 1)[1],
                 ])
 
                 while True:
                     log.info('Waiting for %d mons in monmap...'
                              % (num_mons))
                     r = shell(
                         ctx=ctx,
+                        cluster_name=cluster_name,
                         remote=remote,
                         args=[
                             'ceph', 'mon', 'dump', '-f', 'json',
@@ -379,19 +408,21 @@ def ceph_mgrs(ctx, config):
     """
     Deploy any additional mgrs
    """
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
-    (remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()
 
     try:
         nodes = []
         for remote, roles in ctx.cluster.remotes.items():
-            for mgr in [r for r in roles if r.startswith('mgr.')]:
-                if mgr == ctx.first_mgr:
+            for mgr in [r for r in roles
+                        if teuthology.is_type('mgr', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mgr)
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                     continue
-                log.info('Adding %s on %s' % (mgr, remote.shortname))
+                log.info('Adding mgr.%s on %s' % (mgr, remote.shortname))
                 ### FIXME: we don't get to choose the mgr names ####
-                nodes.append(remote.shortname)
-                shell(ctx, remote, [
+                nodes.append(remote.shortname + '=' + mgr.split('.', 1)[1])
+                shell(ctx, cluster_name, remote, [
                     'ceph', 'orchestrator', 'mgr', 'update',
                     str(len(nodes) + 1)] + nodes
                 )
@@ -406,25 +437,19 @@ def ceph_osds(ctx, config):
     """
     Deploy OSDs
     """
+    cluster_name = config['cluster']
     try:
-        log.info('Zapping devices...')
-        devs_by_remote = {}
-        for remote, roles in ctx.cluster.remotes.items():
-            devs = teuthology.get_scratch_devices(remote)
-            for dev in devs:
-                shell(ctx, remote, [
-                    'ceph-volume', 'lvm', 'zap', dev])
-            devs_by_remote[remote] = devs
-
         log.info('Deploying OSDs...')
         for remote, roles in ctx.cluster.remotes.items():
-            devs = devs_by_remote[remote]
+            devs = teuthology.get_scratch_devices(remote)
             for osd in [r for r in roles if r.startswith('osd.')]:
                 assert devs ## FIXME ##
                 dev = devs.pop()
                 log.info('Deploying %s on %s with %s...' % (
                     osd, remote.shortname, dev))
-                shell(ctx, remote, [
+                shell(ctx, cluster_name, remote, [
+                    'ceph-volume', 'lvm', 'zap', dev])
+                shell(ctx, cluster_name, remote, [
                     'ceph', 'orchestrator', 'osd', 'create',
                     remote.shortname + ':' + dev
                 ])
@@ -478,6 +503,31 @@ def stop(ctx, config):
         yield
 
+@contextlib.contextmanager
+def distribute_config_and_admin_keyring(ctx, config):
+    """
+    Distribute a sufficient config and keyring for clients
+    """
+    cluster_name = config['cluster']
+    log.info('Distributing config and client.admin keyring...')
+    for remote, roles in ctx.cluster.remotes.items():
+        remote.run(args=['sudo', 'mkdir', '-p', '/etc/ceph'])
+        teuthology.sudo_write_file(
+            remote=remote,
+            path='/etc/ceph/{}.conf'.format(cluster_name),
+            data=ctx.ceph[cluster_name].config_file)
+        teuthology.sudo_write_file(
+            remote=remote,
+            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+            data=ctx.ceph[cluster_name].admin_keyring)
+    try:
+        yield
+    finally:
+        ctx.cluster.run(args=[
+            'sudo', 'rm', '-f',
+            '/etc/ceph/{}.conf'.format(cluster_name),
+            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+        ])
 
 @contextlib.contextmanager
 def task(ctx, config):
@@ -493,16 +543,13 @@ def task(ctx, config):
 
     testdir = teuthology.get_testdir(ctx)
 
-    ## FIXME i don't understand multicluster ##
+    # set up cluster context
     first_ceph_cluster = False
     if not hasattr(ctx, 'daemons'):
         first_ceph_cluster = True
-        ctx.daemons = DaemonGroup()
-
+        ctx.daemons = DaemonGroup(use_ceph_daemon=True)
     if not hasattr(ctx, 'ceph'):
         ctx.ceph = {}
-
-    ## FIXME i don't understand multicluster ##
     if 'cluster' not in config:
         config['cluster'] = 'ceph'
     cluster_name = config['cluster']
@@ -522,9 +569,7 @@ def task(ctx, config):
 
     # uuid
     fsid = str(uuid.uuid1())
-    ctx.fsid = fsid
     log.info('Cluster fsid is %s' % fsid)
-    ## FIXME i don't understand multicluster ##
     ctx.ceph[cluster_name].fsid = fsid
 
     # mon ips
@@ -533,23 +578,24 @@ def task(ctx, config):
     roles = [role_list for (remote, role_list) in remotes_and_roles]
     ips = [host for (host, port) in
            (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
-    ctx.mons = get_mons(
+    ctx.ceph[cluster_name].mons = get_mons(
         roles, ips, cluster_name,
         mon_bind_msgr2=config.get('mon_bind_msgr2', True),
         mon_bind_addrvec=config.get('mon_bind_addrvec', True),
     )
-    log.info('Monitor IPs: %s' % ctx.mons)
+    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
 
     with contextutil.nested(
             lambda: ceph_initial(),
             lambda: normalize_hostnames(ctx=ctx),
             lambda: download_ceph_daemon(ctx=ctx, config=config),
-            lambda: ceph_log(ctx=ctx, config=config, fsid=fsid),
-            lambda: ceph_crash(ctx=ctx, fsid=fsid),
-            lambda: ceph_bootstrap(ctx=ctx, config=config, fsid=fsid),
+            lambda: ceph_log(ctx=ctx, config=config),
+            lambda: ceph_crash(ctx=ctx, config=config),
+            lambda: ceph_bootstrap(ctx=ctx, config=config),
             lambda: ceph_mons(ctx=ctx, config=config),
             lambda: ceph_mgrs(ctx=ctx, config=config),
             lambda: ceph_osds(ctx=ctx, config=config),
+            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
    ):
        try:
            log.info('Setup complete, yielding