qa/tasks/ceph2: make it multicluster-aware

author Sage Weil <sage@redhat.com>

Mon, 11 Nov 2019 20:30:59 +0000 (20:30 +0000)

committer Sage Weil <sage@redhat.com>

Thu, 21 Nov 2019 16:46:54 +0000 (10:46 -0600)
author Sage Weil <sage@redhat.com>
Mon, 11 Nov 2019 20:30:59 +0000 (20:30 +0000)
committer Sage Weil <sage@redhat.com>
Thu, 21 Nov 2019 16:46:54 +0000 (10:46 -0600)
diff --git a/qa/tasks/ceph2.py b/qa/tasks/ceph2.py

index 5e34263bafe1eb9946b61081e0e713f52b58b3de..665089a4019c001a909f8ff28c70cb2395f78933 100644 (file)
--- a/qa/tasks/ceph2.py
+++ b/qa/tasks/ceph2.py
@@ -36,7 +36,7 @@ CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
  log = logging.getLogger(__name__)
  
  
-def shell(ctx, remote, args, **kwargs):
+def shell(ctx, cluster_name, remote, args, **kwargs):
      testdir = teuthology.get_testdir(ctx)
      return remote.run(
          args=[
@@ -44,20 +44,22 @@ def shell(ctx, remote, args, **kwargs):
              '{}/ceph-daemon'.format(testdir),
              '--image', ctx.image,
              'shell',
-            '-c', '{}/ceph.conf'.format(testdir),
-            '-k', '{}/ceph.keyring'.format(testdir),
-            '--fsid', ctx.fsid,
+            '-c', '{}/{}.conf'.format(testdir, cluster_name),
+            '-k', '{}/{}.keyring'.format(testdir, cluster_name),
+            '--fsid', ctx.ceph[cluster_name].fsid,
              '--',
              ] + args,
          **kwargs
      )
  
  def build_initial_config(ctx, config):
+    cluster_name = config['cluster']
+
      #path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
      conf = configobj.ConfigObj() #path, file_error=True)
  
      conf.setdefault('global', {})
-    conf['global']['fsid'] = ctx.fsid
+    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
  
      # overrides
      for section, keys in config['conf'].items():
@@ -89,10 +91,11 @@ def normalize_hostnames(ctx):
  
  @contextlib.contextmanager
  def download_ceph_daemon(ctx, config):
-    log.info('Downloading ceph-daemon...')
+    cluster_name = config['cluster']
      testdir = teuthology.get_testdir(ctx)
      branch = config.get('ceph-daemon-branch', 'master')
  
+    log.info('Downloading ceph-daemon...')
      ctx.cluster.run(
          args=[
              'curl', '--silent',
@@ -116,7 +119,7 @@ def download_ceph_daemon(ctx, config):
              'sudo',
              '{}/ceph-daemon'.format(testdir),
              'rm-cluster',
-            '--fsid', ctx.fsid,
+            '--fsid', ctx.ceph[cluster_name].fsid,
              '--force',
          ])
  
@@ -130,7 +133,10 @@ def download_ceph_daemon(ctx, config):
          )
  
  @contextlib.contextmanager
-def ceph_log(ctx, config, fsid):
+def ceph_log(ctx, config):
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
      try:
          yield
  
@@ -177,10 +183,13 @@ def ceph_log(ctx, config, fsid):
                                            os.path.join(sub, 'log'))
  
  @contextlib.contextmanager
-def ceph_crash(ctx, fsid):
+def ceph_crash(ctx, config):
      """
      Gather crash dumps from /var/lib/ceph/$fsid/crash
      """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
      try:
          yield
  
@@ -206,23 +215,27 @@ def ceph_crash(ctx, fsid):
                      pass
  
  @contextlib.contextmanager
-def ceph_bootstrap(ctx, config, fsid):
+def ceph_bootstrap(ctx, config):
+    cluster_name = config['cluster']
      testdir = teuthology.get_testdir(ctx)
+    fsid = ctx.ceph[cluster_name].fsid
  
-    mons = ctx.mons
-    first_mon = sorted(mons.keys())[0]
-    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
-    log.info('First mon is %s on %s' % (first_mon, mon_remote.shortname))
-    ctx.first_mon = first_mon
+    mons = ctx.ceph[cluster_name].mons
+    first_mon_role = sorted(mons.keys())[0]
+    _, _, first_mon = teuthology.split_role(first_mon_role)
+    (mon_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
+    log.info('First mon is mon.%s on %s' % (first_mon, mon_remote.shortname))
+    ctx.ceph[cluster_name].first_mon = first_mon
  
      others = ctx.cluster.remotes[mon_remote]
      log.info('others %s' % others)
-    mgrs = sorted([r for r in others if r.startswith('mgr.')])
+    mgrs = sorted([r for r in others
+                   if teuthology.is_type('mgr', cluster_name)(r)])
      if not mgrs:
          raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
-    first_mgr = mgrs[0]
+    _, _, first_mgr = teuthology.split_role(mgrs[0])
      log.info('First mgr is %s' % (first_mgr))
-    ctx.first_mgr = first_mgr
+    ctx.ceph[cluster_name].first_mgr = first_mgr
  
      try:
          # write seed config
@@ -232,7 +245,7 @@ def ceph_bootstrap(ctx, config, fsid):
          seed_config.write(conf_fp)
          teuthology.write_file(
              remote=mon_remote,
-            path='{}/seed.ceph.conf'.format(testdir),
+            path='{}/seed.{}.conf'.format(testdir, cluster_name),
              data=conf_fp.getvalue())
  
          # bootstrap
@@ -243,46 +256,46 @@ def ceph_bootstrap(ctx, config, fsid):
              '--image', ctx.image,
              'bootstrap',
              '--fsid', fsid,
-            '--mon-id', first_mon[4:],
-            '--mgr-id', first_mgr[4:],
-            '--config', '{}/seed.ceph.conf'.format(testdir),
-            '--output-config', '{}/ceph.conf'.format(testdir),
-            '--output-keyring', '{}/ceph.keyring'.format(testdir),
-            '--output-pub-ssh-key', '{}/ceph.pub'.format(testdir),
+            '--mon-id', first_mon,
+            '--mgr-id', first_mgr,
+            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '--output-config', '{}/{}.conf'.format(testdir, cluster_name),
+            '--output-keyring', '{}/{}.keyring'.format(testdir, cluster_name),
+            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
          ]
-        if mons[first_mon].startswith('['):
-            cmd += ['--mon-addrv', mons[first_mon]]
+        if mons[first_mon_role].startswith('['):
+            cmd += ['--mon-addrv', mons[first_mon_role]]
          else:
-            cmd += ['--mon-ip', mons[first_mon]]
+            cmd += ['--mon-ip', mons[first_mon_role]]
          if config.get('skip_dashboard'):
              cmd += ['--skip-dashboard']
          # bootstrap makes the keyring root 0600, so +r it for our purposes
          cmd += [
              run.Raw('&&'),
-            'sudo', 'chmod', '+r', '{}/ceph.keyring'.format(testdir),
+            'sudo', 'chmod', '+r', '{}/{}.keyring'.format(testdir, cluster_name),
          ]
          mon_remote.run(args=cmd)
  
          # fetch keys and configs
          log.info('Fetching config...')
-        ctx.config_file = teuthology.get_file(
+        ctx.ceph[cluster_name].config_file = teuthology.get_file(
              remote=mon_remote,
-            path='{}/ceph.conf'.format(testdir))
+            path='{}/{}.conf'.format(testdir, cluster_name))
          log.info('Fetching client.admin keyring...')
-        ctx.admin_keyring = teuthology.get_file(
+        ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
              remote=mon_remote,
-            path='{}/ceph.keyring'.format(testdir))
+            path='{}/{}.keyring'.format(testdir, cluster_name))
          log.info('Fetching mon keyring...')
-        ctx.mon_keyring = teuthology.get_file(
+        ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
              remote=mon_remote,
-            path='/var/lib/ceph/%s/%s/keyring' % (fsid, first_mon),
+            path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
              sudo=True)
  
          # fetch ssh key, distribute to additional nodes
          log.info('Fetching pub ssh key...')
          ssh_pub_key = teuthology.get_file(
              remote=mon_remote,
-            path='{}/ceph.pub'.format(testdir)
+            path='{}/{}.pub'.format(testdir, cluster_name)
          ).strip()
  
          log.info('Installing pub ssh key for root users...')
@@ -303,15 +316,15 @@ def ceph_bootstrap(ctx, config, fsid):
              log.info('Writing conf and keyring to %s' % remote.shortname)
              teuthology.write_file(
                  remote=remote,
-                path='{}/ceph.conf'.format(testdir),
-                data=ctx.config_file)
+                path='{}/{}.conf'.format(testdir, cluster_name),
+                data=ctx.ceph[cluster_name].config_file)
              teuthology.write_file(
                  remote=remote,
-                path='{}/ceph.keyring'.format(testdir),
-                data=ctx.admin_keyring)
+                path='{}/{}.keyring'.format(testdir, cluster_name),
+                data=ctx.ceph[cluster_name].admin_keyring)
  
              log.info('Adding host %s to orchestrator...' % remote.shortname)
-            shell(ctx, remote, [
+            shell(ctx, cluster_name, remote, [
                  'ceph', 'orchestrator', 'host', 'add',
                  remote.shortname
              ])
@@ -322,40 +335,56 @@ def ceph_bootstrap(ctx, config, fsid):
          log.info('Cleaning up testdir ceph.* files...')
          ctx.cluster.run(args=[
              'rm', '-f',
-            '{}/seed.ceph.conf'.format(testdir),
-            '{}/ceph.pub'.format(testdir),
-            '{}/ceph.conf'.format(testdir),
-            '{}/ceph.keyring'.format(testdir),
+            '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '{}/{}.pub'.format(testdir, cluster_name),
+            '{}/{}.conf'.format(testdir, cluster_name),
+            '{}/{}.keyring'.format(testdir, cluster_name),
          ])
  
          log.info('Stopping all daemons...')
-        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # this doesn't block until they are all stopped...
+        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # so, stop them individually
+        for remote, roles in ctx.cluster.remotes.items():
+            for role in roles:
+                log.info('Stopping %s on %s...' % (role, remote.shortname))
+                remote.run(args=[
+                    'sudo', 'systemctl', 'stop',
+                    'ceph-%s@%s' % (fsid, role)
+                ])
+
  
  @contextlib.contextmanager
  def ceph_mons(ctx, config):
      """
      Deploy any additional mons
      """
+    cluster_name = config['cluster']
      testdir = teuthology.get_testdir(ctx)
      num_mons = 1
  
      try:
          for remote, roles in ctx.cluster.remotes.items():
-            for mon in [r for r in roles if r.startswith('mon.')]:
-                if mon == ctx.first_mon:
+            for mon in [r for r in roles
+                        if teuthology.is_type('mon', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mon)  # id_: bare daemon id
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                      continue
-                log.info('Adding %s on %s' % (mon, remote.shortname))
+                log.info('Adding mon.%s on %s' % (id_, remote.shortname))
                  num_mons += 1
-                shell(ctx, remote, [
+                shell(ctx, cluster_name, remote, [
                      'ceph', 'orchestrator', 'mon', 'update',
                      str(num_mons),
-                    remote.shortname + ':' + ctx.mons[mon],
+                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_,
                  ])
  
                  while True:
                      log.info('Waiting for %d mons in monmap...' % (num_mons))
                      r = shell(
                          ctx=ctx,
+                        cluster_name=cluster_name,
                          remote=remote,
                          args=[
                              'ceph', 'mon', 'dump', '-f', 'json',
@@ -379,19 +408,21 @@ def ceph_mgrs(ctx, config):
      """
      Deploy any additional mgrs
      """
+    cluster_name = config['cluster']
      testdir = teuthology.get_testdir(ctx)
-    (remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()
  
      try:
          nodes = []
          for remote, roles in ctx.cluster.remotes.items():
-            for mgr in [r for r in roles if r.startswith('mgr.')]:
-                if mgr == ctx.first_mgr:
+            for mgr in [r for r in roles
+                        if teuthology.is_type('mgr', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mgr)  # id_: bare daemon id
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                      continue
-                log.info('Adding %s on %s' % (mgr, remote.shortname))
+                log.info('Adding mgr.%s on %s' % (id_, remote.shortname))
                  ### FIXME: we don't get to choose the mgr names ####
-                nodes.append(remote.shortname)
-        shell(ctx, remote, [
+                nodes.append(remote.shortname + '=' + id_)
+        shell(ctx, cluster_name, remote, [
              'ceph', 'orchestrator', 'mgr', 'update',
              str(len(nodes) + 1)] + nodes
          )
@@ -406,25 +437,19 @@ def ceph_osds(ctx, config):
      """
      Deploy OSDs
      """
+    cluster_name = config['cluster']
      try:
-        log.info('Zapping devices...')
-        devs_by_remote = {}
-        for remote, roles in ctx.cluster.remotes.items():
-            devs = teuthology.get_scratch_devices(remote)
-            for dev in devs:
-                shell(ctx, remote, [
-                    'ceph-volume', 'lvm', 'zap', dev])
-            devs_by_remote[remote] = devs
-
          log.info('Deploying OSDs...')
          for remote, roles in ctx.cluster.remotes.items():
-            devs = devs_by_remote[remote]
+            devs = teuthology.get_scratch_devices(remote)
              for osd in [r for r in roles if r.startswith('osd.')]:
                  assert devs   ## FIXME ##
                  dev = devs.pop()
                  log.info('Deploying %s on %s with %s...' % (
                      osd, remote.shortname, dev))
-                shell(ctx, remote, [
+                shell(ctx, cluster_name, remote, [
+                    'ceph-volume', 'lvm', 'zap', dev])
+                shell(ctx, cluster_name, remote, [
                      'ceph', 'orchestrator', 'osd', 'create',
                      remote.shortname + ':' + dev
                  ])
@@ -478,6 +503,31 @@ def stop(ctx, config):
  
      yield
  
+@contextlib.contextmanager
+def distribute_config_and_admin_keyring(ctx, config):
+    """
+    Distribute a sufficient config and keyring for clients
+
+    Writes /etc/ceph/$cluster.conf and the client.admin keyring on every
+    remote, yields, and removes both files again on exit.
+    """
+    cluster_name = config['cluster']
+    conf_path = '/etc/ceph/{}.conf'.format(cluster_name)
+    keyring_path = '/etc/ceph/{}.client.admin.keyring'.format(cluster_name)
+    try:
+        # set up inside the try so a partial distribution is still cleaned up
+        log.info('Distributing config and client.admin keyring...')
+        for remote in ctx.cluster.remotes.keys():
+            remote.run(args=['sudo', 'mkdir', '-p', '/etc/ceph'])
+            teuthology.sudo_write_file(
+                remote=remote, path=conf_path,
+                data=ctx.ceph[cluster_name].config_file)
+            teuthology.sudo_write_file(
+                remote=remote, path=keyring_path,
+                data=ctx.ceph[cluster_name].admin_keyring)
+        yield
+    finally:
+        ctx.cluster.run(args=['sudo', 'rm', '-f', conf_path, keyring_path])
  
  @contextlib.contextmanager
  def task(ctx, config):
@@ -493,16 +543,13 @@ def task(ctx, config):
  
      testdir = teuthology.get_testdir(ctx)
  
-    ## FIXME i don't understand multicluster ##
+    # set up cluster context
      first_ceph_cluster = False
      if not hasattr(ctx, 'daemons'):
          first_ceph_cluster = True
-        ctx.daemons = DaemonGroup()
-
+        ctx.daemons = DaemonGroup(use_ceph_daemon=True)
      if not hasattr(ctx, 'ceph'):
          ctx.ceph = {}
-
-    ## FIXME i don't understand multicluster ##
      if 'cluster' not in config:
          config['cluster'] = 'ceph'
      cluster_name = config['cluster']
@@ -522,9 +569,7 @@ def task(ctx, config):
  
      # uuid
      fsid = str(uuid.uuid1())
-    ctx.fsid = fsid
      log.info('Cluster fsid is %s' % fsid)
-    ## FIXME i don't understand multicluster ##
      ctx.ceph[cluster_name].fsid = fsid
  
      # mon ips
@@ -533,23 +578,24 @@ def task(ctx, config):
      roles = [role_list for (remote, role_list) in remotes_and_roles]
      ips = [host for (host, port) in
             (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
-    ctx.mons = get_mons(
+    ctx.ceph[cluster_name].mons = get_mons(
          roles, ips, cluster_name,
          mon_bind_msgr2=config.get('mon_bind_msgr2', True),
          mon_bind_addrvec=config.get('mon_bind_addrvec', True),
          )
-    log.info('Monitor IPs: %s' % ctx.mons)
+    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
  
      with contextutil.nested(
              lambda: ceph_initial(),
              lambda: normalize_hostnames(ctx=ctx),
              lambda: download_ceph_daemon(ctx=ctx, config=config),
-            lambda: ceph_log(ctx=ctx, config=config, fsid=fsid),
-            lambda: ceph_crash(ctx=ctx, fsid=fsid),
-            lambda: ceph_bootstrap(ctx=ctx, config=config, fsid=fsid),
+            lambda: ceph_log(ctx=ctx, config=config),
+            lambda: ceph_crash(ctx=ctx, config=config),
+            lambda: ceph_bootstrap(ctx=ctx, config=config),
              lambda: ceph_mons(ctx=ctx, config=config),
              lambda: ceph_mgrs(ctx=ctx, config=config),
              lambda: ceph_osds(ctx=ctx, config=config),
+            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
      ):
          try:
              log.info('Setup complete, yielding')
author	Sage Weil <sage@redhat.com>
	Mon, 11 Nov 2019 20:30:59 +0000 (20:30 +0000)
committer	Sage Weil <sage@redhat.com>
	Thu, 21 Nov 2019 16:46:54 +0000 (10:46 -0600)