From: Sage Weil
Date: Mon, 11 Nov 2019 20:30:59 +0000 (+0000)
Subject: qa/tasks/ceph2: make it multicluster-aware
X-Git-Tag: v15.1.0~790^2~21
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2b4c81e62ba53bb47f6cad7f7946422cc441ec6d;p=ceph.git

qa/tasks/ceph2: make it multicluster-aware

Signed-off-by: Sage Weil
---

diff --git a/qa/tasks/ceph2.py b/qa/tasks/ceph2.py
index 5e34263bafe1..665089a4019c 100644
--- a/qa/tasks/ceph2.py
+++ b/qa/tasks/ceph2.py
@@ -36,7 +36,7 @@ CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
 
 log = logging.getLogger(__name__)
 
-def shell(ctx, remote, args, **kwargs):
+def shell(ctx, cluster_name, remote, args, **kwargs):
     testdir = teuthology.get_testdir(ctx)
     return remote.run(
         args=[
@@ -44,20 +44,22 @@ def shell(ctx, remote, args, **kwargs):
             '{}/ceph-daemon'.format(testdir),
             '--image', ctx.image,
             'shell',
-            '-c', '{}/ceph.conf'.format(testdir),
-            '-k', '{}/ceph.keyring'.format(testdir),
-            '--fsid', ctx.fsid,
+            '-c', '{}/{}.conf'.format(testdir, cluster_name),
+            '-k', '{}/{}.keyring'.format(testdir, cluster_name),
+            '--fsid', ctx.ceph[cluster_name].fsid,
             '--',
         ] + args,
         **kwargs
     )
 
 def build_initial_config(ctx, config):
+    cluster_name = config['cluster']
+
     #path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template')
     conf = configobj.ConfigObj() #path, file_error=True)
     conf.setdefault('global', {})
-    conf['global']['fsid'] = ctx.fsid
+    conf['global']['fsid'] = ctx.ceph[cluster_name].fsid
 
     # overrides
     for section, keys in config['conf'].items():
@@ -89,10 +91,11 @@ def normalize_hostnames(ctx):
 
 @contextlib.contextmanager
 def download_ceph_daemon(ctx, config):
-    log.info('Downloading ceph-daemon...')
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
     branch = config.get('ceph-daemon-branch', 'master')
 
+    log.info('Downloading ceph-daemon...')
     ctx.cluster.run(
         args=[
             'curl', '--silent',
@@ -116,7 +119,7 @@
             'sudo',
             '{}/ceph-daemon'.format(testdir),
             'rm-cluster',
-            '--fsid', ctx.fsid,
+            '--fsid', ctx.ceph[cluster_name].fsid,
             '--force',
         ])
 
@@ -130,7 +133,10 @@
     )
 
 @contextlib.contextmanager
-def ceph_log(ctx, config, fsid):
+def ceph_log(ctx, config):
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
     try:
         yield
@@ -177,10 +183,13 @@ def ceph_log(ctx, config):
                 os.path.join(sub, 'log'))
 
 @contextlib.contextmanager
-def ceph_crash(ctx, fsid):
+def ceph_crash(ctx, config):
     """
     Gather crash dumps from /var/lib/ceph/$fsid/crash
     """
+    cluster_name = config['cluster']
+    fsid = ctx.ceph[cluster_name].fsid
+
     try:
         yield
@@ -206,23 +215,27 @@ def ceph_crash(ctx, fsid):
             pass
 
 @contextlib.contextmanager
-def ceph_bootstrap(ctx, config, fsid):
+def ceph_bootstrap(ctx, config):
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
+    fsid = ctx.ceph[cluster_name].fsid
 
-    mons = ctx.mons
-    first_mon = sorted(mons.keys())[0]
-    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
-    log.info('First mon is %s on %s' % (first_mon, mon_remote.shortname))
-    ctx.first_mon = first_mon
+    mons = ctx.ceph[cluster_name].mons
+    first_mon_role = sorted(mons.keys())[0]
+    _, _, first_mon = teuthology.split_role(first_mon_role)
+    (mon_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
+    log.info('First mon is mon.%s on %s' % (first_mon, mon_remote.shortname))
+    ctx.ceph[cluster_name].first_mon = first_mon
 
     others = ctx.cluster.remotes[mon_remote]
     log.info('others %s' % others)
-    mgrs = sorted([r for r in others if r.startswith('mgr.')])
+    mgrs = sorted([r for r in others
+                   if teuthology.is_type('mgr', cluster_name)(r)])
     if not mgrs:
         raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
-    first_mgr = mgrs[0]
+    _, _, first_mgr = teuthology.split_role(mgrs[0])
     log.info('First mgr is %s' % (first_mgr))
-    ctx.first_mgr = first_mgr
+    ctx.ceph[cluster_name].first_mgr = first_mgr
 
     try:
         # write seed config
@@ -232,7 +245,7 @@ def ceph_bootstrap(ctx, config, fsid):
         seed_config.write(conf_fp)
         teuthology.write_file(
             remote=mon_remote,
-            path='{}/seed.ceph.conf'.format(testdir),
+            path='{}/seed.{}.conf'.format(testdir, cluster_name),
             data=conf_fp.getvalue())
 
         # bootstrap
@@ -243,46 +256,46 @@
             '--image', ctx.image,
             'bootstrap',
             '--fsid', fsid,
-            '--mon-id', first_mon[4:],
-            '--mgr-id', first_mgr[4:],
-            '--config', '{}/seed.ceph.conf'.format(testdir),
-            '--output-config', '{}/ceph.conf'.format(testdir),
-            '--output-keyring', '{}/ceph.keyring'.format(testdir),
-            '--output-pub-ssh-key', '{}/ceph.pub'.format(testdir),
+            '--mon-id', first_mon,
+            '--mgr-id', first_mgr,
+            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '--output-config', '{}/{}.conf'.format(testdir, cluster_name),
+            '--output-keyring', '{}/{}.keyring'.format(testdir, cluster_name),
+            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
         ]
-        if mons[first_mon].startswith('['):
-            cmd += ['--mon-addrv', mons[first_mon]]
+        if mons[first_mon_role].startswith('['):
+            cmd += ['--mon-addrv', mons[first_mon_role]]
         else:
-            cmd += ['--mon-ip', mons[first_mon]]
+            cmd += ['--mon-ip', mons[first_mon_role]]
         if config.get('skip_dashboard'):
             cmd += ['--skip-dashboard']
         # bootstrap makes the keyring root 0600, so +r it for our purposes
         cmd += [
             run.Raw('&&'),
-            'sudo', 'chmod', '+r', '{}/ceph.keyring'.format(testdir),
+            'sudo', 'chmod', '+r', '{}/{}.keyring'.format(testdir, cluster_name),
         ]
         mon_remote.run(args=cmd)
 
         # fetch keys and configs
         log.info('Fetching config...')
-        ctx.config_file = teuthology.get_file(
+        ctx.ceph[cluster_name].config_file = teuthology.get_file(
            remote=mon_remote,
-            path='{}/ceph.conf'.format(testdir))
+            path='{}/{}.conf'.format(testdir, cluster_name))
         log.info('Fetching client.admin keyring...')
-        ctx.admin_keyring = teuthology.get_file(
+        ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
            remote=mon_remote,
-            path='{}/ceph.keyring'.format(testdir))
+            path='{}/{}.keyring'.format(testdir, cluster_name))
         log.info('Fetching mon keyring...')
-        ctx.mon_keyring = teuthology.get_file(
+        ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
            remote=mon_remote,
-            path='/var/lib/ceph/%s/%s/keyring' % (fsid, first_mon),
+            path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
            sudo=True)
 
         # fetch ssh key, distribute to additional nodes
         log.info('Fetching pub ssh key...')
         ssh_pub_key = teuthology.get_file(
            remote=mon_remote,
-            path='{}/ceph.pub'.format(testdir)
+            path='{}/{}.pub'.format(testdir, cluster_name)
         ).strip()
 
         log.info('Installing pub ssh key for root users...')
@@ -303,15 +316,15 @@ def ceph_bootstrap(ctx, config, fsid):
             log.info('Writing conf and keyring to %s' % remote.shortname)
             teuthology.write_file(
                 remote=remote,
-                path='{}/ceph.conf'.format(testdir),
-                data=ctx.config_file)
+                path='{}/{}.conf'.format(testdir, cluster_name),
+                data=ctx.ceph[cluster_name].config_file)
             teuthology.write_file(
                 remote=remote,
-                path='{}/ceph.keyring'.format(testdir),
-                data=ctx.admin_keyring)
+                path='{}/{}.keyring'.format(testdir, cluster_name),
+                data=ctx.ceph[cluster_name].admin_keyring)
 
             log.info('Adding host %s to orchestrator...' % remote.shortname)
-            shell(ctx, remote, [
+            shell(ctx, cluster_name, remote, [
                 'ceph', 'orchestrator', 'host', 'add',
                 remote.shortname
             ])
@@ -322,40 +335,56 @@ def ceph_bootstrap(ctx, config, fsid):
         log.info('Cleaning up testdir ceph.* files...')
         ctx.cluster.run(args=[
             'rm', '-f',
-            '{}/seed.ceph.conf'.format(testdir),
-            '{}/ceph.pub'.format(testdir),
-            '{}/ceph.conf'.format(testdir),
-            '{}/ceph.keyring'.format(testdir),
+            '{}/seed.{}.conf'.format(testdir, cluster_name),
+            '{}/{}.pub'.format(testdir, cluster_name),
+            '{}/{}.conf'.format(testdir, cluster_name),
+            '{}/{}.keyring'.format(testdir, cluster_name),
         ])
 
         log.info('Stopping all daemons...')
-        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # this doesn't block until they are all stopped...
+        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # so, stop them individually
+        for remote, roles in ctx.cluster.remotes.items():
+            for role in roles:
+                log.info('Stopping %s on %s...' % (role, remote.shortname))
+                remote.run(args=[
+                    'sudo', 'systemctl', 'stop',
+                    'ceph-%s@%s' % (fsid, role)
+                ])
+
 
 @contextlib.contextmanager
 def ceph_mons(ctx, config):
     """
     Deploy any additional mons
     """
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
     num_mons = 1
 
     try:
         for remote, roles in ctx.cluster.remotes.items():
-            for mon in [r for r in roles if r.startswith('mon.')]:
-                if mon == ctx.first_mon:
+            for mon in [r for r in roles
+                        if teuthology.is_type('mon', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mon)
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mon:
                     continue
-                log.info('Adding %s on %s' % (mon, remote.shortname))
+                log.info('Adding mon.%s on %s' % (mon, remote.shortname))
                 num_mons += 1
-                shell(ctx, remote, [
+                shell(ctx, cluster_name, remote, [
                     'ceph', 'orchestrator', 'mon', 'update',
                     str(num_mons),
-                    remote.shortname + ':' + ctx.mons[mon],
+                    remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + mon.split('.', 1)[1],
                 ])
 
                 while True:
                     log.info('Waiting for %d mons in monmap...'
                              % (num_mons))
                     r = shell(
                         ctx=ctx,
+                        cluster_name=cluster_name,
                         remote=remote,
                         args=[
                             'ceph', 'mon', 'dump', '-f', 'json',
@@ -379,19 +408,21 @@ def ceph_mgrs(ctx, config):
     """
     Deploy any additional mgrs
    """
+    cluster_name = config['cluster']
     testdir = teuthology.get_testdir(ctx)
-    (remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()
 
     try:
         nodes = []
         for remote, roles in ctx.cluster.remotes.items():
-            for mgr in [r for r in roles if r.startswith('mgr.')]:
-                if mgr == ctx.first_mgr:
+            for mgr in [r for r in roles
+                        if teuthology.is_type('mgr', cluster_name)(r)]:
+                c_, _, id_ = teuthology.split_role(mgr)
+                if c_ == cluster_name and id_ == ctx.ceph[cluster_name].first_mgr:
                     continue
-                log.info('Adding %s on %s' % (mgr, remote.shortname))
+                log.info('Adding mgr.%s on %s' % (mgr, remote.shortname))
                 ### FIXME: we don't get to choose the mgr names ####
-                nodes.append(remote.shortname)
-                shell(ctx, remote, [
+                nodes.append(remote.shortname + '=' + mgr.split('.', 1)[1])
+                shell(ctx, cluster_name, remote, [
                     'ceph', 'orchestrator', 'mgr', 'update',
                     str(len(nodes) + 1)] + nodes
                 )
@@ -406,25 +437,19 @@ def ceph_osds(ctx, config):
     """
     Deploy OSDs
     """
+    cluster_name = config['cluster']
     try:
-        log.info('Zapping devices...')
-        devs_by_remote = {}
-        for remote, roles in ctx.cluster.remotes.items():
-            devs = teuthology.get_scratch_devices(remote)
-            for dev in devs:
-                shell(ctx, remote, [
-                    'ceph-volume', 'lvm', 'zap', dev])
-            devs_by_remote[remote] = devs
-
         log.info('Deploying OSDs...')
         for remote, roles in ctx.cluster.remotes.items():
-            devs = devs_by_remote[remote]
+            devs = teuthology.get_scratch_devices(remote)
             for osd in [r for r in roles if r.startswith('osd.')]:
                 assert devs ## FIXME ##
                 dev = devs.pop()
                 log.info('Deploying %s on %s with %s...' % (
                     osd, remote.shortname, dev))
-                shell(ctx, remote, [
+                shell(ctx, cluster_name, remote, [
+                    'ceph-volume', 'lvm', 'zap', dev])
+                shell(ctx, cluster_name, remote, [
                     'ceph', 'orchestrator', 'osd', 'create',
                     remote.shortname + ':' + dev
                 ])
@@ -478,6 +503,31 @@ def stop(ctx, config):
         yield
 
+@contextlib.contextmanager
+def distribute_config_and_admin_keyring(ctx, config):
+    """
+    Distribute a sufficient config and keyring for clients
+    """
+    cluster_name = config['cluster']
+    log.info('Distributing config and client.admin keyring...')
+    for remote, roles in ctx.cluster.remotes.items():
+        remote.run(args=['sudo', 'mkdir', '-p', '/etc/ceph'])
+        teuthology.sudo_write_file(
+            remote=remote,
+            path='/etc/ceph/{}.conf'.format(cluster_name),
+            data=ctx.ceph[cluster_name].config_file)
+        teuthology.sudo_write_file(
+            remote=remote,
+            path='/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+            data=ctx.ceph[cluster_name].admin_keyring)
+    try:
+        yield
+    finally:
+        ctx.cluster.run(args=[
+            'sudo', 'rm', '-f',
+            '/etc/ceph/{}.conf'.format(cluster_name),
+            '/etc/ceph/{}.client.admin.keyring'.format(cluster_name),
+        ])
 
 @contextlib.contextmanager
 def task(ctx, config):
@@ -493,16 +543,13 @@ def task(ctx, config):
 
     testdir = teuthology.get_testdir(ctx)
 
-    ## FIXME i don't understand multicluster ##
+    # set up cluster context
     first_ceph_cluster = False
     if not hasattr(ctx, 'daemons'):
         first_ceph_cluster = True
-        ctx.daemons = DaemonGroup()
-
+        ctx.daemons = DaemonGroup(use_ceph_daemon=True)
     if not hasattr(ctx, 'ceph'):
         ctx.ceph = {}
-
-    ## FIXME i don't understand multicluster ##
     if 'cluster' not in config:
         config['cluster'] = 'ceph'
     cluster_name = config['cluster']
@@ -522,9 +569,7 @@ def task(ctx, config):
 
     # uuid
     fsid = str(uuid.uuid1())
-    ctx.fsid = fsid
     log.info('Cluster fsid is %s' % fsid)
-    ## FIXME i don't understand multicluster ##
     ctx.ceph[cluster_name].fsid = fsid
 
     # mon ips
@@ -533,23 +578,24 @@ def task(ctx, config):
     roles = [role_list for (remote, role_list) in remotes_and_roles]
     ips = [host for (host, port) in
            (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
-    ctx.mons = get_mons(
+    ctx.ceph[cluster_name].mons = get_mons(
         roles, ips, cluster_name,
         mon_bind_msgr2=config.get('mon_bind_msgr2', True),
         mon_bind_addrvec=config.get('mon_bind_addrvec', True),
     )
-    log.info('Monitor IPs: %s' % ctx.mons)
+    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)
 
     with contextutil.nested(
             lambda: ceph_initial(),
             lambda: normalize_hostnames(ctx=ctx),
             lambda: download_ceph_daemon(ctx=ctx, config=config),
-            lambda: ceph_log(ctx=ctx, config=config, fsid=fsid),
-            lambda: ceph_crash(ctx=ctx, fsid=fsid),
-            lambda: ceph_bootstrap(ctx=ctx, config=config, fsid=fsid),
+            lambda: ceph_log(ctx=ctx, config=config),
+            lambda: ceph_crash(ctx=ctx, config=config),
+            lambda: ceph_bootstrap(ctx=ctx, config=config),
             lambda: ceph_mons(ctx=ctx, config=config),
             lambda: ceph_mgrs(ctx=ctx, config=config),
             lambda: ceph_osds(ctx=ctx, config=config),
+            lambda: distribute_config_and_admin_keyring(ctx=ctx, config=config),
    ):
        try:
            log.info('Setup complete, yielding