# Provenance: reconstructed from the following ceph.git commit, which adds
# the new file qa/tasks/ceph2.py.
#
# From: Sage Weil
# Date: Fri, 8 Nov 2019 22:39:00 +0000 (+0000)
# Subject: qa/tasks/ceph2: basic task to bring up cluster with ceph-daemon and ssh
# X-Git-Tag: v15.1.0~790^2~23
# X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ea96d47b42f0c48af560972242647bc2efffd168;p=ceph.git
#
# qa/tasks/ceph2: basic task to bring up cluster with ceph-daemon and ssh
#
# This is pretty minimal at this point, but it works so far (or will, once
# a few open PRs merge).
#
# Main problematic area right now is that the multi-cluster stuff isn't
# right.  We should fix that before continuing, since ceph-daemon for once
# can do multiple clusters well.
#
# Signed-off-by: Sage Weil
"""
Ceph cluster task, deployed via ceph-daemon and ssh orchestrator
"""
from cStringIO import StringIO

import argparse
import configobj
import contextlib
import errno
import logging
import os
import json
import time
import gevent
import re
import socket
import uuid

from paramiko import SSHException
from ceph_manager import CephManager, write_conf
from tarfile import ReadError
from tasks.cephfs.filesystem import Filesystem
from teuthology import misc as teuthology
from teuthology import contextutil
from teuthology import exceptions
from teuthology.orchestra import run
import ceph_client as cclient
from teuthology.orchestra.daemon import DaemonGroup
from tasks.daemonwatchdog import DaemonWatchdog

# these items we use from ceph.py should probably eventually move elsewhere
from tasks.ceph import get_mons

log = logging.getLogger(__name__)


def _ensure_dir(path):
    """
    Create ``path`` (and any missing parents), tolerating only "already exists".

    Any other OSError (permissions, read-only filesystem, ...) is re-raised so
    that the real cause is reported instead of a later, less obvious failure.
    """
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def shell(ctx, remote, args, **kwargs):
    """
    Run a command on ``remote`` inside ``ceph-daemon shell``.

    The command is run under sudo using the ceph-daemon script, ceph.conf and
    ceph.keyring found in the test directory, with ``ctx.image`` as the
    container image and ``ctx.fsid`` as the cluster fsid.

    :param ctx: task context; ``ctx.image`` and ``ctx.fsid`` must be set
    :param remote: remote to run the command on
    :param args: list of command arguments placed after the ``--`` separator
    :param kwargs: passed through unchanged to ``remote.run``
    :returns: whatever ``remote.run`` returns
    """
    testdir = teuthology.get_testdir(ctx)
    return remote.run(
        args=[
            'sudo',
            '{}/ceph-daemon'.format(testdir),
            '--image', ctx.image,
            'shell',
            '-c', '{}/ceph.conf'.format(testdir),
            '-k', '{}/ceph.keyring'.format(testdir),
            '--fsid', ctx.fsid,
            '--',
        ] + args,
        **kwargs
    )


@contextlib.contextmanager
def download_ceph_daemon(ctx, config):
    """
    Fetch the ceph-daemon script into the test directory on every remote.

    The script is downloaded from the ceph GitHub repository at the branch
    named by ``config['ceph-daemon-branch']`` (default ``master``), checked to
    be non-empty, and made executable.  It is removed again on exit.
    """
    log.info('Downloading ceph-daemon...')
    testdir = teuthology.get_testdir(ctx)
    branch = config.get('ceph-daemon-branch', 'master')
    script = '{tdir}/ceph-daemon'.format(tdir=testdir)

    ctx.cluster.run(
        args=[
            # --fail: without it an HTTP error (e.g. an unknown branch) is
            # saved as the "script"; that error page is non-empty, so the
            # 'test -s' check below would not catch it.
            'curl', '--silent', '--fail',
            'https://raw.githubusercontent.com/ceph/ceph/%s/src/ceph-daemon/ceph-daemon' % branch,
            run.Raw('>'),
            script,
            run.Raw('&&'),
            'test', '-s',
            script,
            run.Raw('&&'),
            'chmod', '+x',
            script,
        ],
    )

    try:
        yield
    finally:
        log.info('Removing ceph-daemon ...')
        ctx.cluster.run(
            args=[
                'rm',
                '-rf',
                script,
            ],
        )


@contextlib.contextmanager
def ceph_log(ctx, config, fsid):
    """
    On exit, compress and archive the cluster logs from /var/log/ceph/$fsid.

    Nothing is archived when ``ctx.archive`` is None, or when the
    ``archive-on-error`` option is set and the run succeeded.
    """
    try:
        yield

    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph/' + fsid,
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            _ensure_dir(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                _ensure_dir(sub)
                teuthology.pull_directory(remote, '/var/log/ceph/' + fsid,
                                          os.path.join(sub, 'log'))


@contextlib.contextmanager
def ceph_crash(ctx, fsid):
    """
    Gather crash dumps from /var/lib/ceph/$fsid/crash
    """
    try:
        yield

    finally:
        if ctx.archive is not None:
            log.info('Archiving crash dumps...')
            path = os.path.join(ctx.archive, 'remote')
            _ensure_dir(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                _ensure_dir(sub)
                try:
                    teuthology.pull_directory(remote,
                                              '/var/lib/ceph/%s/crash' % fsid,
                                              os.path.join(sub, 'crash'))
                except ReadError:
                    # Deliberately best-effort: a tarfile ReadError from the
                    # pull is ignored (presumably raised when there is no
                    # crash directory to pull -- TODO confirm).
                    pass


@contextlib.contextmanager
def ceph_bootstrap(ctx, config, fsid):
    """
    Bootstrap the cluster on the first mon's host and enrol the other hosts.

    The first mon is the alphabetically first key of ``ctx.mons``; the first
    mgr is the alphabetically first ``mgr.*`` role on the same host.  After
    ``ceph-daemon bootstrap`` the generated config and keyrings are stored on
    ``ctx`` (``config_file``, ``admin_keyring``, ``mon_keyring``), the
    generated ssh public key is appended to root's authorized_keys on every
    remote, and each other remote gets the conf and keyring and is added to
    the orchestrator.

    On exit the ceph.* files are removed from the test directory and
    ``ceph-daemon rm-cluster --force`` is run on every remote.

    :raises RuntimeError: if no mgr role shares a host with the first mon
    """
    testdir = teuthology.get_testdir(ctx)

    mons = ctx.mons
    first_mon = sorted(mons)[0]
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    log.info('First mon is %s on %s' % (first_mon, mon_remote.shortname))
    ctx.first_mon = first_mon

    others = ctx.cluster.remotes[mon_remote]
    log.info('others %s' % others)
    mgrs = sorted([r for r in others if r.startswith('mgr.')])
    if not mgrs:
        raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
    first_mgr = mgrs[0]
    log.info('First mgr is %s' % (first_mgr))
    ctx.first_mgr = first_mgr

    try:
        cmd = [
            'sudo',
            '{}/ceph-daemon'.format(testdir),
            '--image', ctx.image,
            'bootstrap',
            '--fsid', fsid,
            # [4:] strips the 'mon.' / 'mgr.' role prefix
            '--mon-id', first_mon[4:],
            '--mgr-id', first_mgr[4:],
            '--output-config', '{}/ceph.conf'.format(testdir),
            '--output-keyring', '{}/ceph.keyring'.format(testdir),
            '--output-pub-ssh-key', '{}/ceph.pub'.format(testdir),
        ]
        # a leading '[' means the address is passed as an addrvec, not an IP
        if mons[first_mon].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon]]
        else:
            cmd += ['--mon-ip', mons[first_mon]]
        # bootstrap makes the keyring root 0600; +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r', '{}/ceph.keyring'.format(testdir),
        ]
        mon_remote.run(args=cmd)

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.config_file = teuthology.get_file(
            remote=mon_remote,
            path='{}/ceph.conf'.format(testdir))
        log.info('Fetching adming keyring...')
        ctx.admin_keyring = teuthology.get_file(
            remote=mon_remote,
            path='{}/ceph.keyring'.format(testdir))
        log.info('Fetching mon keyring...')
        ctx.mon_keyring = teuthology.get_file(
            remote=mon_remote,
            path='/var/lib/ceph/%s/%s/keyring' % (fsid, first_mon),
            sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = teuthology.get_file(
            remote=mon_remote,
            path='{}/ceph.pub'.format(testdir)
        ).strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == mon_remote:
                continue
            log.info('Writing conf and keyring to %s' % remote.shortname)
            teuthology.write_file(
                remote=remote,
                path='{}/ceph.conf'.format(testdir),
                data=ctx.config_file)
            teuthology.write_file(
                remote=remote,
                path='{}/ceph.keyring'.format(testdir),
                data=ctx.admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            shell(ctx, remote, [
                'ceph', 'orchestrator', 'host', 'add',
                remote.shortname
            ])

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/ceph.pub'.format(testdir),
            '{}/ceph.conf'.format(testdir),
            '{}/ceph.keyring'.format(testdir),
        ])
        log.info('Cleaning up cluster data...')
        ctx.cluster.run(args=[
            'sudo',
            '{}/ceph-daemon'.format(testdir),
            'rm-cluster',
            '--fsid', fsid,
            '--force'])


@contextlib.contextmanager
def ceph_mons(ctx, config):
    """
    Deploy any additional mons

    Every ``mon.*`` role other than ``ctx.first_mon`` is added through the
    orchestrator, and after each addition we wait until ``ceph mon dump``
    reports the expected number of mons.

    The wait is bounded: ``contextutil.safe_while`` raises once its tries are
    used up, so a mon that never joins fails the task instead of hanging it.
    """
    num_mons = 1
    (mon_remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()

    for remote, roles in ctx.cluster.remotes.items():
        for mon in [r for r in roles if r.startswith('mon.')]:
            if mon == ctx.first_mon:
                continue
            log.info('Adding %s on %s' % (mon, remote.shortname))
            num_mons += 1
            shell(ctx, remote, [
                'ceph', 'orchestrator', 'mon', 'update',
                str(num_mons),
                remote.shortname + ':' + ctx.mons[mon],
            ])

            with contextutil.safe_while(sleep=1, tries=180) as proceed:
                while proceed():
                    log.info('Waiting for %d mons in monmap...' % (num_mons))
                    r = shell(
                        ctx=ctx,
                        remote=mon_remote,
                        args=[
                            'ceph', 'mon', 'dump', '-f', 'json',
                        ],
                        stdout=StringIO(),
                    )
                    j = json.loads(r.stdout.getvalue())
                    if len(j['mons']) == num_mons:
                        break

    yield


@contextlib.contextmanager
def ceph_mgrs(ctx, config):
    """
    Deploy any additional mgrs

    Collects the hosts of every ``mgr.*`` role except ``ctx.first_mgr`` (the
    one created by bootstrap) and asks the orchestrator, from the first mon's
    host, for ``len(hosts) + 1`` mgrs; the ``+ 1`` accounts for the bootstrap
    mgr.  Nothing is run when there are no additional mgrs.
    """
    # Keep the first mon's remote under its own name: the loop below must not
    # overwrite it, otherwise the command would run on whichever host
    # happened to be iterated last.
    (mon_remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()

    nodes = []
    for remote, roles in ctx.cluster.remotes.items():
        for mgr in [r for r in roles if r.startswith('mgr.')]:
            if mgr == ctx.first_mgr:
                # already running; covered by the '+ 1' below
                continue
            log.info('Adding %s on %s' % (mgr, remote.shortname))
            ### FIXME: we don't get to choose the mgr names ####
            nodes.append(remote.shortname)
    if nodes:
        shell(ctx, mon_remote, [
            'ceph', 'orchestrator', 'mgr', 'update',
            str(len(nodes) + 1)] + nodes
        )

    yield


@contextlib.contextmanager
def ceph_final():
    """
    Innermost context manager: logs when setup is done and teardown starts.
    """
    log.info('Setup complete, yielding')
    try:
        yield
    finally:
        log.info('Teardown begin')


@contextlib.contextmanager
def task(ctx, config):
    """
    Bring up a Ceph cluster with ceph-daemon and the ssh orchestrator.

    ``config`` must be a dict (or None).  The ``ceph`` section of the job's
    ``overrides`` is merged into it.  Keys read here:

    - ``cluster``: cluster name, default ``ceph``
    - ``branch``: selects the container image, default ``master``
    - ``mon_bind_msgr2`` / ``mon_bind_addrvec``: passed to ``get_mons``,
      default True

    Sets ``ctx.image``, ``ctx.fsid``, ``ctx.mons`` and
    ``ctx.ceph[cluster].fsid``, then runs the nested setup/teardown context
    managers defined in this module.
    """
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    log.info('config ' + str(config))

    ## FIXME i don't understand multicluster ##
    if not hasattr(ctx, 'daemons'):
        ctx.daemons = DaemonGroup()

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}

    ## FIXME i don't understand multicluster ##
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    ctx.ceph[cluster_name] = argparse.Namespace()

    #validate_config(ctx, config)

    # image
    branch = config.get('branch', 'master')
    ### FIXME ###
    if branch in ['master', 'nautilus']:
        ctx.image = 'ceph/daemon-base:latest-%s-devel' % branch
    else:
        ctx.image = 'ceph-ci/ceph:%s' % branch
    log.info('Cluster image is %s' % ctx.image)

    # uid
    fsid = str(uuid.uuid1())
    ctx.fsid = fsid
    log.info('Cluster fsid is %s' % fsid)
    ## FIXME i don't understand multicluster ##
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    # materialise once: the pairs are walked twice below
    remotes_and_roles = list(ctx.cluster.remotes.items())
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    ctx.mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.mons)

    with contextutil.nested(
            lambda: download_ceph_daemon(ctx=ctx, config=config),
            lambda: ceph_log(ctx=ctx, config=config, fsid=fsid),
            lambda: ceph_crash(ctx=ctx, fsid=fsid),
            lambda: ceph_bootstrap(ctx=ctx, config=config, fsid=fsid),
            # NOTE: deploying additional mons is currently disabled
#            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_final(),
    ):
        try:
            yield

        finally:
            log.info('Teardown complete')