--- /dev/null
+"""
+Ceph cluster task: deploy a Ceph cluster via ceph-daemon and the ssh orchestrator
+"""
+from cStringIO import StringIO
+
+import argparse
+import configobj
+import contextlib
+import errno
+import logging
+import os
+import json
+import time
+import gevent
+import re
+import socket
+import uuid
+
+from paramiko import SSHException
+from ceph_manager import CephManager, write_conf
+from tarfile import ReadError
+from tasks.cephfs.filesystem import Filesystem
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology import exceptions
+from teuthology.orchestra import run
+import ceph_client as cclient
+from teuthology.orchestra.daemon import DaemonGroup
+from tasks.daemonwatchdog import DaemonWatchdog
+
+# these items we use from ceph.py should probably eventually move elsewhere
+from tasks.ceph import get_mons
+
+log = logging.getLogger(__name__)
+
+
+def shell(ctx, remote, args, **kwargs):
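+    """
+    Run a command on the given remote inside the cluster's container image
+    via 'ceph-daemon shell', using the test ceph.conf, keyring, and fsid.
+    """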
+ testdir = teuthology.get_testdir(ctx)
+ return remote.run(
+ args=[
+ 'sudo',
+ '{}/ceph-daemon'.format(testdir),
+ '--image', ctx.image,
+ 'shell',
+ '-c', '{}/ceph.conf'.format(testdir),
+ '-k', '{}/ceph.keyring'.format(testdir),
+ '--fsid', ctx.fsid,
+ '--',
+ ] + args,
+ **kwargs
+ )
+
+@contextlib.contextmanager
+def download_ceph_daemon(ctx, config):
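+    """
+    Fetch the ceph-daemon script from the ceph.git branch given by
+    'ceph-daemon-branch' (default: master) into the testdir on all remotes,
+    and remove it again on teardown.
+    """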
+ log.info('Downloading ceph-daemon...')
+ testdir = teuthology.get_testdir(ctx)
+ branch = config.get('ceph-daemon-branch', 'master')
+
+ ctx.cluster.run(
+ args=[
+ 'curl', '--silent',
+ 'https://raw.githubusercontent.com/ceph/ceph/%s/src/ceph-daemon/ceph-daemon' % branch,
+ run.Raw('>'),
+ '{tdir}/ceph-daemon'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'test', '-s',
+ '{tdir}/ceph-daemon'.format(tdir=testdir),
+ run.Raw('&&'),
+ 'chmod', '+x',
+ '{tdir}/ceph-daemon'.format(tdir=testdir),
+ ],
+ )
+
+ try:
+ yield
+ finally:
+ log.info('Removing ceph-daemon ...')
+ ctx.cluster.run(
+ args=[
+ 'rm',
+ '-rf',
+ '{tdir}/ceph-daemon'.format(tdir=testdir),
+ ],
+ )
+
+@contextlib.contextmanager
+def ceph_log(ctx, config, fsid):
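+    """
+    On teardown, compress the cluster logs under /var/log/ceph/<fsid> and
+    pull them into the archive directory (skipped when archive-on-error is
+    set and the run succeeded).
+    """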
+ try:
+ yield
+
+ finally:
+ if ctx.archive is not None and \
+ not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+            # compress and archive the cluster logs
+ log.info('Compressing logs...')
+ run.wait(
+ ctx.cluster.run(
+ args=[
+ 'sudo',
+ 'find',
+ '/var/log/ceph/' + fsid,
+ '-name',
+ '*.log',
+ '-print0',
+ run.Raw('|'),
+ 'sudo',
+ 'xargs',
+ '-0',
+ '--no-run-if-empty',
+ '--',
+ 'gzip',
+ '--',
+ ],
+ wait=False,
+ ),
+ )
+
+ log.info('Archiving logs...')
+ path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass  # directory may already exist
+ for remote in ctx.cluster.remotes.keys():
+ sub = os.path.join(path, remote.shortname)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass  # directory may already exist
+ teuthology.pull_directory(remote, '/var/log/ceph/' + fsid,
+ os.path.join(sub, 'log'))
+
+@contextlib.contextmanager
+def ceph_crash(ctx, fsid):
+ """
+ Gather crash dumps from /var/lib/ceph/$fsid/crash
+ """
+ try:
+ yield
+
+ finally:
+ if ctx.archive is not None:
+ log.info('Archiving crash dumps...')
+ path = os.path.join(ctx.archive, 'remote')
+            try:
+                os.makedirs(path)
+            except OSError:
+                pass  # directory may already exist
+ for remote in ctx.cluster.remotes.keys():
+ sub = os.path.join(path, remote.shortname)
+                try:
+                    os.makedirs(sub)
+                except OSError:
+                    pass  # directory may already exist
+ try:
+ teuthology.pull_directory(remote,
+ '/var/lib/ceph/%s/crash' % fsid,
+ os.path.join(sub, 'crash'))
+                except ReadError:
+                    pass  # no crash directory on this node
+
+@contextlib.contextmanager
+def ceph_bootstrap(ctx, config, fsid):
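+    """
+    Bootstrap the cluster with 'ceph-daemon bootstrap' on the first mon's
+    host, fetch the resulting conf and keyrings, distribute the cluster ssh
+    key and conf/keyring to the other hosts, and register those hosts with
+    the orchestrator.  On teardown, remove the testdir files and the cluster.
+    """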
+ testdir = teuthology.get_testdir(ctx)
+
+ mons = ctx.mons
+ first_mon = sorted(mons.keys())[0]
+ (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+ log.info('First mon is %s on %s' % (first_mon, mon_remote.shortname))
+ ctx.first_mon = first_mon
+
+    # a mgr must be colocated with the first mon so bootstrap can deploy it
+    mon_host_roles = ctx.cluster.remotes[mon_remote]
+    log.info('Roles on first mon host: %s' % mon_host_roles)
+    mgrs = sorted([r for r in mon_host_roles if r.startswith('mgr.')])
+ if not mgrs:
+ raise RuntimeError('no mgrs on the same host as first mon %s' % first_mon)
+ first_mgr = mgrs[0]
+ log.info('First mgr is %s' % (first_mgr))
+ ctx.first_mgr = first_mgr
+
+ try:
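+        # bootstrap the initial mon and mgr on the first mon's host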
+ cmd = [
+ 'sudo',
+ '{}/ceph-daemon'.format(testdir),
+ '--image', ctx.image,
+ 'bootstrap',
+ '--fsid', fsid,
+ '--mon-id', first_mon[4:],
+ '--mgr-id', first_mgr[4:],
+ '--output-config', '{}/ceph.conf'.format(testdir),
+ '--output-keyring', '{}/ceph.keyring'.format(testdir),
+ '--output-pub-ssh-key', '{}/ceph.pub'.format(testdir),
+ ]
+ if mons[first_mon].startswith('['):
+ cmd += ['--mon-addrv', mons[first_mon]]
+ else:
+ cmd += ['--mon-ip', mons[first_mon]]
+ # bootstrap makes the keyring root 0600; +r it for our purposes
+ cmd += [
+ run.Raw('&&'),
+ 'sudo', 'chmod', '+r', '{}/ceph.keyring'.format(testdir),
+ ]
+ mon_remote.run(args=cmd)
+
+ # fetch keys and configs
+ log.info('Fetching config...')
+ ctx.config_file = teuthology.get_file(
+ remote=mon_remote,
+ path='{}/ceph.conf'.format(testdir))
+        log.info('Fetching admin keyring...')
+ ctx.admin_keyring = teuthology.get_file(
+ remote=mon_remote,
+ path='{}/ceph.keyring'.format(testdir))
+ log.info('Fetching mon keyring...')
+ ctx.mon_keyring = teuthology.get_file(
+ remote=mon_remote,
+ path='/var/lib/ceph/%s/%s/keyring' % (fsid, first_mon),
+ sudo=True)
+
+ # fetch ssh key, distribute to additional nodes
+ log.info('Fetching pub ssh key...')
+ ssh_pub_key = teuthology.get_file(
+ remote=mon_remote,
+ path='{}/ceph.pub'.format(testdir)
+ ).strip()
+
+ log.info('Installing pub ssh key for root users...')
+ ctx.cluster.run(args=[
+ 'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
+ run.Raw('&&'),
+ 'echo', ssh_pub_key,
+ run.Raw('|'),
+ 'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
+ run.Raw('&&'),
+ 'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
+ ])
+
+ # add other hosts
+ for remote in ctx.cluster.remotes.keys():
+ if remote == mon_remote:
+ continue
+ log.info('Writing conf and keyring to %s' % remote.shortname)
+ teuthology.write_file(
+ remote=remote,
+ path='{}/ceph.conf'.format(testdir),
+ data=ctx.config_file)
+ teuthology.write_file(
+ remote=remote,
+ path='{}/ceph.keyring'.format(testdir),
+ data=ctx.admin_keyring)
+
+ log.info('Adding host %s to orchestrator...' % remote.shortname)
+ shell(ctx, remote, [
+ 'ceph', 'orchestrator', 'host', 'add',
+ remote.shortname
+ ])
+
+ yield
+
+ finally:
+ log.info('Cleaning up testdir ceph.* files...')
+ ctx.cluster.run(args=[
+ 'rm', '-f',
+ '{}/ceph.pub'.format(testdir),
+ '{}/ceph.conf'.format(testdir),
+ '{}/ceph.keyring'.format(testdir),
+ ])
+ log.info('Cleaning up cluster data...')
+ ctx.cluster.run(args=[
+ 'sudo',
+ '{}/ceph-daemon'.format(testdir),
+ 'rm-cluster',
+ '--fsid', fsid,
+ '--force'])
+
+@contextlib.contextmanager
+def ceph_mons(ctx, config):
+ """
+ Deploy any additional mons
+ """
+ testdir = teuthology.get_testdir(ctx)
+ num_mons = 1
+ (mon_remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()
+
+ try:
+ for remote, roles in ctx.cluster.remotes.items():
+ for mon in [r for r in roles if r.startswith('mon.')]:
+ if mon == ctx.first_mon:
+ continue
+ log.info('Adding %s on %s' % (mon, remote.shortname))
+ num_mons += 1
+ shell(ctx, remote, [
+ 'ceph', 'orchestrator', 'mon', 'update',
+ str(num_mons),
+ remote.shortname + ':' + ctx.mons[mon],
+ ])
+
+ while True:
+ log.info('Waiting for %d mons in monmap...' % (num_mons))
+ r = shell(
+ ctx=ctx,
+ remote=mon_remote,
+ args=[
+ 'ceph', 'mon', 'dump', '-f', 'json',
+ ],
+ stdout=StringIO(),
+ )
+ j = json.loads(r.stdout.getvalue())
+ if len(j['mons']) == num_mons:
+ break
+ time.sleep(1)
+
+ yield
+
+ finally:
+ pass
+
+@contextlib.contextmanager
+def ceph_mgrs(ctx, config):
+ """
+ Deploy any additional mgrs
+ """
+ testdir = teuthology.get_testdir(ctx)
+    (mon_remote,) = ctx.cluster.only(ctx.first_mon).remotes.keys()
+
+    try:
+        nodes = []
+        for remote, roles in ctx.cluster.remotes.items():
+            for mgr in [r for r in roles if r.startswith('mgr.')]:
+                if mgr == ctx.first_mgr:
+                    # the bootstrap mgr is already deployed
+                    continue
+                log.info('Adding %s on %s' % (mgr, remote.shortname))
+                ### FIXME: we don't get to choose the mgr names ####
+                nodes.append(remote.shortname)
+        if nodes:
+            # +1 accounts for the mgr deployed by bootstrap
+            shell(ctx, mon_remote, [
+                'ceph', 'orchestrator', 'mgr', 'update',
+                str(len(nodes) + 1)] + nodes
+            )
+
+ yield
+
+ finally:
+ pass
+
+@contextlib.contextmanager
+def ceph_final():
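+    """
+    Log the end of setup before yielding, and the start of teardown after.
+    """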
+ log.info('Setup complete, yielding')
+ try:
+ yield
+ finally:
+ log.info('Teardown begin')
+
+@contextlib.contextmanager
+def task(ctx, config):
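+    """
+    Deploy a Ceph cluster with ceph-daemon and the ssh orchestrator.
+
+    Recognized config keys:
+
+      cluster: cluster name (default: ceph)
+      branch: ceph branch, used to select the container image (default: master)
+      ceph-daemon-branch: branch to fetch the ceph-daemon script from
+        (default: master)
+      mon_bind_msgr2: bind mons to msgr2 (v2) addresses (default: True)
+      mon_bind_addrvec: use mon address vectors (default: True)
+
+    Values under overrides['ceph'] are merged into the task config.
+    """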
+ if config is None:
+ config = {}
+
+ assert isinstance(config, dict), \
+ "task only supports a dictionary for configuration"
+
+ overrides = ctx.config.get('overrides', {})
+ teuthology.deep_merge(config, overrides.get('ceph', {}))
+
+ log.info('config ' + str(config))
+
+ ## FIXME i don't understand multicluster ##
+ first_ceph_cluster = False
+ if not hasattr(ctx, 'daemons'):
+ first_ceph_cluster = True
+ ctx.daemons = DaemonGroup()
+
+ if not hasattr(ctx, 'ceph'):
+ ctx.ceph = {}
+
+ ## FIXME i don't understand multicluster ##
+ if 'cluster' not in config:
+ config['cluster'] = 'ceph'
+ cluster_name = config['cluster']
+ ctx.ceph[cluster_name] = argparse.Namespace()
+
+ #validate_config(ctx, config)
+
+ # image
+ branch = config.get('branch', 'master')
+ ### FIXME ###
+ if branch in ['master', 'nautilus']:
+ ctx.image = 'ceph/daemon-base:latest-%s-devel' % branch
+ else:
+ ctx.image = 'ceph-ci/ceph:%s' % branch
+ log.info('Cluster image is %s' % ctx.image)
+
+    # fsid
+ fsid = str(uuid.uuid1())
+ ctx.fsid = fsid
+ log.info('Cluster fsid is %s' % fsid)
+ ## FIXME i don't understand multicluster ##
+ ctx.ceph[cluster_name].fsid = fsid
+
+ # mon ips
+ log.info('Choosing monitor IPs and ports...')
+ remotes_and_roles = ctx.cluster.remotes.items()
+ roles = [role_list for (remote, role_list) in remotes_and_roles]
+ ips = [host for (host, port) in
+ (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
+ ctx.mons = get_mons(
+ roles, ips, cluster_name,
+ mon_bind_msgr2=config.get('mon_bind_msgr2', True),
+ mon_bind_addrvec=config.get('mon_bind_addrvec', True),
+ )
+ log.info('Monitor IPs: %s' % ctx.mons)
+
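+    # bring the cluster up: fetch ceph-daemon, set up log and crash
+    # archival, bootstrap, then deploy the additional mgrs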
+ with contextutil.nested(
+ lambda: download_ceph_daemon(ctx=ctx, config=config),
+ lambda: ceph_log(ctx=ctx, config=config, fsid=fsid),
+ lambda: ceph_crash(ctx=ctx, fsid=fsid),
+ lambda: ceph_bootstrap(ctx=ctx, config=config, fsid=fsid),
+# lambda: ceph_mons(ctx=ctx, config=config),
+ lambda: ceph_mgrs(ctx=ctx, config=config),
+ lambda: ceph_final(),
+ ):
+ try:
+ yield
+
+ finally:
+ log.info('Teardown complete')