]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-disk: consolidate ceph-disk-* into a single binary
authorSage Weil <sage@inktank.com>
Fri, 15 Mar 2013 23:36:45 +0000 (16:36 -0700)
committerSage Weil <sage@inktank.com>
Tue, 19 Mar 2013 19:52:23 +0000 (12:52 -0700)
 ceph-disk prepare ...
 ceph-disk activate ...
 ceph-disk ...

This let's us share code (we were already duplicating a bunch of stuff!)
and to add new commands easily.

Signed-off-by: Sage Weil <sage@inktank.com>
src/ceph-disk [new file with mode: 0755]
src/ceph-disk-activate
src/ceph-disk-prepare

diff --git a/src/ceph-disk b/src/ceph-disk
new file mode 100755 (executable)
index 0000000..4893be1
--- /dev/null
@@ -0,0 +1,1721 @@
+#!/usr/bin/python
+
+import argparse
+import logging
+import os
+import os.path
+import re
+import subprocess
+import stat
+import sys
+import tempfile
+import uuid
+
+CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
+
+JOURNAL_UUID =         '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
+DMCRYPT_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-5ec00ceff106'
+OSD_UUID =             '4fbd7e29-9d25-41b8-afd0-062c0ceff05d'
+DMCRYPT_OSD_UUID =     '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d'
+TOBE_UUID =            '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be'
+DMCRYPT_TOBE_UUID =    '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be'
+
+DEFAULT_FS_TYPE = 'xfs'
+
+MOUNT_OPTIONS = dict(
+    btrfs='noatime,user_subvol_rm_allowed',
+    # user_xattr is default ever since linux 2.6.39 / 3.0, but we'll
+    # delay a moment before removing it fully because we did have some
+    # issues with ext4 before the xatts-in-leveldb work, and it seemed
+    # that user_xattr helped
+    ext4='noatime,user_xattr',
+    xfs='noatime',
+    )
+
+MKFS_ARGS = dict(
+    btrfs=[
+        '-m', 'single',
+        '-l', '32768',
+        '-n', '32768',
+        ],
+    xfs=[
+        # xfs insists on not overwriting previous fs; even if we wipe
+        # partition table, we often recreate it exactly the same way,
+        # so we'll see ghosts of filesystems past
+        '-f',
+        '-i', 'size=2048',
+        ],
+    )
+
+INIT_SYSTEMS = [
+    'upstart',
+    'sysvinit',
+    'systemd',
+    'auto',
+    ]
+
+
+log_name = __name__
+if log_name == '__main__':
+    log_name = os.path.basename(sys.argv[0])
+log = logging.getLogger(log_name)
+
+
+###### exceptions ########
+
+class PrepareError(Exception):
+    """
+    OSD preparation error
+    """
+
+    def __str__(self):
+        doc = self.__doc__.strip()
+        return ': '.join([doc] + [str(a) for a in self.args])
+
+class MountError(PrepareError):
+    """
+    Mounting filesystem failed
+    """
+
+class UnmountError(PrepareError):
+    """
+    Unmounting filesystem failed
+    """
+
+class ActivateError(Exception):
+    """
+    OSD activation error
+    """
+
+    def __str__(self):
+        doc = self.__doc__.strip()
+        return ': '.join([doc] + [str(a) for a in self.args])
+
+class BadMagicError(ActivateError):
+    """
+    Does not look like a Ceph OSD, or incompatible version
+    """
+
+class TruncatedLineError(ActivateError):
+    """
+    Line is truncated
+    """
+
+
+class TooManyLinesError(ActivateError):
+    """
+    Too many lines
+    """
+
+class FilesystemTypeError(ActivateError):
+    """
+    Cannot discover filesystem type
+    """
+
+
+####### utils
+
+
+def maybe_mkdir(*a, **kw):
+    # remove any symlink, if it is there..
+    if os.path.exists(*a) and stat.S_ISLNK(os.lstat(*a).st_mode):
+        log.debug('Removing old symlink at %s', *a)
+        os.unlink(*a)
+    try:
+        os.mkdir(*a, **kw)
+    except OSError, e:
+        if e.errno == errno.EEXIST:
+            pass
+        else:
+            raise
+
+
+def list_partitions(disk):
+    """
+    Return a list of partitions on the given device
+    """
+    disk = os.path.realpath(disk)
+    assert not is_partition(disk)
+    assert disk.startswith('/dev/')
+    base = disk[5:]
+    ls = []
+    with file('/proc/partitions', 'rb') as f:
+        for line in f.read().split('\n')[2:]:
+            fields = re.split('\s+', line)
+            if len(fields) < 5:
+                continue
+            (_, major, minor, blocks, name) = fields
+            if name != base and name.startswith(base):
+                ls.append('/dev/' + name)
+    return ls
+
+
+def is_partition(dev):
+    """
+    Check whether a given device is a partition or a full disk.
+    """
+    dev = os.path.realpath(dev)
+    if not stat.S_ISBLK(os.lstat(dev).st_mode):
+        raise PrepareError('not a block device', dev)
+
+    # if the device ends in a number, it is a partition (e.g., /dev/sda3)
+    if dev[-1].isdigit():
+        return True
+    return False
+
+
+def is_mounted(dev):
+    """
+    Check if the given device is mounted.
+    """
+    dev = os.path.realpath(dev)
+    with file('/proc/mounts') as f:
+        for line in f.read().split('\n'):
+            d = line.split(' ')[0]
+            if os.path.exists(d):
+                d = os.path.realpath(d)
+                if dev == d:
+                    return True
+    return False
+
+
+def is_held(dev):
+    """
+    Check if a device is held by another device (e.g., a dm-crypt mapping)
+    """
+    assert os.path.exists(dev)
+    dev = os.path.realpath(dev)
+    base = dev[5:]
+    disk = base
+    while disk[-1].isdigit():
+        disk = disk[:-1]
+    dir = '/sys/block/{disk}/{base}/holders'.format(disk=disk, base=base)
+    if not os.path.exists(dir):
+        return []
+    return os.listdir(dir)
+
+
+def verify_not_in_use(dev):
+    assert os.path.exists(dev)
+    if is_partition(dev):
+        if is_mounted(dev):
+            raise PrepareError('Device is mounted', dev)
+        holders = is_held(dev)
+        if holders:
+            raise PrepareError('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
+    else:
+        for p in list_partitions(dev):
+            if is_mounted(p):
+                raise PrepareError('Device is mounted', p)
+            holders = is_held(p)
+            if holders:
+                raise PrepareError('Device %s is in use by a device-mapper mapping (dm-crypt?)' % p, ','.join(holders))
+
+
+def must_be_one_line(line):
+    if line[-1:] != '\n':
+        raise TruncatedLineError(line)
+    line = line[:-1]
+    if '\n' in line:
+        raise TooManyLinesError(line)
+    return line
+
+
+def read_one_line(parent, name):
+    """
+    Read a file whose sole contents are a single line.
+
+    Strips the newline.
+
+    :return: Contents of the line, or None if file did not exist.
+    """
+    path = os.path.join(parent, name)
+    try:
+        line = file(path, 'rb').read()
+    except IOError as e:
+        if e.errno == errno.ENOENT:
+            return None
+        else:
+            raise
+
+    try:
+        line = must_be_one_line(line)
+    except (TruncatedLineError, TooManyLinesError) as e:
+        raise ActivateError('File is corrupt: {path}: {msg}'.format(
+                path=path,
+                msg=e,
+                ))
+    return line
+
+
+def write_one_line(parent, name, text):
+    """
+    Write a file whose sole contents are a single line.
+
+    Adds a newline.
+    """
+    path = os.path.join(parent, name)
+    tmp = '{path}.{pid}.tmp'.format(path=path, pid=os.getpid())
+    with file(tmp, 'wb') as f:
+        f.write(text + '\n')
+        os.fsync(f.fileno())
+    os.rename(tmp, path)
+
+
+def check_osd_magic(path):
+    """
+    Check that this path has the Ceph OSD magic.
+
+    :raises: BadMagicError if this does not look like a Ceph OSD data
+    dir.
+    """
+    magic = read_one_line(path, 'magic')
+    if magic is None:
+        # probably not mkfs'ed yet
+        raise BadMagicError(path)
+    if magic != CEPH_OSD_ONDISK_MAGIC:
+        raise BadMagicError(path)
+
+
+def check_osd_id(osd_id):
+    """
+    Ensures osd id is numeric.
+    """
+    if not re.match(r'^[0-9]+$', osd_id):
+        raise ActivateError('osd id is not numeric')
+
+
+def allocate_osd_id(
+    cluster,
+    fsid,
+    keyring,
+    ):
+    log.debug('Allocating OSD id...')
+    try:
+        osd_id = _check_output(
+            args=[
+                '/usr/bin/ceph',
+                '--cluster', cluster,
+                '--name', 'client.bootstrap-osd',
+                '--keyring', keyring,
+                'osd', 'create', '--concise',
+                fsid,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        raise ActivateError('ceph osd create failed', e)
+    osd_id = must_be_one_line(osd_id)
+    check_osd_id(osd_id)
+    return osd_id
+
+
+def get_osd_id(path):
+    osd_id = read_one_line(path, 'whoami')
+    if osd_id is not None:
+        check_osd_id(osd_id)
+    return osd_id
+
+
+def _check_output(*args, **kwargs):
+    process = subprocess.Popen(
+        stdout=subprocess.PIPE,
+        *args, **kwargs)
+    out, _ = process.communicate()
+    ret = process.wait()
+    if ret:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = args[0]
+        raise subprocess.CalledProcessError(ret, cmd, output=out)
+    return out
+
+
+def get_conf(cluster, variable):
+    try:
+        p = subprocess.Popen(
+            args=[
+                'ceph-conf',
+                '--cluster={cluster}'.format(
+                    cluster=cluster,
+                    ),
+                '--name=osd.',
+                '--lookup',
+                variable,
+                ],
+            stdout=subprocess.PIPE,
+            close_fds=True,
+            )
+    except OSError as e:
+        raise PrepareError('error executing ceph-conf', e)
+    (out, _err) = p.communicate()
+    ret = p.wait()
+    if ret == 1:
+        # config entry not found
+        return None
+    elif ret != 0:
+        raise PrepareError('getting variable from configuration failed')
+    value = out.split('\n', 1)[0]
+    # don't differentiate between "var=" and no var set
+    if not value:
+        return None
+    return value
+
+
+def get_conf_with_default(cluster, variable):
+    """
+    Get a config value that is known to the C++ code.
+
+    This will fail if called on variables that are not defined in
+    common config options.
+    """
+    try:
+        out = _check_output(
+            args=[
+                'ceph-osd',
+                '--cluster={cluster}'.format(
+                    cluster=cluster,
+                    ),
+                '--show-config-value={variable}'.format(
+                    variable=variable,
+                    ),
+                ],
+            close_fds=True,
+            )
+    except subprocess.CalledProcessError as e:
+        raise PrepareError(
+            'getting variable from configuration failed',
+            e,
+            )
+
+    value = out.split('\n', 1)[0]
+    return value
+
+
+def get_fsid(cluster):
+    fsid = get_conf(cluster=cluster, variable='fsid')
+    if fsid is None:
+        raise PrepareError('getting cluster uuid from configuration failed')
+    return fsid
+
+
+def get_or_create_dmcrypt_key(
+    uuid,
+    key_dir,
+    ):
+    path = os.path.join(key_dir, uuid)
+
+    # already have it?
+    if os.path.exists(path):
+        return path
+
+    # make a new key
+    try:
+        if not os.path.exists(key_dir):
+            os.makedirs(key_dir)
+        with file('/dev/urandom', 'rb') as i:
+            key = i.read(256)
+            with file(path, 'wb') as f:
+                f.write(key)
+        return path
+    except:
+        raise PrepareError('unable to read or create dm-crypt key', path)
+
+
+def dmcrypt_map(
+    rawdev,
+    keypath,
+    uuid,
+    ):
+    dev = '/dev/mapper/'+ uuid
+    args = [
+        'cryptsetup',
+        '--key-file',
+        keypath,
+        '--key-size', '256',
+        'create',
+        uuid,
+        rawdev,
+        ]
+    try:
+        subprocess.check_call(args)
+        return dev
+
+    except subprocess.CalledProcessError as e:
+        raise PrepareError('unable to map device', rawdev)
+
+
+def dmcrypt_unmap(
+    uuid
+    ):
+    args = [
+        'cryptsetup',
+        'remove',
+        uuid
+    ]
+
+    try:
+        subprocess.check_call(args)
+
+    except subprocess.CalledProcessError as e:
+        raise PrepareError('unable to unmap device', uuid)
+
+
+def mount(
+    dev,
+    fstype,
+    options,
+    ):
+    # pick best-of-breed mount options based on fs type
+    if options is None:
+        options = MOUNT_OPTIONS.get(fstype, '')
+
+    # mount
+    path = tempfile.mkdtemp(
+        prefix='mnt.',
+        dir='/var/lib/ceph/tmp',
+        )
+    try:
+        log.debug('Mounting %s on %s with options %s', dev, path, options)
+        subprocess.check_call(
+            args=[
+                'mount',
+                '-o', options,
+                '--',
+                dev,
+                path,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        try:
+            os.rmdir(path)
+        except (OSError, IOError):
+            pass
+        raise MountError(e)
+
+    return path
+
+
+def unmount(
+    path,
+    ):
+    try:
+        log.debug('Unmounting %s', path)
+        subprocess.check_call(
+            args=[
+                'umount',
+                '--',
+                path,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        raise UnmountError(e)
+
+    os.rmdir(path)
+
+
+###########################################
+
+
+def get_free_partition_index(dev):
+    try:
+        lines = _check_output(
+            args=[
+                'parted',
+                '--machine',
+                '--',
+                dev,
+                'print',
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        print 'cannot read partition index; assume it isn\'t present\n'
+        return 1
+
+    if not lines:
+        raise PrepareError('parted failed to output anything')
+    lines = lines.splitlines(True)
+
+    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
+        raise PrepareError('weird parted units', lines[0])
+    del lines[0]
+
+    if not lines[0].startswith('/dev/'):
+        raise PrepareError('weird parted disk entry', lines[0])
+    del lines[0]
+
+    seen = set()
+    for line in lines:
+        idx, _ = line.split(':', 1)
+        idx = int(idx)
+        seen.add(idx)
+
+    num = 1
+    while num in seen:
+        num += 1
+    return num
+
+
+def zap(dev):
+    """
+    Destroy the partition table and content of a given disk.
+    """
+    try:
+        log.debug('Zapping partition table on %s', dev)
+
+        # try to wipe out any GPT partition table backups.  sgdisk
+        # isn't too thorough.
+        lba_size = 4096
+        size = 33 * lba_size
+        with file(dev, 'wb') as f:
+            f.seek(-size, os.SEEK_END)
+            f.write(size*'\0')
+
+        subprocess.check_call(
+            args=[
+                'sgdisk',
+                '--zap-all',
+                '--clear',
+                '--mbrtogpt',
+                '--',
+                dev,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        raise PrepareError(e)
+
+
+def prepare_journal_dev(
+    data,
+    journal,
+    journal_size,
+    journal_uuid,
+    journal_dm_keypath,
+    ):
+
+    if is_partition(journal):
+        log.debug('Journal %s is a partition', journal)
+        log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+        return (journal, None, None)
+
+    key = None
+    ptype = JOURNAL_UUID
+    if journal_dm_keypath:
+        ptype = DMCRYPT_JOURNAL_UUID
+
+    # it is a whole disk.  create a partition!
+    num = None
+    if journal == data:
+        # we're sharing the disk between osd data and journal;
+        # make journal be partition number 2, so it's pretty; put
+        # journal at end of free space so partitioning tools don't
+        # reorder them suddenly
+        num = 2
+        journal_part = '{num}:-{size}M:0'.format(
+            num=num,
+            size=journal_size,
+            )
+    else:
+        # sgdisk has no way for me to say "whatever is the next
+        # free index number" when setting type guids etc, so we
+        # need to awkwardly look up the next free number, and then
+        # fix that in the call -- and hope nobody races with us;
+        # then again nothing guards the partition table from races
+        # anyway
+        num = get_free_partition_index(dev=journal)
+        journal_part = '{num}:0:+{size}M'.format(
+            num=num,
+            size=journal_size,
+            )
+        log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+
+    try:
+        log.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
+        subprocess.check_call(
+            args=[
+                'sgdisk',
+                '--new={part}'.format(part=journal_part),
+                '--change-name={num}:ceph journal'.format(num=num),
+                '--partition-guid={num}:{journal_uuid}'.format(
+                    num=num,
+                    journal_uuid=journal_uuid,
+                    ),
+                '--typecode={num}:{uuid}'.format(
+                    num=num,
+                    uuid=ptype,
+                    ),
+                '--',
+                journal,
+                ],
+            )
+        subprocess.check_call(
+            args=[
+                # also make sure the kernel refreshes the new table
+                'partprobe',
+                journal,
+                ],
+            )
+
+        journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
+            journal_uuid=journal_uuid,
+            )
+
+        journal_dmcrypt = None
+        if journal_dm_keypath:
+            journal_dmcrypt = journal_symlink
+            journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid)
+
+        log.debug('Journal is GPT partition %s', journal_symlink)
+        return (journal_symlink, journal_dmcrypt, journal_uuid)
+
+    except subprocess.CalledProcessError as e:
+        raise PrepareError(e)
+
+
+def prepare_journal_file(
+    journal,
+    journal_size):
+
+    if not os.path.exists(journal):
+        log.debug('Creating journal file %s with size %dM', journal, journal_size)
+        with file(journal, 'wb') as f:
+            f.truncate(journal_size * 1048576)
+
+    # FIXME: should we resize an existing journal file?
+
+    log.debug('Journal is file %s', journal)
+    log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+    return (journal, None, None)
+
+
+def prepare_journal(
+    data,
+    journal,
+    journal_size,
+    journal_uuid,
+    force_file,
+    force_dev,
+    journal_dm_keypath,
+    ):
+
+    if journal is None:
+        if force_dev:
+            raise PrepareError('Journal is unspecified; not a block device')
+        return (None, None, None)
+
+    if not os.path.exists(journal):
+        if force_dev:
+            raise PrepareError('Journal does not exist; not a block device', journal)
+        return prepare_journal_file(journal, journal_size)
+
+    jmode = os.stat(journal).st_mode
+    if stat.S_ISREG(jmode):
+        if force_dev:
+            raise PrepareError('Journal is not a block device', journal)
+        return prepare_journal_file(journal, journal_size)
+
+    if stat.S_ISBLK(jmode):
+        if force_file:
+            raise PrepareError('Journal is not a regular file', journal)
+        return prepare_journal_dev(data, journal, journal_size, journal_uuid, journal_dm_keypath)
+
+    raise PrepareError('Journal %s is neither a block device nor regular file', journal)
+
+
+def adjust_symlink(target, path):
+    create = True
+    if os.path.lexists(path):
+        try:
+            mode = os.path.lstat(canonical).st_mode
+            if stat.S_ISREG(mode):
+                log.debug('Removing old file %s', canonical)
+                os.unlink(canonical)
+            elif stat.S_ISLNK(mode):
+                old = os.readlink(canonical)
+                if old != journal:
+                    log.debug('Removing old symlink %s -> %s', canonical, old)
+                    os.unlink(canonical)
+                else:
+                    create = False
+        except:
+            raise PrepareError('unable to remove (or adjust) old file (symlink)', canonical)
+    if create:
+        log.debug('Creating symlink %s -> %s', path, target)
+        try:
+            os.symlink(target, path)
+        except:
+            raise PrepareError('unable to create symlink %s -> %s' % (path, target))
+
+def prepare_dir(
+    path,
+    journal,
+    cluster_uuid,
+    osd_uuid,
+    journal_uuid,
+    journal_dmcrypt = None,
+    ):
+    log.debug('Preparing osd data dir %s', path)
+
+    if osd_uuid is None:
+        osd_uuid = str(uuid.uuid4())
+
+    if journal is not None:
+        # we're using an external journal; point to it here
+        adjust_symlink(journal, os.path.join(path, 'journal'))
+
+    if journal_dmcrypt is not None:
+        adjust_symlink(journal_dmcrypt, os.path.join(path, 'journal_dmcrypt'))
+    else:
+        try:
+            os.unlink(os.path.join(path, 'journal_dmcrypt'))
+        except:
+            pass
+
+    write_one_line(path, 'ceph_fsid', cluster_uuid)
+    write_one_line(path, 'fsid', osd_uuid)
+    write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
+
+    if journal_uuid is not None:
+        # i.e., journal is a tagged partition
+        write_one_line(path, 'journal_uuid', journal_uuid)
+
+def prepare_dev(
+    data,
+    journal,
+    fstype,
+    mkfs_args,
+    mount_options,
+    cluster_uuid,
+    osd_uuid,
+    journal_uuid,
+    journal_dmcrypt,
+    osd_dm_keypath,
+    ):
+    """
+    Prepare a data/journal combination to be used for an OSD.
+
+    The ``magic`` file is written last, so it's presence is a reliable
+    indicator of the whole sequence having completed.
+
+    WARNING: This will unconditionally overwrite anything given to
+    it.
+    """
+
+    ptype_tobe = TOBE_UUID
+    ptype_osd = OSD_UUID
+    if osd_dm_keypath:
+        ptype_tobe = DMCRYPT_TOBE_UUID
+        ptype_osd = DMCRYPT_OSD_UUID
+
+    rawdev = None
+    if is_partition(data):
+        log.debug('OSD data device %s is a partition', data)
+        rawdev = data
+    else:
+        log.debug('Creating osd partition on %s', data)
+        try:
+            subprocess.check_call(
+                args=[
+                    'sgdisk',
+                    '--largest-new=1',
+                    '--change-name=1:ceph data',
+                    '--partition-guid=1:{osd_uuid}'.format(
+                        osd_uuid=osd_uuid,
+                        ),
+                    '--typecode=1:%s' % ptype_tobe,
+                    '--',
+                    data,
+                    ],
+                )
+            subprocess.check_call(
+                args=[
+                    # also make sure the kernel refreshes the new table
+                    'partprobe',
+                    data,
+                    ],
+                )
+        except subprocess.CalledProcessError as e:
+            raise PrepareError(e)
+
+        rawdev = '{data}1'.format(data=data)
+
+    dev = None
+    if osd_dm_keypath:
+        dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid)
+    else:
+        dev = rawdev
+
+    try:
+        args = [
+            'mkfs',
+            '-t',
+            fstype,
+            ]
+        if mkfs_args is not None:
+            args.extend(mkfs_args.split())
+            if fstype == 'xfs':
+                args.extend(['-f'])  # always force
+        else:
+            args.extend(MKFS_ARGS.get(fstype, []))
+        args.extend([
+                '--',
+                dev,
+                ])
+        try:
+            log.debug('Creating %s fs on %s', fstype, dev)
+            subprocess.check_call(args=args)
+        except subprocess.CalledProcessError as e:
+            raise PrepareError(e)
+
+        #remove whitespaces from mount_options
+        if mount_options is not None:
+            mount_options = "".join(mount_options.split())
+
+        path = mount(dev=dev, fstype=fstype, options=mount_options)
+
+        try:
+            prepare_dir(
+                path=path,
+                journal=journal,
+                cluster_uuid=cluster_uuid,
+                osd_uuid=osd_uuid,
+                journal_uuid=journal_uuid,
+                journal_dmcrypt=journal_dmcrypt,
+                )
+        finally:
+            unmount(path)
+    finally:
+        if rawdev != dev:
+            dmcrypt_unmap(osd_uuid)
+
+    if not is_partition(data):
+        try:
+            subprocess.check_call(
+                args=[
+                    'sgdisk',
+                    '--typecode=1:%s' % ptype_osd,
+                    '--',
+                    data,
+                    ],
+                )
+            subprocess.check_call(
+                args=[
+                    # also make sure the kernel refreshes the new table
+                    'partprobe',
+                    data,
+                    ],
+                )
+        except subprocess.CalledProcessError as e:
+            raise PrepareError(e)
+
+
+def main_prepare(args):
+    journal_dm_keypath = None
+    osd_dm_keypath = None
+
+    try:
+        if not os.path.exists(args.data):
+            raise PrepareError('data path does not exist', args.data)
+
+        # in use?
+        dmode = os.stat(args.data).st_mode
+        if stat.S_ISBLK(dmode):
+            verify_not_in_use(args.data)
+
+        if args.journal and os.path.exists(args.journal):
+            jmode = os.stat(args.journal).st_mode
+            if stat.S_ISBLK(jmode):
+                verify_not_in_use(args.journal)
+
+        if args.zap_disk is not None:
+            if stat.S_ISBLK(dmode) and not is_partition(args.data):
+                zap(args.data)
+            else:
+                raise PrepareError('not full block device; cannot zap', args.data)
+
+        if args.cluster_uuid is None:
+            args.cluster_uuid = get_fsid(cluster=args.cluster)
+            if args.cluster_uuid is None:
+                raise PrepareError(
+                    'must have fsid in config or pass --cluster--uuid=',
+                    )
+
+        if args.fs_type is None:
+            args.fs_type = get_conf(
+                cluster=args.cluster,
+                variable='osd_mkfs_type',
+                )
+            if args.fs_type is None:
+                args.fs_type = get_conf(
+                    cluster=args.cluster,
+                    variable='osd_fs_type',
+                    )
+            if args.fs_type is None:
+                args.fs_type = DEFAULT_FS_TYPE
+
+        mkfs_args = get_conf(
+            cluster=args.cluster,
+            variable='osd_mkfs_options_{fstype}'.format(
+                fstype=args.fs_type,
+                ),
+            )
+        if mkfs_args is None:
+            mkfs_args = get_conf(
+                cluster=args.cluster,
+                variable='osd_fs_mkfs_options_{fstype}'.format(
+                    fstype=args.fs_type,
+                    ),
+                )
+
+        mount_options = get_conf(
+            cluster=args.cluster,
+            variable='osd_mount_options_{fstype}'.format(
+                fstype=args.fs_type,
+                ),
+            )
+        if mount_options is None:
+            mount_options = get_conf(
+                cluster=args.cluster,
+                variable='osd_fs_mount_options_{fstype}'.format(
+                    fstype=args.fs_type,
+                    ),
+                )
+
+        journal_size = get_conf_with_default(
+            cluster=args.cluster,
+            variable='osd_journal_size',
+            )
+        journal_size = int(journal_size)
+
+        # colocate journal with data?
+        if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None:
+            log.info('Will colocate journal with data on %s', args.data)
+            args.journal = args.data
+
+        if args.journal_uuid is None:
+            args.journal_uuid = str(uuid.uuid4())
+        if args.osd_uuid is None:
+            args.osd_uuid = str(uuid.uuid4())
+
+        # dm-crypt keys?
+        if args.dmcrypt:
+            journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir)
+            osd_dm_keypath = get_or_create_dmcrypt_key(args.osd_uuid, args.dmcrypt_key_dir)
+
+        # prepare journal
+        (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal(
+            data=args.data,
+            journal=args.journal,
+            journal_size=journal_size,
+            journal_uuid=args.journal_uuid,
+            force_file=args.journal_file,
+            force_dev=args.journal_dev,
+            journal_dm_keypath=journal_dm_keypath,
+            )
+
+        # prepare data
+        if stat.S_ISDIR(dmode):
+            if args.data_dev:
+                raise PrepareError('data path is not a block device', args.data)
+            prepare_dir(
+                path=args.data,
+                journal=journal_symlink,
+                cluster_uuid=args.cluster_uuid,
+                osd_uuid=args.osd_uuid,
+                journal_uuid=journal_uuid,
+                journal_dmcrypt=journal_dmcrypt,
+                )
+        elif stat.S_ISBLK(dmode):
+            if args.data_dir:
+                raise PrepareError('data path is not a directory', args.data)
+            prepare_dev(
+                data=args.data,
+                journal=journal_symlink,
+                fstype=args.fs_type,
+                mkfs_args=mkfs_args,
+                mount_options=mount_options,
+                cluster_uuid=args.cluster_uuid,
+                osd_uuid=args.osd_uuid,
+                journal_uuid=journal_uuid,
+                journal_dmcrypt=journal_dmcrypt,
+                osd_dm_keypath=osd_dm_keypath,
+                )
+        else:
+            raise PrepareError('not a dir or block device', args.data)
+
+    except PrepareError as e:
+        if journal_dm_keypath:
+            os.unlink(journal_dm_keypath)
+        if osd_dm_keypath:
+            os.unlink(osd_dm_keypath)
+        print >>sys.stderr, '{prog}: {msg}'.format(
+            prog=args.prog,
+            msg=e,
+            )
+        sys.exit(1)
+
+
+
+###########################
+
+
+def mkfs(
+    path,
+    cluster,
+    osd_id,
+    fsid,
+    keyring,
+    ):
+    monmap = os.path.join(path, 'activate.monmap')
+    subprocess.check_call(
+        args=[
+            '/usr/bin/ceph',
+            '--cluster', cluster,
+            '--name', 'client.bootstrap-osd',
+            '--keyring', keyring,
+            'mon', 'getmap', '-o', monmap,
+            ],
+        )
+
+    subprocess.check_call(
+        args=[
+            '/usr/bin/ceph-osd',
+            '--cluster', cluster,
+            '--mkfs',
+            '--mkkey',
+            '-i', osd_id,
+            '--monmap', monmap,
+            '--osd-data', path,
+            '--osd-journal', os.path.join(path, 'journal'),
+            '--osd-uuid', fsid,
+            '--keyring', os.path.join(path, 'keyring'),
+            ],
+        )
+    # TODO ceph-osd --mkfs removes the monmap file?
+    # os.unlink(monmap)
+
+
+def auth_key(
+    path,
+    cluster,
+    osd_id,
+    keyring,
+    ):
+    subprocess.check_call(
+        args=[
+            '/usr/bin/ceph',
+            '--cluster', cluster,
+            '--name', 'client.bootstrap-osd',
+            '--keyring', keyring,
+            'auth', 'add', 'osd.{osd_id}'.format(osd_id=osd_id),
+            '-i', os.path.join(path, 'keyring'),
+            'osd', 'allow *',
+            'mon', 'allow rwx',
+            ],
+        )
+
+
+def move_mount(
+    path,
+    cluster,
+    osd_id,
+    ):
+    log.debug('Moving mount to final location...')
+    parent = '/var/lib/ceph/osd'
+    osd_data = os.path.join(
+        parent,
+        '{cluster}-{osd_id}'.format(cluster=cluster, osd_id=osd_id),
+        )
+    maybe_mkdir(osd_data)
+    subprocess.check_call(
+        args=[
+            '/bin/mount',
+            '--move',
+            '--',
+            path,
+            osd_data,
+            ],
+        )
+
+
+def start_daemon(
+    cluster,
+    osd_id,
+    ):
+    log.debug('Starting %s osd.%s...', cluster, osd_id)
+
+    path = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
+        cluster=cluster, osd_id=osd_id)
+
+    # upstart?
+    try:
+        if os.path.exists(os.path.join(path,'upstart')):
+            subprocess.check_call(
+                args=[
+                    '/sbin/initctl',
+                    # use emit, not start, because start would fail if the
+                    # instance was already running
+                    'emit',
+                    # since the daemon starting doesn't guarantee much about
+                    # the service being operational anyway, don't bother
+                    # waiting for it
+                    '--no-wait',
+                    '--',
+                    'ceph-osd',
+                    'cluster={cluster}'.format(cluster=cluster),
+                    'id={osd_id}'.format(osd_id=osd_id),
+                    ],
+                )
+        elif os.path.exists(os.path.join(path, 'sysvinit')):
+            subprocess.check_call(
+                args=[
+                    '/usr/sbin/service',
+                    'ceph',
+                    'start',
+                    'osd.{osd_id}'.format(osd_id=osd_id),
+                    ],
+                )
+        else:
+            raise ActivateError('{cluster} osd.{osd_id} is not tagged with an init system'.format(
+                    cluster=cluster,
+                    osd_id=osd_id,
+                    ))
+    except subprocess.CalledProcessError as e:
+        raise ActivateError('ceph osd start failed', e)
+
+def detect_fstype(
+    dev,
+    ):
+    fstype = _check_output(
+        args=[
+            '/sbin/blkid',
+            # we don't want stale cached results
+            '-p',
+            '-s', 'TYPE',
+            '-o' 'value',
+            '--',
+            dev,
+            ],
+        )
+    fstype = must_be_one_line(fstype)
+    return fstype
+
+
+def get_conf(cluster, variable):
+    try:
+        p = subprocess.Popen(
+            args=[
+                '/usr/bin/ceph-conf',
+                '--cluster={cluster}'.format(
+                    cluster=cluster,
+                    ),
+                '--name=osd.',
+                '--lookup',
+                variable,
+                ],
+            stdout=subprocess.PIPE,
+            close_fds=True,
+            )
+    except OSError as e:
+        raise ActivateError('error executing ceph-conf', e)
+    (out, _err) = p.communicate()
+    ret = p.wait()
+    if ret == 1:
+        # config entry not found
+        return None
+    elif ret != 0:
+        raise ActivateError('getting variable from configuration failed')
+    value = out.split('\n', 1)[0]
+    # don't differentiate between "var=" and no var set
+    if not value:
+        return None
+    return value
+
+
+def mount(
+    dev,
+    fstype,
+    options,
+    ):
+    # pick best-of-breed mount options based on fs type
+    if options is None:
+        options = MOUNT_OPTIONS.get(fstype, '')
+
+    # mount
+    path = tempfile.mkdtemp(
+        prefix='mnt.',
+        dir='/var/lib/ceph/tmp',
+        )
+    try:
+        subprocess.check_call(
+            args=[
+                '/bin/mount',
+                '-o', options,
+                '--',
+                dev,
+                path,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        try:
+            os.rmdir(path)
+        except (OSError, IOError):
+            pass
+        raise MountError(e)
+
+    return path
+
+
+def unmount(
+    path,
+    ):
+    try:
+        subprocess.check_call(
+            args=[
+                '/bin/umount',
+                '--',
+                path,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        raise UnmountError(e)
+
+def mount_activate(
+    dev,
+    activate_key_template,
+    init,
+    ):
+
+    try:
+        fstype = detect_fstype(dev=dev)
+    except (subprocess.CalledProcessError,
+            TruncatedLineError,
+            TooManyLinesError) as e:
+        raise FilesystemTypeError(
+            'device {dev}'.format(dev=dev),
+            e,
+            )
+
+    # TODO always using mount options from cluster=ceph for
+    # now; see http://tracker.newdream.net/issues/3253
+    mount_options = get_conf(
+        cluster='ceph',
+        variable='osd_mount_options_{fstype}'.format(
+            fstype=fstype,
+            ),
+        )
+
+    if mount_options is None:
+        mount_options = get_conf(
+            cluster='ceph',
+            variable='osd_fs_mount_options_{fstype}'.format(
+                fstype=fstype,
+                ),
+            )
+
+    #remove whitespaces from mount_options
+    if mount_options is not None:
+        mount_options = "".join(mount_options.split())
+
+    path = mount(dev=dev, fstype=fstype, options=mount_options)
+
+    osd_id = None
+    cluster = None
+    try:
+        (osd_id, cluster) = activate(path, activate_key_template, init)
+
+        # check if the disk is already active, or if something else is already
+        # mounted there
+        active = False
+        other = False
+        src_dev = os.stat(path).st_dev
+        try:
+            dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
+                    cluster=cluster,
+                    osd_id=osd_id)).st_dev
+            if src_dev == dst_dev:
+                active = True
+            else:
+                parent_dev = os.stat('/var/lib/ceph/osd').st_dev
+                if dst_dev != parent_dev:
+                    other = True
+        except:
+            pass
+        if active:
+            log.info('%s osd.%s already mounted in position; unmounting ours.' % (cluster, osd_id))
+            unmount(path)
+        elif other:
+            raise ActivateError('another %s osd.%s already mounted in position (old/different cluster instance?); unmounting ours.' % (cluster, osd_id))
+        else:
+            move_mount(
+                path=path,
+                cluster=cluster,
+                osd_id=osd_id,
+                )
+        return (cluster, osd_id)
+
+    except:
+        log.error('Failed to activate')
+        unmount(path)
+        raise
+    finally:
+        # remove out temp dir
+        os.rmdir(path)
+
+
+def activate_dir(
+    path,
+    activate_key_template,
+    init,
+    ):
+
+    if not os.path.exists(path):
+        raise ActivateError(
+            'directory %s does not exist' % path
+            )
+
+    (osd_id, cluster) = activate(path, activate_key_template, init)
+    canonical = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
+        cluster=cluster,
+        osd_id=osd_id)
+    if path != canonical:
+        # symlink it from the proper location
+        create = True
+        if os.path.lexists(canonical):
+            old = os.readlink(canonical)
+            if old != path:
+                log.debug('Removing old symlink %s -> %s', canonical, old)
+                try:
+                    os.unlink(canonical)
+                except:
+                    raise ActivateError('unable to remove old symlink %s', canonical)
+            else:
+                create = False
+        if create:
+            log.debug('Creating symlink %s -> %s', canonical, path)
+            try:
+                os.symlink(path, canonical)
+            except:
+                raise ActivateError('unable to create symlink %s -> %s', canonical, path)
+
+    return (cluster, osd_id)
+
+
+def find_cluster_by_uuid(uuid):
+    """
+    Find a cluster name by searching /etc/ceph/*.conf for a conf file
+    with the right uuid.
+    """
+    no_fsid = []
+    if not os.path.exists('/etc/ceph'):
+        return None
+    for file in os.listdir('/etc/ceph'):
+        if not file.endswith('.conf'):
+            continue
+        cluster = file[:-5]
+        u = get_conf(cluster, 'fsid')
+        if u is None:
+            no_fsid.append(cluster)
+        elif u == uuid:
+            return cluster
+    # be tolerant of /etc/ceph/ceph.conf without an fsid defined.
+    if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
+        log.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway')
+        return 'ceph'
+    return None
+
+def activate(
+    path,
+    activate_key_template,
+    init,
+    ):
+
+    try:
+        check_osd_magic(path)
+
+        ceph_fsid = read_one_line(path, 'ceph_fsid')
+        if ceph_fsid is None:
+            raise ActivateError('No cluster uuid assigned.')
+        log.debug('Cluster uuid is %s', ceph_fsid)
+
+        cluster = find_cluster_by_uuid(ceph_fsid)
+        if cluster is None:
+            raise ActivateError('No cluster conf found in /etc/ceph with fsid %s' % ceph_fsid)
+        log.debug('Cluster name is %s', cluster)
+
+        fsid = read_one_line(path, 'fsid')
+        if fsid is None:
+            raise ActivateError('No OSD uuid assigned.')
+        log.debug('OSD uuid is %s', fsid)
+
+        keyring = activate_key_template.format(cluster=cluster)
+
+        osd_id = get_osd_id(path)
+        if osd_id is None:
+            osd_id = allocate_osd_id(
+                cluster=cluster,
+                fsid=fsid,
+                keyring=keyring,
+                )
+            write_one_line(path, 'whoami', osd_id)
+        log.debug('OSD id is %s', osd_id)
+
+        if not os.path.exists(os.path.join(path, 'ready')):
+            log.debug('Initializing OSD...')
+            # re-running mkfs is safe, so just run until it completes
+            mkfs(
+                path=path,
+                cluster=cluster,
+                osd_id=osd_id,
+                fsid=fsid,
+                keyring=keyring,
+                )
+
+        if init is not None:
+            if init == 'auto':
+                c = get_conf(
+                    cluster=cluster,
+                    variable='init'
+                    )
+                if c is not None:
+                    init = c
+                else:
+                    (distro, release, codename) = platform.dist()
+                    if distro == 'Ubuntu':
+                        init = 'upstart'
+                    else:
+                        init = 'sysvinit'
+
+            log.debug('Marking with init system %s', init)
+            with file(os.path.join(path, init), 'w'):
+                pass
+
+            # remove markers for others, just in case.
+            for other in INIT_SYSTEMS:
+                if other != init:
+                    try:
+                        os.unlink(os.path.join(path, other))
+                    except:
+                        pass
+
+        if not os.path.exists(os.path.join(path, 'active')):
+            log.debug('Authorizing OSD key...')
+            auth_key(
+                path=path,
+                cluster=cluster,
+                osd_id=osd_id,
+                keyring=keyring,
+                )
+            write_one_line(path, 'active', 'ok')
+        log.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path)
+        return (osd_id, cluster)
+    except:
+        raise
+
+
+
+def main_activate(args):
+    try:
+        cluster = None
+        osd_id = None
+
+        if not os.path.exists(args.path):
+            raise ActivateError('%s does not exist', args.path)
+
+        mode = os.stat(args.path).st_mode
+        if stat.S_ISBLK(mode):
+            (cluster, osd_id) = mount_activate(
+                dev=args.path,
+                activate_key_template=args.activate_key_template,
+                init=args.mark_init,
+                )
+        elif stat.S_ISDIR(mode):
+            (cluster, osd_id) = activate_dir(
+                path=args.path,
+                activate_key_template=args.activate_key_template,
+                init=args.mark_init,
+                )
+        else:
+            raise ActivateError('%s is not a directory or block device', args.path)
+
+        start_daemon(
+            cluster=cluster,
+            osd_id=osd_id,
+            )
+
+    except ActivateError as e:
+        print >>sys.stderr, '{prog}: {msg}'.format(
+            prog=args.prog,
+            msg=e,
+            )
+        sys.exit(1)
+
+
+
+###########################
+
+
+def list_dev(dev):
+    print '%s' % dev
+    
+def main_list(args):
+    ls = list_all_partitions()
+    log.debug('partitions are %s' % ls)
+
+    for base, parts in ls.iteritems():
+        if parts:
+            print '%s :' % base
+            for part in parts:
+                list_dev(part)
+        else:
+            list_dev(base)
+
+
+###########################
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        'Manage data disks or directories for use by a Ceph OSD',
+        )
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true', default=None,
+        help='be more verbose',
+        )
+    parser.set_defaults(
+        # we want to hold on to this, for later
+        prog=parser.prog,
+        cluster='ceph',
+        )
+
+    subparsers = parser.add_subparsers(help='sub-command help')
+
+    prepare_parser = subparsers.add_parser('prepare', help='Prepare a directory or disk for a Ceph OSD')
+    prepare_parser.add_argument(
+        '--cluster',
+        metavar='NAME',
+        help='cluster name to assign this disk to',
+        )
+    prepare_parser.add_argument(
+        '--cluster-uuid',
+        metavar='UUID',
+        help='cluster uuid to assign this disk to',
+        )
+    prepare_parser.add_argument(
+        '--osd-uuid',
+        metavar='UUID',
+        help='unique OSD uuid to assign this disk to',
+        )
+    prepare_parser.add_argument(
+        '--journal-uuid',
+        metavar='UUID',
+        help='unique uuid to assign to the journal',
+        )
+    prepare_parser.add_argument(
+        '--fs-type',
+        help='file system type to use (e.g. "ext4")',
+        )
+    prepare_parser.add_argument(
+        '--zap-disk',
+        action='store_true', default=None,
+        help='destroy the partition table (and content) of a disk',
+        )
+    prepare_parser.add_argument(
+        '--data-dir',
+        action='store_true', default=None,
+        help='verify that DATA is a dir',
+        )
+    prepare_parser.add_argument(
+        '--data-dev',
+        action='store_true', default=None,
+        help='verify that DATA is a block device',
+        )
+    prepare_parser.add_argument(
+        '--journal-file',
+        action='store_true', default=None,
+        help='verify that JOURNAL is a file',
+        )
+    prepare_parser.add_argument(
+        '--journal-dev',
+        action='store_true', default=None,
+        help='verify that JOURNAL is a block device',
+        )
+    prepare_parser.add_argument(
+        '--dmcrypt',
+        action='store_true', default=None,
+        help='encrypt DATA and/or JOURNAL devices with dm-crypt',
+        )
+    prepare_parser.add_argument(
+        '--dmcrypt-key-dir',
+        metavar='KEYDIR',
+        default='/etc/ceph/dmcrypt-keys',
+        help='directory where dm-crypt keys are stored',
+        )
+    prepare_parser.add_argument(
+        'data',
+        metavar='DATA',
+        help='path to OSD data (a disk block device or directory)',
+        )
+    prepare_parser.add_argument(
+        'journal',
+        metavar='JOURNAL',
+        nargs='?',
+        help=('path to OSD journal disk block device;'
+              + ' leave out to store journal in file'),
+        )
+
+    activate_parser = subparsers.add_parser('activate', help='Activate a Ceph OSD')
+    activate_parser.add_argument(
+        '--mount',
+        action='store_true', default=None,
+        help='mount a block device [deprecated, ignored]',
+        )
+    activate_parser.add_argument(
+        '--activate-key',
+        metavar='PATH',
+        help='bootstrap-osd keyring path template (%(default)s)',
+        dest='activate_key_template',
+        )
+    activate_parser.add_argument(
+        '--mark-init',
+        metavar='INITSYSTEM',
+        help='init system to manage this dir',
+        default='auto',
+        choices=INIT_SYSTEMS,
+        )
+    activate_parser.set_defaults(
+        activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring',
+        )
+    activate_parser.add_argument(
+        'path',
+        metavar='PATH',
+        nargs='?',
+        help='path to block device or directory',
+        )
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    loglevel = logging.INFO
+    if args.verbose:
+        loglevel = logging.DEBUG
+
+    logging.basicConfig(
+        level=loglevel,
+        )
+
+    if args.command == 'prepare':
+        main_prepare(args)
+    elif args.command == 'activate':
+        main_activate(args)
+    elif args.command == 'list':
+        main_list(args)
+    else:
+        log.error('unimplemented command %s', args.command)
+
+if __name__ == '__main__':
+    main()
index 9b5370f5d22054823d378e4632b49df9fb9bcea7..72e89f9af30be58f18399a46e8d9cbcf196cd47f 100755 (executable)
@@ -1,763 +1,3 @@
-#!/usr/bin/python
-
-import argparse
-import errno
-import logging
-import os
-import os.path
-import platform
-import re
-import subprocess
-import stat
-import sys
-import tempfile
-
-init_systems = [
-    'upstart',
-    'sysvinit',
-    'systemd',
-    'auto',
-    ]
-
-log_name = __name__
-if log_name == '__main__':
-    log_name = os.path.basename(sys.argv[0])
-log = logging.getLogger(log_name)
-
-
-class ActivateError(Exception):
-    """
-    OSD activation error
-    """
-
-    def __str__(self):
-        doc = self.__doc__.strip()
-        return ': '.join([doc] + [str(a) for a in self.args])
-
-
-class BadMagicError(ActivateError):
-    """
-    Does not look like a Ceph OSD, or incompatible version
-    """
-
-
-class TruncatedLineError(ActivateError):
-    """
-    Line is truncated
-    """
-
-
-class TooManyLinesError(ActivateError):
-    """
-    Too many lines
-    """
-
-
-class FilesystemTypeError(ActivateError):
-    """
-    Cannot discover filesystem type
-    """
-
-
-class MountError(ActivateError):
-    """
-    Mounting filesystem failed
-    """
-
-
-class UnmountError(ActivateError):
-    """
-    Unmounting filesystem failed
-    """
-
-
-def maybe_mkdir(*a, **kw):
-    # remove any symlink, if it is there..
-    if os.path.exists(*a) and stat.S_ISLNK(os.lstat(*a).st_mode):
-        log.debug('Removing old symlink at %s', *a)
-        os.unlink(*a)
-    try:
-        os.mkdir(*a, **kw)
-    except OSError, e:
-        if e.errno == errno.EEXIST:
-            pass
-        else:
-            raise
-
-
-def must_be_one_line(line):
-    if line[-1:] != '\n':
-        raise TruncatedLineError(line)
-    line = line[:-1]
-    if '\n' in line:
-        raise TooManyLinesError(line)
-    return line
-
-
-def read_one_line(parent, name):
-    """
-    Read a file whose sole contents are a single line.
-
-    Strips the newline.
-
-    :return: Contents of the line, or None if file did not exist.
-    """
-    path = os.path.join(parent, name)
-    try:
-        line = file(path, 'rb').read()
-    except IOError as e:
-        if e.errno == errno.ENOENT:
-            return None
-        else:
-            raise
-
-    try:
-        line = must_be_one_line(line)
-    except (TruncatedLineError, TooManyLinesError) as e:
-        raise ActivateError('File is corrupt: {path}: {msg}'.format(
-                path=path,
-                msg=e,
-                ))
-    return line
-
-
-def write_one_line(parent, name, text):
-    """
-    Write a file whose sole contents are a single line.
-
-    Adds a newline.
-    """
-    path = os.path.join(parent, name)
-    tmp = '{path}.{pid}.tmp'.format(path=path, pid=os.getpid())
-    with file(tmp, 'wb') as f:
-        f.write(text + '\n')
-        os.fsync(f.fileno())
-    os.rename(tmp, path)
-
-
-CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
-
-
-def check_osd_magic(path):
-    """
-    Check that this path has the Ceph OSD magic.
-
-    :raises: BadMagicError if this does not look like a Ceph OSD data
-    dir.
-    """
-    magic = read_one_line(path, 'magic')
-    if magic is None:
-        # probably not mkfs'ed yet
-        raise BadMagicError(path)
-    if magic != CEPH_OSD_ONDISK_MAGIC:
-        raise BadMagicError(path)
-
-
-def check_osd_id(osd_id):
-    """
-    Ensures osd id is numeric.
-    """
-    if not re.match(r'^[0-9]+$', osd_id):
-        raise ActivateError('osd id is not numeric')
-
-
-def get_osd_id(path):
-    osd_id = read_one_line(path, 'whoami')
-    if osd_id is not None:
-        check_osd_id(osd_id)
-    return osd_id
-
-
-# TODO depend on python2.7
-def _check_output(*args, **kwargs):
-    process = subprocess.Popen(
-        stdout=subprocess.PIPE,
-        *args, **kwargs)
-    out, _ = process.communicate()
-    ret = process.wait()
-    if ret:
-        cmd = kwargs.get("args")
-        if cmd is None:
-            cmd = args[0]
-        raise subprocess.CalledProcessError(ret, cmd, output=out)
-    return out
-
-
-def allocate_osd_id(
-    cluster,
-    fsid,
-    keyring,
-    ):
-    log.debug('Allocating OSD id...')
-    try:
-        osd_id = _check_output(
-            args=[
-                '/usr/bin/ceph',
-                '--cluster', cluster,
-                '--name', 'client.bootstrap-osd',
-                '--keyring', keyring,
-                'osd', 'create', '--concise',
-                fsid,
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        raise ActivateError('ceph osd create failed', e)
-    osd_id = must_be_one_line(osd_id)
-    check_osd_id(osd_id)
-    return osd_id
-
-
-def mkfs(
-    path,
-    cluster,
-    osd_id,
-    fsid,
-    keyring,
-    ):
-    monmap = os.path.join(path, 'activate.monmap')
-    subprocess.check_call(
-        args=[
-            '/usr/bin/ceph',
-            '--cluster', cluster,
-            '--name', 'client.bootstrap-osd',
-            '--keyring', keyring,
-            'mon', 'getmap', '-o', monmap,
-            ],
-        )
-
-    subprocess.check_call(
-        args=[
-            '/usr/bin/ceph-osd',
-            '--cluster', cluster,
-            '--mkfs',
-            '--mkkey',
-            '-i', osd_id,
-            '--monmap', monmap,
-            '--osd-data', path,
-            '--osd-journal', os.path.join(path, 'journal'),
-            '--osd-uuid', fsid,
-            '--keyring', os.path.join(path, 'keyring'),
-            ],
-        )
-    # TODO ceph-osd --mkfs removes the monmap file?
-    # os.unlink(monmap)
-
-
-def auth_key(
-    path,
-    cluster,
-    osd_id,
-    keyring,
-    ):
-    subprocess.check_call(
-        args=[
-            '/usr/bin/ceph',
-            '--cluster', cluster,
-            '--name', 'client.bootstrap-osd',
-            '--keyring', keyring,
-            'auth', 'add', 'osd.{osd_id}'.format(osd_id=osd_id),
-            '-i', os.path.join(path, 'keyring'),
-            'osd', 'allow *',
-            'mon', 'allow rwx',
-            ],
-        )
-
-
-def move_mount(
-    path,
-    cluster,
-    osd_id,
-    ):
-    log.debug('Moving mount to final location...')
-    parent = '/var/lib/ceph/osd'
-    osd_data = os.path.join(
-        parent,
-        '{cluster}-{osd_id}'.format(cluster=cluster, osd_id=osd_id),
-        )
-    maybe_mkdir(osd_data)
-    subprocess.check_call(
-        args=[
-            '/bin/mount',
-            '--move',
-            '--',
-            path,
-            osd_data,
-            ],
-        )
-
-
-def start_daemon(
-    cluster,
-    osd_id,
-    ):
-    log.debug('Starting %s osd.%s...', cluster, osd_id)
-
-    path = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
-        cluster=cluster, osd_id=osd_id)
-
-    # upstart?
-    try:
-        if os.path.exists(os.path.join(path,'upstart')):
-            subprocess.check_call(
-                args=[
-                    '/sbin/initctl',
-                    # use emit, not start, because start would fail if the
-                    # instance was already running
-                    'emit',
-                    # since the daemon starting doesn't guarantee much about
-                    # the service being operational anyway, don't bother
-                    # waiting for it
-                    '--no-wait',
-                    '--',
-                    'ceph-osd',
-                    'cluster={cluster}'.format(cluster=cluster),
-                    'id={osd_id}'.format(osd_id=osd_id),
-                    ],
-                )
-        elif os.path.exists(os.path.join(path, 'sysvinit')):
-            subprocess.check_call(
-                args=[
-                    '/usr/sbin/service',
-                    'ceph',
-                    'start',
-                    'osd.{osd_id}'.format(osd_id=osd_id),
-                    ],
-                )
-        else:
-            raise ActivateError('{cluster} osd.{osd_id} is not tagged with an init system'.format(
-                    cluster=cluster,
-                    osd_id=osd_id,
-                    ))
-    except subprocess.CalledProcessError as e:
-        raise ActivateError('ceph osd start failed', e)
-
-def detect_fstype(
-    dev,
-    ):
-    fstype = _check_output(
-        args=[
-            '/sbin/blkid',
-            # we don't want stale cached results
-            '-p',
-            '-s', 'TYPE',
-            '-o' 'value',
-            '--',
-            dev,
-            ],
-        )
-    fstype = must_be_one_line(fstype)
-    return fstype
-
-
-def get_conf(cluster, variable):
-    try:
-        p = subprocess.Popen(
-            args=[
-                '/usr/bin/ceph-conf',
-                '--cluster={cluster}'.format(
-                    cluster=cluster,
-                    ),
-                '--name=osd.',
-                '--lookup',
-                variable,
-                ],
-            stdout=subprocess.PIPE,
-            close_fds=True,
-            )
-    except OSError as e:
-        raise ActivateError('error executing ceph-conf', e)
-    (out, _err) = p.communicate()
-    ret = p.wait()
-    if ret == 1:
-        # config entry not found
-        return None
-    elif ret != 0:
-        raise ActivateError('getting variable from configuration failed')
-    value = out.split('\n', 1)[0]
-    # don't differentiate between "var=" and no var set
-    if not value:
-        return None
-    return value
-
-
-MOUNT_OPTIONS = dict(
-    btrfs='noatime,user_subvol_rm_allowed',
-    # user_xattr is default ever since linux 2.6.39 / 3.0, but we'll
-    # delay a moment before removing it fully because we did have some
-    # issues with ext4 before the xatts-in-leveldb work, and it seemed
-    # that user_xattr helped
-    ext4='noatime,user_xattr',
-    xfs='noatime',
-    )
-
-
-def mount(
-    dev,
-    fstype,
-    options,
-    ):
-    # pick best-of-breed mount options based on fs type
-    if options is None:
-        options = MOUNT_OPTIONS.get(fstype, '')
-
-    # mount
-    path = tempfile.mkdtemp(
-        prefix='mnt.',
-        dir='/var/lib/ceph/tmp',
-        )
-    try:
-        subprocess.check_call(
-            args=[
-                '/bin/mount',
-                '-o', options,
-                '--',
-                dev,
-                path,
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        try:
-            os.rmdir(path)
-        except (OSError, IOError):
-            pass
-        raise MountError(e)
-
-    return path
-
-
-def unmount(
-    path,
-    ):
-    try:
-        subprocess.check_call(
-            args=[
-                '/bin/umount',
-                '--',
-                path,
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        raise UnmountError(e)
-
-def mount_activate(
-    dev,
-    activate_key_template,
-    init,
-    ):
-
-    try:
-        fstype = detect_fstype(dev=dev)
-    except (subprocess.CalledProcessError,
-            TruncatedLineError,
-            TooManyLinesError) as e:
-        raise FilesystemTypeError(
-            'device {dev}'.format(dev=dev),
-            e,
-            )
-
-    # TODO always using mount options from cluster=ceph for
-    # now; see http://tracker.newdream.net/issues/3253
-    mount_options = get_conf(
-        cluster='ceph',
-        variable='osd_mount_options_{fstype}'.format(
-            fstype=fstype,
-            ),
-        )
-
-    if mount_options is None:
-        mount_options = get_conf(
-            cluster='ceph',
-            variable='osd_fs_mount_options_{fstype}'.format(
-                fstype=fstype,
-                ),
-            )
-
-    #remove whitespaces from mount_options
-    if mount_options is not None:
-        mount_options = "".join(mount_options.split())
-
-    path = mount(dev=dev, fstype=fstype, options=mount_options)
-
-    osd_id = None
-    cluster = None
-    try:
-        (osd_id, cluster) = activate(path, activate_key_template, init)
-
-        # check if the disk is already active, or if something else is already
-        # mounted there
-        active = False
-        other = False
-        src_dev = os.stat(path).st_dev
-        try:
-            dst_dev = os.stat('/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
-                    cluster=cluster,
-                    osd_id=osd_id)).st_dev
-            if src_dev == dst_dev:
-                active = True
-            else:
-                parent_dev = os.stat('/var/lib/ceph/osd').st_dev
-                if dst_dev != parent_dev:
-                    other = True
-        except:
-            pass
-        if active:
-            log.info('%s osd.%s already mounted in position; unmounting ours.' % (cluster, osd_id))
-            unmount(path)
-        elif other:
-            raise ActivateError('another %s osd.%s already mounted in position (old/different cluster instance?); unmounting ours.' % (cluster, osd_id))
-        else:
-            move_mount(
-                path=path,
-                cluster=cluster,
-                osd_id=osd_id,
-                )
-        return (cluster, osd_id)
-
-    except:
-        log.error('Failed to activate')
-        unmount(path)
-        raise
-    finally:
-        # remove out temp dir
-        os.rmdir(path)
-
-
-def activate_dir(
-    path,
-    activate_key_template,
-    init,
-    ):
-
-    if not os.path.exists(path):
-        raise ActivateError(
-            'directory %s does not exist' % path
-            )
-
-    (osd_id, cluster) = activate(path, activate_key_template, init)
-    canonical = '/var/lib/ceph/osd/{cluster}-{osd_id}'.format(
-        cluster=cluster,
-        osd_id=osd_id)
-    if path != canonical:
-        # symlink it from the proper location
-        create = True
-        if os.path.lexists(canonical):
-            old = os.readlink(canonical)
-            if old != path:
-                log.debug('Removing old symlink %s -> %s', canonical, old)
-                try:
-                    os.unlink(canonical)
-                except:
-                    raise ActivateError('unable to remove old symlink %s', canonical)
-            else:
-                create = False
-        if create:
-            log.debug('Creating symlink %s -> %s', canonical, path)
-            try:
-                os.symlink(path, canonical)
-            except:
-                raise ActivateError('unable to create symlink %s -> %s', canonical, path)
-
-    return (cluster, osd_id)
-
-
-def find_cluster_by_uuid(uuid):
-    """
-    Find a cluster name by searching /etc/ceph/*.conf for a conf file
-    with the right uuid.
-    """
-    no_fsid = []
-    if not os.path.exists('/etc/ceph'):
-        return None
-    for file in os.listdir('/etc/ceph'):
-        if not file.endswith('.conf'):
-            continue
-        cluster = file[:-5]
-        u = get_conf(cluster, 'fsid')
-        if u is None:
-            no_fsid.append(cluster)
-        elif u == uuid:
-            return cluster
-    # be tolerant of /etc/ceph/ceph.conf without an fsid defined.
-    if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
-        log.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway')
-        return 'ceph'
-    return None
-
-def activate(
-    path,
-    activate_key_template,
-    init,
-    ):
-
-    try:
-        check_osd_magic(path)
-
-        ceph_fsid = read_one_line(path, 'ceph_fsid')
-        if ceph_fsid is None:
-            raise ActivateError('No cluster uuid assigned.')
-        log.debug('Cluster uuid is %s', ceph_fsid)
-
-        cluster = find_cluster_by_uuid(ceph_fsid)
-        if cluster is None:
-            raise ActivateError('No cluster conf found in /etc/ceph with fsid %s' % ceph_fsid)
-        log.debug('Cluster name is %s', cluster)
-
-        fsid = read_one_line(path, 'fsid')
-        if fsid is None:
-            raise ActivateError('No OSD uuid assigned.')
-        log.debug('OSD uuid is %s', fsid)
-
-        keyring = activate_key_template.format(cluster=cluster)
-
-        osd_id = get_osd_id(path)
-        if osd_id is None:
-            osd_id = allocate_osd_id(
-                cluster=cluster,
-                fsid=fsid,
-                keyring=keyring,
-                )
-            write_one_line(path, 'whoami', osd_id)
-        log.debug('OSD id is %s', osd_id)
-
-        if not os.path.exists(os.path.join(path, 'ready')):
-            log.debug('Initializing OSD...')
-            # re-running mkfs is safe, so just run until it completes
-            mkfs(
-                path=path,
-                cluster=cluster,
-                osd_id=osd_id,
-                fsid=fsid,
-                keyring=keyring,
-                )
-
-        if init is not None:
-            if init == 'auto':
-                c = get_conf(
-                    cluster=cluster,
-                    variable='init'
-                    )
-                if c is not None:
-                    init = c
-                else:
-                    (distro, release, codename) = platform.dist()
-                    if distro == 'Ubuntu':
-                        init = 'upstart'
-                    else:
-                        init = 'sysvinit'
-
-            log.debug('Marking with init system %s', init)
-            with file(os.path.join(path, init), 'w'):
-                pass
-
-            # remove markers for others, just in case.
-            for other in init_systems:
-                if other != init:
-                    try:
-                        os.unlink(os.path.join(path, other))
-                    except:
-                        pass
-
-        if not os.path.exists(os.path.join(path, 'active')):
-            log.debug('Authorizing OSD key...')
-            auth_key(
-                path=path,
-                cluster=cluster,
-                osd_id=osd_id,
-                keyring=keyring,
-                )
-            write_one_line(path, 'active', 'ok')
-        log.debug('%s osd.%s data dir is ready at %s', cluster, osd_id, path)
-        return (osd_id, cluster)
-    except:
-        raise
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description='Activate a Ceph OSD',
-        )
-    parser.add_argument(
-        '-v', '--verbose',
-        action='store_true', default=None,
-        help='be more verbose',
-        )
-    parser.add_argument(
-        '--mount',
-        action='store_true', default=None,
-        help='mount a block device; path must follow',
-        )
-    parser.add_argument(
-        '--activate-key',
-        metavar='PATH',
-        help='bootstrap-osd keyring path template (%(default)s)',
-        dest='activate_key_template',
-        )
-    parser.add_argument(
-        'path',
-        metavar='PATH',
-        nargs='?',
-        help='path to block device or directory',
-        )
-    parser.add_argument(
-        '--mark-init',
-        metavar='INITSYSTEM',
-        help='init system to manage this dir',
-        default='auto',
-        choices=init_systems,
-        )
-    parser.set_defaults(
-        activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring',
-        # we want to hold on to this, for later
-        prog=parser.prog,
-        )
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = parse_args()
-
-    loglevel = logging.INFO
-    if args.verbose:
-        loglevel = logging.DEBUG
-
-    logging.basicConfig(
-        level=loglevel,
-        )
-
-    try:
-        cluster = None
-        osd_id = None
-
-        if not os.path.exists(args.path):
-            raise ActivateError('%s does not exist', args.path)
-
-        mode = os.stat(args.path).st_mode
-        if stat.S_ISBLK(mode):
-            (cluster, osd_id) = mount_activate(
-                dev=args.path,
-                activate_key_template=args.activate_key_template,
-                init=args.mark_init,
-                )
-        elif stat.S_ISDIR(mode):
-            (cluster, osd_id) = activate_dir(
-                path=args.path,
-                activate_key_template=args.activate_key_template,
-                init=args.mark_init,
-                )
-        else:
-            raise ActivateError('%s is not a directory or block device', args.path)
-
-        start_daemon(
-            cluster=cluster,
-            osd_id=osd_id,
-            )
-
-    except ActivateError as e:
-        print >>sys.stderr, '{prog}: {msg}'.format(
-            prog=args.prog,
-            msg=e,
-            )
-        sys.exit(1)
-
-if __name__ == '__main__':
-    main()
+#!/bin/sh
+dir=`dirname $0`
+$dir/ceph-disk activate $*
index 3d5cbf6fe1f160392d3a41b48719f015f2f7db6e..f9255eb88316825585fdd59972f063197f99e0b0 100755 (executable)
-#!/usr/bin/python
-
-import argparse
-import logging
-import os
-import os.path
-import re
-import subprocess
-import stat
-import sys
-import tempfile
-import uuid
-
-CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
-
-JOURNAL_UUID =         '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
-DMCRYPT_JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-5ec00ceff106'
-OSD_UUID =             '4fbd7e29-9d25-41b8-afd0-062c0ceff05d'
-DMCRYPT_OSD_UUID =     '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d'
-TOBE_UUID =            '89c57f98-2fe5-4dc0-89c1-f3ad0ceff2be'
-DMCRYPT_TOBE_UUID =    '89c57f98-2fe5-4dc0-89c1-5ec00ceff2be'
-
-DEFAULT_FS_TYPE = 'xfs'
-
-MOUNT_OPTIONS = dict(
-    btrfs='noatime,user_subvol_rm_allowed',
-    ext4='noatime,user_xattr',
-    xfs='noatime',
-    )
-
-MKFS_ARGS = dict(
-    btrfs=[
-        '-m', 'single',
-        '-l', '32768',
-        '-n', '32768',
-        ],
-    xfs=[
-        # xfs insists on not overwriting previous fs; even if we wipe
-        # partition table, we often recreate it exactly the same way,
-        # so we'll see ghosts of filesystems past
-        '-f',
-        '-i', 'size=2048',
-        ],
-    )
-
-
-log_name = __name__
-if log_name == '__main__':
-    log_name = os.path.basename(sys.argv[0])
-log = logging.getLogger(log_name)
-
-
-class PrepareError(Exception):
-    """
-    OSD preparation error
-    """
-
-    def __str__(self):
-        doc = self.__doc__.strip()
-        return ': '.join([doc] + [str(a) for a in self.args])
-
-
-class MountError(PrepareError):
-    """
-    Mounting filesystem failed
-    """
-
-
-class UnmountError(PrepareError):
-    """
-    Unmounting filesystem failed
-    """
-
-def list_partitions(disk):
-    """
-    Return a list of partitions on the given device
-    """
-    disk = os.path.realpath(disk)
-    assert not is_partition(disk)
-    assert disk.startswith('/dev/')
-    base = disk[5:]
-    ls = []
-    with file('/proc/partitions', 'rb') as f:
-        for line in f.read().split('\n')[2:]:
-            fields = re.split('\s+', line)
-            if len(fields) < 5:
-                continue
-            (_, major, minor, blocks, name) = fields
-            if name != base and name.startswith(base):
-                ls.append('/dev/' + name)
-    return ls
-
-def is_partition(dev):
-    """
-    Check whether a given device is a partition or a full disk.
-    """
-    dev = os.path.realpath(dev)
-    if not stat.S_ISBLK(os.lstat(dev).st_mode):
-        raise PrepareError('not a block device', dev)
-
-    # if the device ends in a number, it is a partition (e.g., /dev/sda3)
-    if dev[-1].isdigit():
-        return True
-    return False
-
-def is_mounted(dev):
-    """
-    Check if the given device is mounted.
-    """
-    dev = os.path.realpath(dev)
-    with file('/proc/mounts') as f:
-        for line in f.read().split('\n'):
-            d = line.split(' ')[0]
-            if os.path.exists(d):
-                d = os.path.realpath(d)
-                if dev == d:
-                    return True
-    return False
-
-def is_held(dev):
-    """
-    Check if a device is held by another device (e.g., a dm-crypt mapping)
-    """
-    assert os.path.exists(dev)
-    dev = os.path.realpath(dev)
-    base = dev[5:]
-    disk = base
-    while disk[-1].isdigit():
-        disk = disk[:-1]
-    dir = '/sys/block/{disk}/{base}/holders'.format(disk=disk, base=base)
-    if not os.path.exists(dir):
-        return []
-    return os.listdir(dir)
-
-def verify_not_in_use(dev):
-    assert os.path.exists(dev)
-    if is_partition(dev):
-        if is_mounted(dev):
-            raise PrepareError('Device is mounted', dev)
-        holders = is_held(dev)
-        if holders:
-            raise PrepareError('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
-    else:
-        for p in list_partitions(dev):
-            if is_mounted(p):
-                raise PrepareError('Device is mounted', p)
-            holders = is_held(p)
-            if holders:
-                raise PrepareError('Device %s is in use by a device-mapper mapping (dm-crypt?)' % p, ','.join(holders))
-
-def write_one_line(parent, name, text):
-    """
-    Write a file whose sole contents are a single line.
-
-    Adds a newline.
-    """
-    path = os.path.join(parent, name)
-    tmp = '{path}.{pid}.tmp'.format(path=path, pid=os.getpid())
-    with file(tmp, 'wb') as f:
-        f.write(text + '\n')
-        os.fsync(f.fileno())
-    os.rename(tmp, path)
-
-
-# TODO depend on python2.7
-def _check_output(*args, **kwargs):
-    process = subprocess.Popen(
-        stdout=subprocess.PIPE,
-        *args, **kwargs)
-    out, _ = process.communicate()
-    ret = process.wait()
-    if ret:
-        cmd = kwargs.get("args")
-        if cmd is None:
-            cmd = args[0]
-        raise subprocess.CalledProcessError(ret, cmd, output=out)
-    return out
-
-
-def get_conf(cluster, variable):
-    try:
-        p = subprocess.Popen(
-            args=[
-                'ceph-conf',
-                '--cluster={cluster}'.format(
-                    cluster=cluster,
-                    ),
-                '--name=osd.',
-                '--lookup',
-                variable,
-                ],
-            stdout=subprocess.PIPE,
-            close_fds=True,
-            )
-    except OSError as e:
-        raise PrepareError('error executing ceph-conf', e)
-    (out, _err) = p.communicate()
-    ret = p.wait()
-    if ret == 1:
-        # config entry not found
-        return None
-    elif ret != 0:
-        raise PrepareError('getting variable from configuration failed')
-    value = out.split('\n', 1)[0]
-    # don't differentiate between "var=" and no var set
-    if not value:
-        return None
-    return value
-
-
-def get_conf_with_default(cluster, variable):
-    """
-    Get a config value that is known to the C++ code.
-
-    This will fail if called on variables that are not defined in
-    common config options.
-    """
-    try:
-        out = _check_output(
-            args=[
-                'ceph-osd',
-                '--cluster={cluster}'.format(
-                    cluster=cluster,
-                    ),
-                '--show-config-value={variable}'.format(
-                    variable=variable,
-                    ),
-                ],
-            close_fds=True,
-            )
-    except subprocess.CalledProcessError as e:
-        raise PrepareError(
-            'getting variable from configuration failed',
-            e,
-            )
-
-    value = out.split('\n', 1)[0]
-    return value
-
-
-def get_fsid(cluster):
-    fsid = get_conf(cluster=cluster, variable='fsid')
-    if fsid is None:
-        raise PrepareError('getting cluster uuid from configuration failed')
-    return fsid
-
-
-def get_or_create_dmcrypt_key(
-    uuid,
-    key_dir,
-    ):
-    path = os.path.join(key_dir, uuid)
-
-    # already have it?
-    if os.path.exists(path):
-        return path
-
-    # make a new key
-    try:
-        if not os.path.exists(key_dir):
-            os.makedirs(key_dir)
-        with file('/dev/urandom', 'rb') as i:
-            key = i.read(256)
-            with file(path, 'wb') as f:
-                f.write(key)
-        return path
-    except:
-        raise PrepareError('unable to read or create dm-crypt key', path)
-
-
-def dmcrypt_map(
-    rawdev,
-    keypath,
-    uuid,
-    ):
-    dev = '/dev/mapper/'+ uuid
-    args = [
-        'cryptsetup',
-        '--key-file',
-        keypath,
-        '--key-size', '256',
-        'create',
-        uuid,
-        rawdev,
-        ]
-    try:
-        subprocess.check_call(args)
-        return dev
-
-    except subprocess.CalledProcessError as e:
-        raise PrepareError('unable to map device', rawdev)
-
-
-def dmcrypt_unmap(
-    uuid
-    ):
-    args = [
-        'cryptsetup',
-        'remove',
-        uuid
-    ]
-
-    try:
-        subprocess.check_call(args)
-
-    except subprocess.CalledProcessError as e:
-        raise PrepareError('unable to unmap device', uuid)
-
-
-def mount(
-    dev,
-    fstype,
-    options,
-    ):
-    # pick best-of-breed mount options based on fs type
-    if options is None:
-        options = MOUNT_OPTIONS.get(fstype, '')
-
-    # mount
-    path = tempfile.mkdtemp(
-        prefix='mnt.',
-        dir='/var/lib/ceph/tmp',
-        )
-    try:
-        log.debug('Mounting %s on %s with options %s', dev, path, options)
-        subprocess.check_call(
-            args=[
-                'mount',
-                '-o', options,
-                '--',
-                dev,
-                path,
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        try:
-            os.rmdir(path)
-        except (OSError, IOError):
-            pass
-        raise MountError(e)
-
-    return path
-
-
-def unmount(
-    path,
-    ):
-    try:
-        log.debug('Unmounting %s', path)
-        subprocess.check_call(
-            args=[
-                'umount',
-                '--',
-                path,
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        raise UnmountError(e)
-
-    os.rmdir(path)
-
-
-def get_free_partition_index(dev):
-    try:
-        lines = _check_output(
-            args=[
-                'parted',
-                '--machine',
-                '--',
-                dev,
-                'print',
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        print 'cannot read partition index; assume it isn\'t present\n'
-        return 1
-
-    if not lines:
-        raise PrepareError('parted failed to output anything')
-    lines = lines.splitlines(True)
-
-    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
-        raise PrepareError('weird parted units', lines[0])
-    del lines[0]
-
-    if not lines[0].startswith('/dev/'):
-        raise PrepareError('weird parted disk entry', lines[0])
-    del lines[0]
-
-    seen = set()
-    for line in lines:
-        idx, _ = line.split(':', 1)
-        idx = int(idx)
-        seen.add(idx)
-
-    num = 1
-    while num in seen:
-        num += 1
-    return num
-
-
-def zap(dev):
-    """
-    Destroy the partition table and content of a given disk.
-    """
-    try:
-        log.debug('Zapping partition table on %s', dev)
-
-        # try to wipe out any GPT partition table backups.  sgdisk
-        # isn't too thorough.
-        lba_size = 4096
-        size = 33 * lba_size
-        with file(dev, 'wb') as f:
-            f.seek(-size, os.SEEK_END)
-            f.write(size*'\0')
-
-        subprocess.check_call(
-            args=[
-                'sgdisk',
-                '--zap-all',
-                '--clear',
-                '--mbrtogpt',
-                '--',
-                dev,
-                ],
-            )
-    except subprocess.CalledProcessError as e:
-        raise PrepareError(e)
-
-
-def prepare_journal_dev(
-    data,
-    journal,
-    journal_size,
-    journal_uuid,
-    journal_dm_keypath,
-    ):
-
-    if is_partition(journal):
-        log.debug('Journal %s is a partition', journal)
-        log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-        return (journal, None, None)
-
-    key = None
-    ptype = JOURNAL_UUID
-    if journal_dm_keypath:
-        ptype = DMCRYPT_JOURNAL_UUID
-
-    # it is a whole disk.  create a partition!
-    num = None
-    if journal == data:
-        # we're sharing the disk between osd data and journal;
-        # make journal be partition number 2, so it's pretty; put
-        # journal at end of free space so partitioning tools don't
-        # reorder them suddenly
-        num = 2
-        journal_part = '{num}:-{size}M:0'.format(
-            num=num,
-            size=journal_size,
-            )
-    else:
-        # sgdisk has no way for me to say "whatever is the next
-        # free index number" when setting type guids etc, so we
-        # need to awkwardly look up the next free number, and then
-        # fix that in the call -- and hope nobody races with us;
-        # then again nothing guards the partition table from races
-        # anyway
-        num = get_free_partition_index(dev=journal)
-        journal_part = '{num}:0:+{size}M'.format(
-            num=num,
-            size=journal_size,
-            )
-        log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-
-    try:
-        log.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
-        subprocess.check_call(
-            args=[
-                'sgdisk',
-                '--new={part}'.format(part=journal_part),
-                '--change-name={num}:ceph journal'.format(num=num),
-                '--partition-guid={num}:{journal_uuid}'.format(
-                    num=num,
-                    journal_uuid=journal_uuid,
-                    ),
-                '--typecode={num}:{uuid}'.format(
-                    num=num,
-                    uuid=ptype,
-                    ),
-                '--',
-                journal,
-                ],
-            )
-        subprocess.check_call(
-            args=[
-                # also make sure the kernel refreshes the new table
-                'partprobe',
-                journal,
-                ],
-            )
-
-        journal_symlink = '/dev/disk/by-partuuid/{journal_uuid}'.format(
-            journal_uuid=journal_uuid,
-            )
-
-        journal_dmcrypt = None
-        if journal_dm_keypath:
-            journal_dmcrypt = journal_symlink
-            journal_symlink = '/dev/mapper/{uuid}'.format(uuid=journal_uuid)
-
-        log.debug('Journal is GPT partition %s', journal_symlink)
-        return (journal_symlink, journal_dmcrypt, journal_uuid)
-
-    except subprocess.CalledProcessError as e:
-        raise PrepareError(e)
-
-
-def prepare_journal_file(
-    journal,
-    journal_size):
-
-    if not os.path.exists(journal):
-        log.debug('Creating journal file %s with size %dM', journal, journal_size)
-        with file(journal, 'wb') as f:
-            f.truncate(journal_size * 1048576)
-
-    # FIXME: should we resize an existing journal file?
-
-    log.debug('Journal is file %s', journal)
-    log.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-    return (journal, None, None)
-
-
-def prepare_journal(
-    data,
-    journal,
-    journal_size,
-    journal_uuid,
-    force_file,
-    force_dev,
-    journal_dm_keypath,
-    ):
-
-    if journal is None:
-        if force_dev:
-            raise PrepareError('Journal is unspecified; not a block device')
-        return (None, None, None)
-
-    if not os.path.exists(journal):
-        if force_dev:
-            raise PrepareError('Journal does not exist; not a block device', journal)
-        return prepare_journal_file(journal, journal_size)
-
-    jmode = os.stat(journal).st_mode
-    if stat.S_ISREG(jmode):
-        if force_dev:
-            raise PrepareError('Journal is not a block device', journal)
-        return prepare_journal_file(journal, journal_size)
-
-    if stat.S_ISBLK(jmode):
-        if force_file:
-            raise PrepareError('Journal is not a regular file', journal)
-        return prepare_journal_dev(data, journal, journal_size, journal_uuid, journal_dm_keypath)
-
-    raise PrepareError('Journal %s is neither a block device nor regular file', journal)
-
-
-def adjust_symlink(target, path):
-    create = True
-    if os.path.lexists(path):
-        try:
-            mode = os.path.lstat(canonical).st_mode
-            if stat.S_ISREG(mode):
-                log.debug('Removing old file %s', canonical)
-                os.unlink(canonical)
-            elif stat.S_ISLNK(mode):
-                old = os.readlink(canonical)
-                if old != journal:
-                    log.debug('Removing old symlink %s -> %s', canonical, old)
-                    os.unlink(canonical)
-                else:
-                    create = False
-        except:
-            raise PrepareError('unable to remove (or adjust) old file (symlink)', canonical)
-    if create:
-        log.debug('Creating symlink %s -> %s', path, target)
-        try:
-            os.symlink(target, path)
-        except:
-            raise PrepareError('unable to create symlink %s -> %s' % (path, target))
-
-def prepare_dir(
-    path,
-    journal,
-    cluster_uuid,
-    osd_uuid,
-    journal_uuid,
-    journal_dmcrypt = None,
-    ):
-    log.debug('Preparing osd data dir %s', path)
-
-    if osd_uuid is None:
-        osd_uuid = str(uuid.uuid4())
-
-    if journal is not None:
-        # we're using an external journal; point to it here
-        adjust_symlink(journal, os.path.join(path, 'journal'))
-
-    if journal_dmcrypt is not None:
-        adjust_symlink(journal_dmcrypt, os.path.join(path, 'journal_dmcrypt'))
-    else:
-        try:
-            os.unlink(os.path.join(path, 'journal_dmcrypt'))
-        except:
-            pass
-
-    write_one_line(path, 'ceph_fsid', cluster_uuid)
-    write_one_line(path, 'fsid', osd_uuid)
-    write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
-
-    if journal_uuid is not None:
-        # i.e., journal is a tagged partition
-        write_one_line(path, 'journal_uuid', journal_uuid)
-
-def prepare_dev(
-    data,
-    journal,
-    fstype,
-    mkfs_args,
-    mount_options,
-    cluster_uuid,
-    osd_uuid,
-    journal_uuid,
-    journal_dmcrypt,
-    osd_dm_keypath,
-    ):
-    """
-    Prepare a data/journal combination to be used for an OSD.
-
-    The ``magic`` file is written last, so it's presence is a reliable
-    indicator of the whole sequence having completed.
-
-    WARNING: This will unconditionally overwrite anything given to
-    it.
-    """
-
-    ptype_tobe = TOBE_UUID
-    ptype_osd = OSD_UUID
-    if osd_dm_keypath:
-        ptype_tobe = DMCRYPT_TOBE_UUID
-        ptype_osd = DMCRYPT_OSD_UUID
-
-    rawdev = None
-    if is_partition(data):
-        log.debug('OSD data device %s is a partition', data)
-        rawdev = data
-    else:
-        log.debug('Creating osd partition on %s', data)
-        try:
-            subprocess.check_call(
-                args=[
-                    'sgdisk',
-                    '--largest-new=1',
-                    '--change-name=1:ceph data',
-                    '--partition-guid=1:{osd_uuid}'.format(
-                        osd_uuid=osd_uuid,
-                        ),
-                    '--typecode=1:%s' % ptype_tobe,
-                    '--',
-                    data,
-                    ],
-                )
-            subprocess.check_call(
-                args=[
-                    # also make sure the kernel refreshes the new table
-                    'partprobe',
-                    data,
-                    ],
-                )
-        except subprocess.CalledProcessError as e:
-            raise PrepareError(e)
-
-        rawdev = '{data}1'.format(data=data)
-
-    dev = None
-    if osd_dm_keypath:
-        dev = dmcrypt_map(rawdev, osd_dm_keypath, osd_uuid)
-    else:
-        dev = rawdev
-
-    try:
-        args = [
-            'mkfs',
-            '-t',
-            fstype,
-            ]
-        if mkfs_args is not None:
-            args.extend(mkfs_args.split())
-            if fstype == 'xfs':
-                args.extend(['-f'])  # always force
-        else:
-            args.extend(MKFS_ARGS.get(fstype, []))
-        args.extend([
-                '--',
-                dev,
-                ])
-        try:
-            log.debug('Creating %s fs on %s', fstype, dev)
-            subprocess.check_call(args=args)
-        except subprocess.CalledProcessError as e:
-            raise PrepareError(e)
-
-        #remove whitespaces from mount_options
-        if mount_options is not None:
-            mount_options = "".join(mount_options.split())
-
-        path = mount(dev=dev, fstype=fstype, options=mount_options)
-
-        try:
-            prepare_dir(
-                path=path,
-                journal=journal,
-                cluster_uuid=cluster_uuid,
-                osd_uuid=osd_uuid,
-                journal_uuid=journal_uuid,
-                journal_dmcrypt=journal_dmcrypt,
-                )
-        finally:
-            unmount(path)
-    finally:
-        if rawdev != dev:
-            dmcrypt_unmap(osd_uuid)
-
-    if not is_partition(data):
-        try:
-            subprocess.check_call(
-                args=[
-                    'sgdisk',
-                    '--typecode=1:%s' % ptype_osd,
-                    '--',
-                    data,
-                    ],
-                )
-            subprocess.check_call(
-                args=[
-                    # also make sure the kernel refreshes the new table
-                    'partprobe',
-                    data,
-                    ],
-                )
-        except subprocess.CalledProcessError as e:
-            raise PrepareError(e)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description='Prepare a directory for a Ceph OSD',
-        )
-    parser.add_argument(
-        '-v', '--verbose',
-        action='store_true', default=None,
-        help='be more verbose',
-        )
-    parser.add_argument(
-        '--cluster',
-        metavar='NAME',
-        help='cluster name to assign this disk to',
-        )
-    parser.add_argument(
-        '--cluster-uuid',
-        metavar='UUID',
-        help='cluster uuid to assign this disk to',
-        )
-    parser.add_argument(
-        '--osd-uuid',
-        metavar='UUID',
-        help='unique OSD uuid to assign this disk to',
-        )
-    parser.add_argument(
-        '--journal-uuid',
-        metavar='UUID',
-        help='unique uuid to assign to the journal',
-        )
-    parser.add_argument(
-        '--fs-type',
-        help='file system type to use (e.g. "ext4")',
-        )
-    parser.add_argument(
-        '--zap-disk',
-        action='store_true', default=None,
-        help='destroy the partition table (and content) of a disk',
-        )
-    parser.add_argument(
-        '--data-dir',
-        action='store_true', default=None,
-        help='verify that DATA is a dir',
-        )
-    parser.add_argument(
-        '--data-dev',
-        action='store_true', default=None,
-        help='verify that DATA is a block device',
-        )
-    parser.add_argument(
-        '--journal-file',
-        action='store_true', default=None,
-        help='verify that JOURNAL is a file',
-        )
-    parser.add_argument(
-        '--journal-dev',
-        action='store_true', default=None,
-        help='verify that JOURNAL is a block device',
-        )
-    parser.add_argument(
-        '--dmcrypt',
-        action='store_true', default=None,
-        help='encrypt DATA and/or JOURNAL devices with dm-crypt',
-        )
-    parser.add_argument(
-        '--dmcrypt-key-dir',
-        metavar='KEYDIR',
-        default='/etc/ceph/dmcrypt-keys',
-        help='directory where dm-crypt keys are stored',
-        )
-    parser.add_argument(
-        'data',
-        metavar='DATA',
-        help='path to OSD data (a disk block device or directory)',
-        )
-    parser.add_argument(
-        'journal',
-        metavar='JOURNAL',
-        nargs='?',
-        help=('path to OSD journal disk block device;'
-              + ' leave out to store journal in file'),
-        )
-    parser.set_defaults(
-        # we want to hold on to this, for later
-        prog=parser.prog,
-        cluster='ceph',
-        )
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = parse_args()
-
-    loglevel = logging.INFO
-    if args.verbose:
-        loglevel = logging.DEBUG
-
-    logging.basicConfig(
-        level=loglevel,
-        )
-
-    journal_dm_keypath = None
-    osd_dm_keypath = None
-
-    try:
-        if not os.path.exists(args.data):
-            raise PrepareError('data path does not exist', args.data)
-
-        # in use?
-        dmode = os.stat(args.data).st_mode
-        if stat.S_ISBLK(dmode):
-            verify_not_in_use(args.data)
-
-        if args.journal and os.path.exists(args.journal):
-            jmode = os.stat(args.journal).st_mode
-            if stat.S_ISBLK(jmode):
-                verify_not_in_use(args.journal)
-
-        if args.zap_disk is not None:
-            if stat.S_ISBLK(dmode) and not is_partition(args.data):
-                zap(args.data)
-            else:
-                raise PrepareError('not full block device; cannot zap', args.data)
-
-        if args.cluster_uuid is None:
-            args.cluster_uuid = get_fsid(cluster=args.cluster)
-            if args.cluster_uuid is None:
-                raise PrepareError(
-                    'must have fsid in config or pass --cluster--uuid=',
-                    )
-
-        if args.fs_type is None:
-            args.fs_type = get_conf(
-                cluster=args.cluster,
-                variable='osd_mkfs_type',
-                )
-            if args.fs_type is None:
-                args.fs_type = get_conf(
-                    cluster=args.cluster,
-                    variable='osd_fs_type',
-                    )
-            if args.fs_type is None:
-                args.fs_type = DEFAULT_FS_TYPE
-
-        mkfs_args = get_conf(
-            cluster=args.cluster,
-            variable='osd_mkfs_options_{fstype}'.format(
-                fstype=args.fs_type,
-                ),
-            )
-        if mkfs_args is None:
-            mkfs_args = get_conf(
-                cluster=args.cluster,
-                variable='osd_fs_mkfs_options_{fstype}'.format(
-                    fstype=args.fs_type,
-                    ),
-                )
-
-        mount_options = get_conf(
-            cluster=args.cluster,
-            variable='osd_mount_options_{fstype}'.format(
-                fstype=args.fs_type,
-                ),
-            )
-        if mount_options is None:
-            mount_options = get_conf(
-                cluster=args.cluster,
-                variable='osd_fs_mount_options_{fstype}'.format(
-                    fstype=args.fs_type,
-                    ),
-                )
-
-        journal_size = get_conf_with_default(
-            cluster=args.cluster,
-            variable='osd_journal_size',
-            )
-        journal_size = int(journal_size)
-
-        # colocate journal with data?
-        if stat.S_ISBLK(dmode) and not is_partition(args.data) and args.journal is None and args.journal_file is None:
-            log.info('Will colocate journal with data on %s', args.data)
-            args.journal = args.data
-
-        if args.journal_uuid is None:
-            args.journal_uuid = str(uuid.uuid4())
-        if args.osd_uuid is None:
-            args.osd_uuid = str(uuid.uuid4())
-
-        # dm-crypt keys?
-        if args.dmcrypt:
-            journal_dm_keypath = get_or_create_dmcrypt_key(args.journal_uuid, args.dmcrypt_key_dir)
-            osd_dm_keypath = get_or_create_dmcrypt_key(args.osd_uuid, args.dmcrypt_key_dir)
-
-        # prepare journal
-        (journal_symlink, journal_dmcrypt, journal_uuid) = prepare_journal(
-            data=args.data,
-            journal=args.journal,
-            journal_size=journal_size,
-            journal_uuid=args.journal_uuid,
-            force_file=args.journal_file,
-            force_dev=args.journal_dev,
-            journal_dm_keypath=journal_dm_keypath,
-            )
-
-        # prepare data
-        if stat.S_ISDIR(dmode):
-            if args.data_dev:
-                raise PrepareError('data path is not a block device', args.data)
-            prepare_dir(
-                path=args.data,
-                journal=journal_symlink,
-                cluster_uuid=args.cluster_uuid,
-                osd_uuid=args.osd_uuid,
-                journal_uuid=journal_uuid,
-                journal_dmcrypt=journal_dmcrypt,
-                )
-        elif stat.S_ISBLK(dmode):
-            if args.data_dir:
-                raise PrepareError('data path is not a directory', args.data)
-            prepare_dev(
-                data=args.data,
-                journal=journal_symlink,
-                fstype=args.fs_type,
-                mkfs_args=mkfs_args,
-                mount_options=mount_options,
-                cluster_uuid=args.cluster_uuid,
-                osd_uuid=args.osd_uuid,
-                journal_uuid=journal_uuid,
-                journal_dmcrypt=journal_dmcrypt,
-                osd_dm_keypath=osd_dm_keypath,
-                )
-        else:
-            raise PrepareError('not a dir or block device', args.data)
-
-    except PrepareError as e:
-        if journal_dm_keypath:
-            os.unlink(journal_dm_keypath)
-        if osd_dm_keypath:
-            os.unlink(osd_dm_keypath)
-        print >>sys.stderr, '{prog}: {msg}'.format(
-            prog=args.prog,
-            msg=e,
-            )
-        sys.exit(1)
-
-if __name__ == '__main__':
-    main()
+#!/bin/sh
+dir=`dirname $0`
+$dir/ceph-disk prepare $*