git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-disk-prepare, debian/control: Support external journals.
author Tommi Virtanen <tv@inktank.com>
Fri, 5 Oct 2012 17:57:42 +0000 (10:57 -0700)
committer Sage Weil <sage@inktank.com>
Wed, 17 Oct 2012 01:15:25 +0000 (18:15 -0700)
Previously, ceph-disk-* would only let you use a journal that was a
file inside the OSD data directory. With this, you can do:

  ceph-disk-prepare /dev/sdb /dev/sdb

to put the journal as a second partition on the same disk as the OSD
data (might save some file system overhead), or, more interestingly:

  ceph-disk-prepare /dev/sdb /dev/sdc

which makes it create a new partition on /dev/sdc to use as the
journal. Size of the partition is decided by $osd_journal_size.
/dev/sdc must be a GPT-format disk. Multiple OSDs may share the same
journal disk (using separate partitions); this way, a single fast SSD
can serve as journal for multiple spinning disks.

The second use case currently requires parted, so a Recommends: for
parted has been added to Debian packaging.

Closes: #3078
Closes: #3079
Signed-off-by: Tommi Virtanen <tv@inktank.com>
debian/control
src/ceph-disk-prepare

index 7bfb1a4bf096b3b945dfdb84b8465e64e6557ab1..b03fe89ed0118ea5c8222f40e7ac6dbdbf53506e 100644 (file)
@@ -12,7 +12,7 @@ Standards-Version: 3.9.3
 Package: ceph
 Architecture: linux-any
 Depends: ${shlibs:Depends}, ${misc:Depends}, sdparm | hdparm, binutils, ceph-common, uuid-runtime, python, xfsprogs
-Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk
+Recommends: ceph-mds, librados2, librbd1, btrfs-tools, gdisk, parted
 Description: distributed storage and file system
  Ceph is a distributed storage system designed to provide excellent
  performance, reliability, and scalability.
index b69f21e4bf3e612246bd5427a2c5f1104f4733de..ec3dd8250f3c05349587d93c81351cb34974b738 100755 (executable)
@@ -54,6 +54,23 @@ def write_one_line(parent, name, text):
 
 CEPH_OSD_ONDISK_MAGIC = 'ceph osd volume v026'
 
+JOURNAL_UUID = '45b0969e-9b03-4f30-b4c6-b4b80ceff106'
+
+
+# TODO depend on python2.7
+def _check_output(*args, **kwargs):
+    """
+    Run a command and return what it wrote to stdout.
+
+    Stand-in for subprocess.check_output, which only exists on
+    Python 2.7 and later. Takes the same arguments as
+    subprocess.Popen, except that stdout is always captured.
+
+    Raises subprocess.CalledProcessError, with the captured stdout
+    in its ``output`` attribute, if the command exits non-zero.
+    """
+    process = subprocess.Popen(
+        stdout=subprocess.PIPE,
+        *args, **kwargs)
+    out, _ = process.communicate()
+    # communicate() has already waited for the child, so this just
+    # hands back the exit status
+    ret = process.wait()
+    if ret:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = args[0]
+        # CalledProcessError only accepts output= as a constructor
+        # argument from Python 2.7 on; set the attribute by hand so
+        # this also works on the older interpreters this helper
+        # exists for
+        error = subprocess.CalledProcessError(ret, cmd)
+        error.output = out
+        raise error
+    return out
+
 
 def get_conf(cluster, variable):
     try:
@@ -86,6 +103,36 @@ def get_conf(cluster, variable):
     return value
 
 
+def get_conf_with_default(cluster, variable):
+    """
+    Ask ceph-osd for the value of a config option it knows about.
+
+    Runs ``ceph-osd --show-config-value`` for the given cluster and
+    returns the first line of its output as a string.
+
+    Raises PrepareError if the command exits non-zero.
+    """
+    cluster_opt = '--cluster={cluster}'.format(cluster=cluster)
+    show_opt = '--show-config-value={variable}'.format(variable=variable)
+    cmd = ['ceph-osd', cluster_opt, show_opt]
+
+    try:
+        out = _check_output(args=cmd, close_fds=True)
+    except subprocess.CalledProcessError as e:
+        raise PrepareError(
+            'getting variable from configuration failed',
+            e,
+            )
+
+    # only the first line of output is the value
+    first_line, _, _ = out.partition('\n')
+    return first_line
+
+
 def get_fsid(cluster):
     fsid = get_conf(cluster=cluster, variable='fsid')
     if fsid is None:
@@ -168,8 +215,48 @@ def unmount(
     os.rmdir(path)
 
 
+def get_free_partition_index(dev):
+    """
+    Return the lowest partition number not yet in use on ``dev``.
+
+    Parses the output of ``parted --machine -- DEV print``.
+
+    Raises PrepareError if parted fails or prints something that
+    cannot be understood.
+    """
+    try:
+        lines = _check_output(
+            args=[
+                'parted',
+                '--machine',
+                '--',
+                dev,
+                'print',
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        raise PrepareError('cannot read partition index', e)
+
+    if not lines:
+        raise PrepareError('parted failed to output anything')
+    lines = lines.splitlines(True)
+
+    # first line names the units parted is reporting in
+    if lines[0] not in ['CHS;\n', 'CYL;\n', 'BYT;\n']:
+        raise PrepareError('weird parted units', lines[0])
+    del lines[0]
+
+    # second line is expected to describe the disk itself; fail
+    # cleanly rather than with IndexError if the output stops short
+    if not lines:
+        raise PrepareError('parted output has no disk entry')
+    if not lines[0].startswith('/dev/'):
+        raise PrepareError('weird parted disk entry', lines[0])
+    del lines[0]
+
+    # each remaining line is expected to be one partition, with the
+    # partition number as the first colon-separated field
+    seen = set()
+    for line in lines:
+        if not line.strip():
+            # tolerate blank lines instead of crashing on them
+            continue
+        idx = line.split(':', 1)[0]
+        try:
+            idx = int(idx)
+        except ValueError:
+            raise PrepareError('weird parted partition entry', line)
+        seen.add(idx)
+
+    num = 1
+    while num in seen:
+        num += 1
+    return num
+
+
 def prepare(
     disk,
+    journal,
+    journal_size,
     fstype,
     mkfs_args,
     mount_options,
@@ -184,15 +271,78 @@ def prepare(
     WARNING: This will unconditionally overwrite anything given to
     it.
     """
-    osd_uuid = str(uuid.uuid4())
 
     try:
+        # this kills the crab
         subprocess.check_call(
             args=[
                 'sgdisk',
                 '--zap-all',
                 '--clear',
                 '--mbrtogpt',
+                '--',
+                disk,
+                ],
+            )
+    except subprocess.CalledProcessError as e:
+        raise PrepareError(e)
+
+    osd_uuid = str(uuid.uuid4())
+
+    # store the partition uuid iff using external journal
+    journal_uuid = None
+
+    if journal is not None:
+        journal_uuid = str(uuid.uuid4())
+
+        if journal == disk:
+            # we're sharing the disk between osd data and journal;
+            # make journal be partition number 2, so it's pretty; put
+            # journal at end of free space so partitioning tools don't
+            # reorder them suddenly
+            num = 2
+            journal_part = '{num}:-{size}M:0'.format(
+                num=num,
+                size=journal_size,
+                )
+        else:
+            # sgdisk has no way for me to say "whatever is the next
+            # free index number" when setting type guids etc, so we
+            # need to awkwardly look up the next free number, and then
+            # fix that in the call -- and hope nobody races with us;
+            # then again nothing guards the partition table from races
+            # anyway
+            num = get_free_partition_index(dev=journal)
+            journal_part = '{num}:0:{size}M'.format(
+                num=num,
+                size=journal_size,
+                )
+
+        try:
+            subprocess.check_call(
+                args=[
+                    'sgdisk',
+                    '--new={part}'.format(part=journal_part),
+                    '--change-name={num}:ceph journal'.format(num=num),
+                    '--partition-guid={num}:{journal_uuid}'.format(
+                        num=num,
+                        journal_uuid=journal_uuid,
+                        ),
+                    '--typecode={num}:{uuid}'.format(
+                        num=num,
+                        uuid=JOURNAL_UUID,
+                        ),
+                    '--',
+                    journal,
+                    ],
+                )
+        except subprocess.CalledProcessError as e:
+            raise PrepareError(e)
+
+    try:
+        subprocess.check_call(
+            args=[
+                'sgdisk',
                 '--largest-new=1',
                 '--change-name=1:ceph data',
                 '--partition-guid=1:{osd_uuid}'.format(
@@ -226,6 +376,14 @@ def prepare(
 
     path = mount(dev=dev, fstype=fstype, options=mount_options)
     try:
+        if journal_uuid is not None:
+            # we're using an external journal; point to it here
+            os.symlink(
+                '/dev/disk/by-partuuid/{journal_uuid}'.format(
+                    journal_uuid=journal_uuid,
+                    ),
+                os.path.join(path, 'journal'),
+                )
         write_one_line(path, 'ceph_fsid', cluster_uuid)
         write_one_line(path, 'fsid', osd_uuid)
         write_one_line(path, 'magic', CEPH_OSD_ONDISK_MAGIC)
@@ -273,6 +431,13 @@ def parse_args():
         metavar='DISK',
         help='path to OSD data disk block device',
         )
+    parser.add_argument(
+        'journal',
+        metavar='JOURNAL',
+        nargs='?',
+        help=('path to OSD journal disk block device;'
+              + ' leave out to store journal in file'),
+        )
     parser.set_defaults(
         # we want to hold on to this, for later
         prog=parser.prog,
@@ -323,8 +488,16 @@ def main():
                 ),
             )
 
+        journal_size = get_conf_with_default(
+            cluster=args.cluster,
+            variable='osd_journal_size',
+            )
+        journal_size = int(journal_size)
+
         prepare(
             disk=args.disk,
+            journal=args.journal,
+            journal_size=journal_size,
             fstype=args.fs_type,
             mkfs_args=mkfs_args,
             mount_options=mount_options,