From 8f720454cbb536ac6f410fb6c18b2b0945b70160 Mon Sep 17 00:00:00 2001
From: Sam Lang
Date: Tue, 22 Jan 2013 20:27:41 -0600
Subject: [PATCH] Assign devices to osds using the device wwn

Linux doesn't guarantee device names (/dev/sdb, etc.) are always
mapped to the same disk.  Instead of assigning nominal devices to
osds, we map devices by their wwn (/dev/disk/by-id/wwn-*) to an osd
(both data and journal).

Signed-off-by: Sam Lang
Reviewed-by: Josh Durgin
---
 teuthology/misc.py              | 74 ++++++++++++++++++++-------------
 teuthology/task/ceph.py         | 53 +++++++++++++----------
 teuthology/task/ceph_manager.py |  6 ++-
 3 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/teuthology/misc.py b/teuthology/misc.py
index fc40c72c0d..024bf051f1 100644
--- a/teuthology/misc.py
+++ b/teuthology/misc.py
@@ -347,6 +347,44 @@ def pull_directory_tarball(remote, remotedir, localfile):
         )
     proc.exitstatus.get()
 
+# returns map of devices to device id links:
+# /dev/sdb: /dev/disk/by-id/wwn-0xf00bad
+def get_wwn_id_map(remote, devs):
+    stdout = None
+    try:
+        r = remote.run(
+            args=[
+                'ls',
+                '-l',
+                '/dev/disk/by-id/wwn-*',
+                ],
+            stdout=StringIO(),
+            )
+        stdout = r.stdout.getvalue()
+    except:
+        return None
+
+    devmap = {}
+
+    # lines will be:
+    # lrwxrwxrwx 1 root root 9 Jan 22 14:58 /dev/disk/by-id/wwn-0x50014ee002ddecaf -> ../../sdb
+    for line in stdout.splitlines():
+        comps = line.split(' ')
+        # comps[-1] should be:
+        # ../../sdb
+        rdev = comps[-1]
+        # translate to /dev/sdb
+        dev='/dev/{d}'.format(d=rdev.split('/')[-1])
+
+        # comps[-3] should be:
+        # /dev/disk/by-id/wwn-0x50014ee002ddecaf
+        iddev = comps[-3]
+
+        if dev in devs:
+            devmap[dev] = iddev
+
+    return devmap
+
 def get_scratch_devices(remote):
     """
     Read the scratch disk list from remote host
@@ -356,33 +394,13 @@ def get_scratch_devices(remote):
         file_data = get_file(remote, "/scratch_devs")
         devs = file_data.split()
     except:
-        devs = [
-            '/dev/sda',
-            '/dev/sdb',
-            '/dev/sdc',
-            '/dev/sdd',
-            '/dev/sde',
-            '/dev/sdf',
-            '/dev/sdg',
-            '/dev/sdh',
-            '/dev/sdi',
-            '/dev/sdj',
-            '/dev/sdk',
-            '/dev/sdl',
-            '/dev/sdm',
-            '/dev/sdn',
-            '/dev/sdo',
-            '/dev/sdp',
-            '/dev/vda',
-            '/dev/vdb',
-            '/dev/vdc',
-            '/dev/vdd',
-            '/dev/vde',
-            '/dev/vdf',
-            '/dev/vdg',
-            '/dev/vdh',
-            '/dev/vdi',
-            ]
+        r = remote.run(
+            args=['ls', run.Raw('/dev/[sv]d*')],
+            stdout=StringIO()
+            )
+        devs = r.stdout.getvalue().split('\n')
+
+    log.debug('devs={d}'.format(d=devs))
 
     retval = []
     for dev in devs:
@@ -395,7 +413,7 @@
                 run.Raw('&&'),
                 # readable
                 'sudo', 'dd', 'if=%s' % dev, 'of=/dev/null', 'count=1',
-                run.Raw('&&'),
+                run.Raw('&&'),
                 # not mounted
                 run.Raw('!'),
                 'mount',
diff --git a/teuthology/task/ceph.py b/teuthology/task/ceph.py
index 541af0cb6b..63be429ede 100644
--- a/teuthology/task/ceph.py
+++ b/teuthology/task/ceph.py
@@ -332,26 +332,33 @@ def valgrind_post(ctx, config):
 
 def mount_osd_data(ctx, remote, osd):
     testdir = teuthology.get_testdir(ctx)
-    dev = ctx.disk_config.remote_to_roles_to_dev[remote][osd]
-    journal = ctx.disk_config.remote_to_roles_to_journals[remote][osd]
-    mount_options = ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][osd]
-    fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][osd]
-
-    remote.run(
+    log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
+    if remote in ctx.disk_config.remote_to_roles_to_dev and osd in ctx.disk_config.remote_to_roles_to_dev[remote]:
+        dev = ctx.disk_config.remote_to_roles_to_dev[remote][osd]
+        journal = ctx.disk_config.remote_to_roles_to_journals[remote][osd]
+        mount_options = ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][osd]
+        fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][osd]
+        mnt = os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=osd))
+
+        log.info('Mounting osd.{o}: dev: {n}, mountpoint: {p}, type: {t}, options: {v}'.format(
+                o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options))
+
+        remote.run(
             args=[
                 'sudo',
                 'mount',
                 '-t', fstype, '-o', ','.join(mount_options),
                 dev,
-                os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=osd)),
+                mnt,
             ]
             )
-    if journal == ('/mnt/osd.%s' % osd):
-        remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
-        tmpfs = '/mnt/osd.%s' % osd
-        remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
+        if journal == ('/mnt/osd.%s' % osd):
+            tmpfs = '/mnt/osd.%s' % osd
+            log.info('Creating journal file on tmpfs at {t}'.format(t=tmpfs))
+            remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
+            remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
 
 @contextlib.contextmanager
 def cluster(ctx, config):
@@ -379,18 +386,19 @@ def cluster(ctx, config):
         if config.get('fs'):
             log.info('fs option selected, checking for scratch devs')
             log.info('found devs: %s' % (str(devs),))
+            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
+            iddevs = devs_id_map.items()
             roles_to_devs = assign_devs(
-                teuthology.roles_of_type(roles_for_host, 'osd'), devs
+                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                 )
-            if len(roles_to_devs) < len(devs):
-                devs = devs[len(roles_to_devs):]
-            log.info('dev map: %s' % (str(roles_to_devs),))
+            if len(roles_to_devs) < len(iddevs):
+                iddevs = iddevs[len(roles_to_devs):]
             devs_to_clean[remote] = []
 
         if config.get('block_journal'):
             log.info('block journal enabled')
             roles_to_journals = assign_devs(
-                teuthology.roles_of_type(roles_for_host, 'osd'), devs
+                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                 )
             log.info('journal map: %s', roles_to_journals)
@@ -404,6 +412,7 @@
                 remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
             log.info('journal map: %s', roles_to_journals)
 
+        log.info('dev map: %s' % (str(roles_to_devs),))
         remote_to_roles_to_devs[remote] = roles_to_devs
         remote_to_roles_to_journals[remote] = roles_to_journals
 
@@ -730,12 +739,12 @@
                         os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=id_)),
                         ]
                     )
-                if not remote in ctx.disk_config.remotes_to_roles_to_dev_mount_options:
-                    ctx.disk_config.remotes_to_roles_to_dev_mount_options[remote] = {}
-                ctx.disk_config.remotes_to_roles_to_dev_mount_options[remote][id_] = mount_options
-                if not remote in ctx.disk_config.remotes_to_roles_to_dev_fstype:
-                    ctx.disk_config.remotes_to_roles_to_dev_fstype[remote] = {}
-                ctx.disk_config.remotes_to_roles_to_dev_fstype[remote][id_] = fs
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
+                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
+                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                 remote.run(
                     args=[
                         'sudo', 'chown', '-R', 'ubuntu.ubuntu',
diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 123c1fd949..8c413ab10b 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -664,8 +664,12 @@ class CephManager:
         if 'powercycle' in self.config and self.config['powercycle']:
             (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
             self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
-            remote.console.hard_reset()
+            remote.console.power_on()
+            if not remote.console.check_status(300):
+                raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
+            teuthology.reconnect(self.ctx, 60)
             ceph_task.mount_osd_data(self.ctx, remote, osd)
+            self.ctx.daemons.get_daemon('osd', osd).reset()
         self.ctx.daemons.get_daemon('osd', osd).restart()
 
     def mark_down_osd(self, osd):
-- 
2.39.5
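
For readers following the wwn mapping above, the sketch below is an illustration only and not part of the patch: it builds the same nominal-device-to-stable-id map that get_wwn_id_map() derives from `ls -l` output on the remote host, but does so locally by resolving the /dev/disk/by-id/wwn-* symlinks. The helper name build_wwn_map and the example device list are hypothetical.

    # Illustration (not from the patch): build {'/dev/sdb': '/dev/disk/by-id/wwn-...'}
    # locally by resolving the by-id symlinks rather than parsing `ls -l` output.
    import glob
    import os


    def build_wwn_map(devs):
        """Map nominal device paths to their stable wwn-based paths."""
        devmap = {}
        for iddev in glob.glob('/dev/disk/by-id/wwn-*'):
            # Each entry is a symlink, e.g.
            # /dev/disk/by-id/wwn-0x50014ee002ddecaf -> ../../sdb
            dev = os.path.realpath(iddev)
            if dev in devs:
                devmap[dev] = iddev
        return devmap


    if __name__ == '__main__':
        # Hypothetical example devices; only devices that exist on the
        # local host and have wwn links will appear in the result.
        print(build_wwn_map(['/dev/sda', '/dev/sdb']))

The patch itself parses `ls -l` because the lookup has to run on the remote test node through teuthology's run() helper, whereas resolving symlinks with os.path.realpath only works on the machine the code runs on.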