From 8f720454cbb536ac6f410fb6c18b2b0945b70160 Mon Sep 17 00:00:00 2001
From: Sam Lang
Date: Tue, 22 Jan 2013 20:27:41 -0600
Subject: [PATCH] Assign devices to osds using the device wwn

Linux doesn't guarantee device names (/dev/sdb, etc.) are always
mapped to the same disk.  Instead of assigning nominal devices to
osds, we map devices by their wwn (/dev/disk/by-id/wwn-*) to an osd
(both data and journal).

Signed-off-by: Sam Lang
Reviewed-by: Josh Durgin
---
 teuthology/misc.py              | 74 ++++++++++++++++++++-------------
 teuthology/task/ceph.py         | 53 +++++++++++++----------
 teuthology/task/ceph_manager.py |  6 ++-
 3 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/teuthology/misc.py b/teuthology/misc.py
index fc40c72c0d..024bf051f1 100644
--- a/teuthology/misc.py
+++ b/teuthology/misc.py
@@ -347,6 +347,44 @@ def pull_directory_tarball(remote, remotedir, localfile):
         )
     proc.exitstatus.get()
 
+# returns map of devices to device id links:
+# /dev/sdb: /dev/disk/by-id/wwn-0xf00bad
+def get_wwn_id_map(remote, devs):
+    stdout = None
+    try:
+        r = remote.run(
+            args=[
+                'ls',
+                '-l',
+                '/dev/disk/by-id/wwn-*',
+                ],
+            stdout=StringIO(),
+            )
+        stdout = r.stdout.getvalue()
+    except:
+        return None
+
+    devmap = {}
+
+    # lines will be:
+    # lrwxrwxrwx 1 root root 9 Jan 22 14:58 /dev/disk/by-id/wwn-0x50014ee002ddecaf -> ../../sdb
+    for line in stdout.splitlines():
+        comps = line.split(' ')
+        # comps[-1] should be:
+        # ../../sdb
+        rdev = comps[-1]
+        # translate to /dev/sdb
+        dev='/dev/{d}'.format(d=rdev.split('/')[-1])
+
+        # comps[-3] should be:
+        # /dev/disk/by-id/wwn-0x50014ee002ddecaf
+        iddev = comps[-3]
+
+        if dev in devs:
+            devmap[dev] = iddev
+
+    return devmap
+
 def get_scratch_devices(remote):
     """
     Read the scratch disk list from remote host
@@ -356,33 +394,13 @@ def get_scratch_devices(remote):
         file_data = get_file(remote, "/scratch_devs")
         devs = file_data.split()
     except:
-        devs = [
-            '/dev/sda',
-            '/dev/sdb',
-            '/dev/sdc',
-            '/dev/sdd',
-            '/dev/sde',
-            '/dev/sdf',
-            '/dev/sdg',
-            '/dev/sdh',
-            '/dev/sdi',
-            '/dev/sdj',
-            '/dev/sdk',
-            '/dev/sdl',
-            '/dev/sdm',
-            '/dev/sdn',
-            '/dev/sdo',
-            '/dev/sdp',
-            '/dev/vda',
-            '/dev/vdb',
-            '/dev/vdc',
-            '/dev/vdd',
-            '/dev/vde',
-            '/dev/vdf',
-            '/dev/vdg',
-            '/dev/vdh',
-            '/dev/vdi',
-            ]
+        r = remote.run(
+            args=['ls', run.Raw('/dev/[sv]d*')],
+            stdout=StringIO()
+            )
+        devs = r.stdout.getvalue().split('\n')
+
+    log.debug('devs={d}'.format(d=devs))
 
     retval = []
     for dev in devs:
@@ -395,7 +413,7 @@
                 run.Raw('&&'),
                 # readable
                 'sudo', 'dd', 'if=%s' % dev, 'of=/dev/null', 'count=1',
-                run.Raw('&&'),
+                run.Raw('&&'),
                 # not mounted
                 run.Raw('!'),
                 'mount',
diff --git a/teuthology/task/ceph.py b/teuthology/task/ceph.py
index 541af0cb6b..63be429ede 100644
--- a/teuthology/task/ceph.py
+++ b/teuthology/task/ceph.py
@@ -332,26 +332,33 @@ def valgrind_post(ctx, config):
 
 def mount_osd_data(ctx, remote, osd):
     testdir = teuthology.get_testdir(ctx)
-    dev = ctx.disk_config.remote_to_roles_to_dev[remote][osd]
-    journal = ctx.disk_config.remote_to_roles_to_journals[remote][osd]
-    mount_options = ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][osd]
-    fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][osd]
-
-    remote.run(
+    log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
+    if remote in ctx.disk_config.remote_to_roles_to_dev and osd in ctx.disk_config.remote_to_roles_to_dev[remote]:
+        dev = ctx.disk_config.remote_to_roles_to_dev[remote][osd]
+        journal = ctx.disk_config.remote_to_roles_to_journals[remote][osd]
+        mount_options = ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][osd]
+        fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][osd]
+        mnt = os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=osd))
+
+        log.info('Mounting osd.{o}: dev: {n}, mountpoint: {p}, type: {t}, options: {v}'.format(
+                o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options))
+
+        remote.run(
             args=[
                 'sudo',
                 'mount',
                 '-t', fstype, '-o', ','.join(mount_options),
                 dev,
-                os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=osd)),
+                mnt,
             ]
             )
-    if journal == ('/mnt/osd.%s' % osd):
-        remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
-        tmpfs = '/mnt/osd.%s' % osd
-        remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
+        if journal == ('/mnt/osd.%s' % osd):
+            tmpfs = '/mnt/osd.%s' % osd
+            log.info('Creating journal file on tmpfs at {t}'.format(t=tmpfs))
+            remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
+            remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
 
 @contextlib.contextmanager
 def cluster(ctx, config):
@@ -379,18 +386,19 @@ def cluster(ctx, config):
         if config.get('fs'):
             log.info('fs option selected, checking for scratch devs')
             log.info('found devs: %s' % (str(devs),))
+            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
+            iddevs = devs_id_map.items()
             roles_to_devs = assign_devs(
-                teuthology.roles_of_type(roles_for_host, 'osd'), devs
+                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                 )
-            if len(roles_to_devs) < len(devs):
-                devs = devs[len(roles_to_devs):]
-            log.info('dev map: %s' % (str(roles_to_devs),))
+            if len(roles_to_devs) < len(iddevs):
+                iddevs = iddevs[len(roles_to_devs):]
             devs_to_clean[remote] = []
 
         if config.get('block_journal'):
             log.info('block journal enabled')
             roles_to_journals = assign_devs(
-                teuthology.roles_of_type(roles_for_host, 'osd'), devs
+                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                 )
             log.info('journal map: %s', roles_to_journals)
@@ -404,6 +412,7 @@
                 remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
             log.info('journal map: %s', roles_to_journals)
 
+        log.info('dev map: %s' % (str(roles_to_devs),))
         remote_to_roles_to_devs[remote] = roles_to_devs
         remote_to_roles_to_journals[remote] = roles_to_journals
 
@@ -730,12 +739,12 @@
                         os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=id_)),
                         ]
                     )
-                if not remote in ctx.disk_config.remotes_to_roles_to_dev_mount_options:
-                    ctx.disk_config.remotes_to_roles_to_dev_mount_options[remote] = {}
-                ctx.disk_config.remotes_to_roles_to_dev_mount_options[remote][id_] = mount_options
-                if not remote in ctx.disk_config.remotes_to_roles_to_dev_fstype:
-                    ctx.disk_config.remotes_to_roles_to_dev_fstype[remote] = {}
-                ctx.disk_config.remotes_to_roles_to_dev_fstype[remote][id_] = fs
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
+                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
+                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                 remote.run(
                     args=[
                         'sudo', 'chown', '-R', 'ubuntu.ubuntu',
diff --git a/teuthology/task/ceph_manager.py b/teuthology/task/ceph_manager.py
index 123c1fd949..8c413ab10b 100644
--- a/teuthology/task/ceph_manager.py
+++ b/teuthology/task/ceph_manager.py
@@ -664,8 +664,12 @@ class CephManager:
         if 'powercycle' in self.config and self.config['powercycle']:
             (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
             self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
-            remote.console.hard_reset()
+            remote.console.power_on()
+            if not remote.console.check_status(300):
+                raise Exception('Failed to revive osd.{o} via ipmi'.format(o=osd))
+            teuthology.reconnect(self.ctx, 60)
             ceph_task.mount_osd_data(self.ctx, remote, osd)
+            self.ctx.daemons.get_daemon('osd', osd).reset()
         self.ctx.daemons.get_daemon('osd', osd).restart()
 
     def mark_down_osd(self, osd):
-- 
2.39.5
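
For readers following the wwn mapping above, the sketch below is an illustration only and not part of the patch: it builds the same nominal-device-to-stable-id map that get_wwn_id_map() derives from `ls -l` output on the remote host, but does so locally by resolving the /dev/disk/by-id/wwn-* symlinks. The helper name build_wwn_map and the example device list are hypothetical.

    # Illustration (not from the patch): build {'/dev/sdb': '/dev/disk/by-id/wwn-...'}
    # locally by resolving the by-id symlinks rather than parsing `ls -l` output.
    import glob
    import os


    def build_wwn_map(devs):
        """Map nominal device paths to their stable wwn-based paths."""
        devmap = {}
        for iddev in glob.glob('/dev/disk/by-id/wwn-*'):
            # Each entry is a symlink, e.g.
            # /dev/disk/by-id/wwn-0x50014ee002ddecaf -> ../../sdb
            dev = os.path.realpath(iddev)
            if dev in devs:
                devmap[dev] = iddev
        return devmap


    if __name__ == '__main__':
        # Hypothetical example devices; only devices that exist on the
        # local host and have wwn links will appear in the result.
        print(build_wwn_map(['/dev/sda', '/dev/sdb']))

The patch itself parses `ls -l` because the lookup has to run on the remote test node through teuthology's run() helper, whereas resolving symlinks with os.path.realpath only works on the machine the code runs on.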