From 4d7e1e9188e984f21324a130a75c4ab52ac47ed7 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 14 Mar 2013 21:53:44 -0700 Subject: [PATCH] osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman Reported-by: Sam Just --- teuthology/task/osd_failsafe_enospc.py | 234 +++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 teuthology/task/osd_failsafe_enospc.py diff --git a/teuthology/task/osd_failsafe_enospc.py b/teuthology/task/osd_failsafe_enospc.py new file mode 100644 index 0000000000000..ed32e5cf0233f --- /dev/null +++ b/teuthology/task/osd_failsafe_enospc.py @@ -0,0 +1,234 @@ +from cStringIO import StringIO +import logging +import ceph_manager +from teuthology import misc as teuthology +import time +from ..orchestra import run + +log = logging.getLogger(__name__) + + +def rados(testdir, remote, cmd, wait=True): + log.info("rados %s" % ' '.join(cmd)) + pre = [ + '{tdir}/enable-coredump'.format(tdir=testdir), + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=testdir), + 'rados', + ]; + pre.extend(cmd) + proc = remote.run( + args=pre, + check_status=False, + wait=wait + ) + if wait: + return proc.exitstatus + else: + return proc + +def task(ctx, config): + """ + Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio + configuration settings + + In order for test to pass must use log-whitelist as follows + + tasks: + - chef: + - install: + - ceph: + log-whitelist: ['OSD near full', 'OSD full dropping all updates'] + - osd_failsafe_enospc: + + """ + if config is None: + config = {} + assert isinstance(config, dict), \ + 'osd_failsafe_enospc task only accepts a dict for configuration' + first_mon = teuthology.get_first_mon(ctx, config) + (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() + + testdir = teuthology.get_testdir(ctx) + manager = ceph_manager.CephManager( + mon, + ctx=ctx, + logger=log.getChild('ceph_manager'), + ) + ctx.manager = manager + + # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding + sleep_time = 50 + + # something that is always there + dummyfile = '/etc/fstab' + dummyfile2 = '/etc/resolv.conf' + + # create 1 pg pool with 1 rep which can only be on osd.0 + osds = manager.get_osd_dump() + for osd in osds: + if osd['osd'] != 0: + manager.mark_out_osd(osd['osd']) + + log.info('creating pool foo') + manager.create_pool("foo") + manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1') + + # State NONE -> NEAR + log.info('1. Verify warning messages when exceeding nearfull_ratio') + + proc = mon.run( + args=[ + '{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)), + 'kill', + 'ceph', '-w' + ], + stdin=run.PIPE, + stdout=StringIO(), + wait=False, + ) + + manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001') + + time.sleep(sleep_time) + proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w + proc.exitstatus.get() + + lines = proc.stdout.getvalue().split('\n') + + count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) + assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count + count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) + assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count + + # State NEAR -> FULL + log.info('2. Verify error messages when exceeding full_ratio') + + proc = mon.run( + args=[ + '{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)), + 'kill', + 'ceph', '-w' + ], + stdin=run.PIPE, + stdout=StringIO(), + wait=False, + ) + + manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001') + + time.sleep(sleep_time) + proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w + proc.exitstatus.get() + + lines = proc.stdout.getvalue().split('\n') + + count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) + assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count + + log.info('3. Verify write failure when exceeding full_ratio') + + # Write data should fail + ret = rados(testdir, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile]) + assert ret != 0, 'Expected write failure but it succeeded with exit status 0' + + # Put back default + manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97') + time.sleep(10) + + # State FULL -> NEAR + log.info('4. Verify write success when NOT exceeding full_ratio') + + # Write should succeed + ret = rados(testdir, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2]) + assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret + + log.info('5. Verify warning messages again when exceeding nearfull_ratio') + + proc = mon.run( + args=[ + '{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)), + 'kill', + 'ceph', '-w' + ], + stdin=run.PIPE, + stdout=StringIO(), + wait=False, + ) + + time.sleep(sleep_time) + proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w + proc.exitstatus.get() + + lines = proc.stdout.getvalue().split('\n') + + count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) + assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count + count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) + assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count + + manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90') + time.sleep(10) + + # State NONE -> FULL + log.info('6. Verify error messages again when exceeding full_ratio') + + proc = mon.run( + args=[ + '{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)), + 'kill', + 'ceph', '-w' + ], + stdin=run.PIPE, + stdout=StringIO(), + wait=False, + ) + + manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001') + + time.sleep(sleep_time) + proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w + proc.exitstatus.get() + + lines = proc.stdout.getvalue().split('\n') + + count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) + assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count + count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) + assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count + + # State FULL -> NONE + log.info('7. Verify no messages settings back to default') + + manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97') + time.sleep(10) + + proc = mon.run( + args=[ + '{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)), + 'kill', + 'ceph', '-w' + ], + stdin=run.PIPE, + stdout=StringIO(), + wait=False, + ) + + time.sleep(sleep_time) + proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w + proc.exitstatus.get() + + lines = proc.stdout.getvalue().split('\n') + + count = len(filter(lambda line: '[WRN] OSD near full' in line, lines)) + assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count + count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines)) + assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count + + log.info('Test Passed') + + # Bring all OSDs back in + manager.remove_pool("foo") + for osd in osds: + if osd['osd'] != 0: + manager.mark_in_osd(osd['osd']) -- 2.39.5