From: Mykola Golub
Date: Thu, 9 Sep 2021 11:44:25 +0000 (+0300)
Subject: qa/suites/rados: add backfill_toofull test
X-Git-Tag: v16.2.7~82^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ccd1c40433120a39daf2a0398a9da9517da2e5bb;p=ceph.git

qa/suites/rados: add backfill_toofull test

Signed-off-by: Mykola Golub
(cherry picked from commit 76743e005866664795e9240460734b31108824e2)
---

diff --git a/qa/suites/rados/singleton/all/backfill-toofull.yaml b/qa/suites/rados/singleton/all/backfill-toofull.yaml
new file mode 100644
index 00000000000..fcc3d0e2942
--- /dev/null
+++ b/qa/suites/rados/singleton/all/backfill-toofull.yaml
@@ -0,0 +1,37 @@
+roles:
+- - mon.a
+  - mon.b
+  - mon.c
+  - mgr.x
+  - osd.0
+  - osd.1
+  - osd.2
+  - osd.3
+openstack:
+  - volumes: # attached to each instance
+      count: 4
+      size: 10 # GB
+tasks:
+- install:
+- ceph:
+    create_rbd_pool: false
+    pre-mgr-commands:
+      - sudo ceph config set mgr mgr/devicehealth/enable_monitoring false --force
+    log-ignorelist:
+      - Error
+      - overall HEALTH_
+      - \(OBJECT_
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(PG_
+      - \(POOL_BACKFILLFULL\)
+      - \(POOL_NEARFULL\)
+      - \(SLOW_OPS\)
+      - \(TOO_FEW_PGS\)
+      - Monitor daemon marked osd\.[[:digit:]]+ down, but it is still running
+      - slow request
+    conf:
+      osd:
+        osd min pg log entries: 5
+        osd max pg log entries: 5
+- backfill_toofull:
diff --git a/qa/tasks/backfill_toofull.py b/qa/tasks/backfill_toofull.py
new file mode 100644
index 00000000000..f2156bea601
--- /dev/null
+++ b/qa/tasks/backfill_toofull.py
@@ -0,0 +1,181 @@
+"""
+Backfill_toofull
+"""
+import logging
+import time
+from tasks import ceph_manager
+from tasks.util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def wait_for_pg_state(manager, pgid, state, to_osd):
+    log.debug("waiting for pg %s to reach state %s" % (pgid, state))
+    for i in range(300):
+        time.sleep(5)
+        manager.flush_pg_stats([0, 1, 2, 3])
+        pgs = manager.get_pg_stats()
+        pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+        log.info('pg=%s' % pg)
+        assert pg
+        status = pg['state'].split('+')
+        if 'active' not in status:
+            log.debug('not active')
+            continue
+        if state not in status:
+            log.debug('not %s' % state)
+            continue
+        assert to_osd in pg['up']
+        return
+    assert False, '%s not in %s' % (pgid, state)
+
+
+def task(ctx, config):
+    """
+    Test that backfill reservation calculates the "toofull" condition correctly.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'backfill_toofull task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    profile = config.get('erasure_code_profile', {
+        'k': '2',
+        'm': '1',
+        'crush-failure-domain': 'osd'
+    })
+    profile_name = profile.get('name', 'backfill_toofull')
+    manager.create_erasure_code_profile(profile_name, profile)
+    pool = manager.create_pool_with_unique_name(
+        pg_num=1,
+        erasure_code_profile_name=profile_name,
+        min_size=2)
+    manager.raw_cluster_cmd('osd', 'pool', 'set', pool,
+                            'pg_autoscale_mode', 'off')
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    manager.wait_for_clean()
+
+    pool_id = manager.get_pool_num(pool)
+    pgid = '%d.0' % pool_id
+    pgs = manager.get_pg_stats()
+    acting = next((pg['acting'] for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug("acting=%s" % acting)
+    assert acting
+    primary = acting[0]
+    target = acting[1]
+
+    log.debug("write some data")
+    rados(ctx, mon, ['-p', pool, 'bench', '60', 'write', '--no-cleanup'])
+    df = manager.get_osd_df(target)
+    log.debug("target osd df: %s" % df)
+
+    total_kb = df['kb']
+    used_kb = df['kb_used']
+
+    log.debug("pause recovery")
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'nobackfill')
+    manager.raw_cluster_cmd('osd', 'set', 'norecover')
+
+    log.debug("stop target osd %s" % target)
+    manager.kill_osd(target)
+    manager.wait_till_active()
+
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+
+    log.debug("re-write data")
+    rados(ctx, mon, ['-p', pool, 'cleanup'])
+    time.sleep(10)
+    rados(ctx, mon, ['-p', pool, 'bench', '60', 'write', '--no-cleanup'])
+
+    df = manager.get_osd_df(primary)
+    log.debug("primary osd df: %s" % df)
+
+    primary_used_kb = df['kb_used']
+
+    log.info("test backfill reservation rejected with toofull")
+
+    # We set the backfillfull ratio to less than the new data size and expect
+    # the pg to enter the backfill_toofull state.
+    #
+    # We also need to update the nearfull ratio to prevent "full ratio(s) out of order".
+
+    backfillfull = 0.9 * primary_used_kb / total_kb
+    nearfull = backfillfull * 0.9
+
+    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
+              (nearfull, backfillfull))
+    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
+                            '{:.3f}'.format(nearfull))
+    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
+                            '{:.3f}'.format(backfillfull))
+
+    log.debug("start target osd %s" % target)
+
+    manager.revive_osd(target)
+    manager.wait_for_active()
+    manager.wait_till_osd_is_up(target)
+
+    wait_for_pg_state(manager, pgid, 'backfill_toofull', target)
+
+    log.info("test pg does not enter backfill_toofull after restarting backfill")
+
+    # We want to set the backfillfull ratio big enough for the target to
+    # successfully backfill the new data but smaller than the sum of the old
+    # and new data, so if the osd backfill reservation incorrectly calculates
+    # "toofull" the test will detect this (fail).
+    #
+    # We also need to update the nearfull ratio to prevent "full ratio(s) out of order".
+
+    backfillfull = min(used_kb + primary_used_kb, total_kb * 0.9) / total_kb
+    nearfull_min = max(used_kb, primary_used_kb) / total_kb
+    delta = backfillfull - nearfull_min
+    nearfull = nearfull_min + delta * 0.1
+    backfillfull = nearfull_min + delta * 0.2
+
+    log.debug("update nearfull ratio to %s and backfillfull ratio to %s" %
+              (nearfull, backfillfull))
+    manager.raw_cluster_cmd('osd', 'set-nearfull-ratio',
+                            '{:.3f}'.format(nearfull))
+    manager.raw_cluster_cmd('osd', 'set-backfillfull-ratio',
+                            '{:.3f}'.format(backfillfull))
+
+    wait_for_pg_state(manager, pgid, 'backfilling', target)
+
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.debug('pg=%s' % pg)
+    assert pg
+
+    log.debug("interrupt %s backfill" % target)
+    manager.mark_down_osd(target)
+    # after marking the target osd down it will automatically be
+    # marked up again soon
+
+    log.debug("resume recovery")
+    manager.raw_cluster_cmd('osd', 'unset', 'noout')
+    manager.raw_cluster_cmd('osd', 'unset', 'nobackfill')
+    manager.raw_cluster_cmd('osd', 'unset', 'norecover')
+
+    # wait for everything to peer, backfill and recover
+    manager.wait_for_clean()
+
+    pgs = manager.get_pg_stats()
+    pg = next((pg for pg in pgs if pg['pgid'] == pgid), None)
+    log.info('pg=%s' % pg)
+    assert pg
+    assert 'clean' in pg['state'].split('+')
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index eb078c478c1..2ff26a9cbf0 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -2324,6 +2324,15 @@ class CephManager:
         except KeyError:
             return j['pg_stats']
 
+    def get_osd_df(self, osdid):
+        """
+        Get the osd df stats
+        """
+        out = self.raw_cluster_cmd('osd', 'df', 'name', 'osd.{}'.format(osdid),
+                                   '--format=json')
+        j = json.loads('\n'.join(out.split('\n')[1:]))
+        return j['nodes'][0]
+
     def get_pgids_to_force(self, backfill):
         """
         Return the randomized list of PGs that can have their recovery/backfill forced
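
The two ratio updates in backfill_toofull.py carry the core reasoning of the test: the first pins the backfillfull ratio below the size of the newly written data, so the revived target's backfill reservation must be rejected as "toofull", while the second places it between the larger of the two data sets and their sum, so backfill can only complete if the reservation accounts for the old data being replaced rather than simply adding old and new together. The following standalone sketch (not part of the commit) reruns both calculations with made-up kb figures to make the thresholds concrete; the numbers are illustrative assumptions, not values from a real cluster.

# Hypothetical `ceph osd df` readings in KiB, mirroring the fields the new
# get_osd_df() helper returns (df['kb'] and df['kb_used']). All values here
# are invented for illustration.
total_kb = 10 * 1024 * 1024        # capacity of the stopped target osd
used_kb = 2 * 1024 * 1024          # old data still held by the target osd
primary_used_kb = 3 * 1024 * 1024  # new data written while the target was down

# Phase 1: backfillfull is set below the size of the new data, so the
# backfill reservation on the revived target must be rejected as toofull.
backfillfull = 0.9 * primary_used_kb / total_kb
nearfull = backfillfull * 0.9      # keep nearfull below backfillfull
print('phase 1: nearfull=%.3f backfillfull=%.3f' % (nearfull, backfillfull))
assert backfillfull * total_kb < primary_used_kb

# Phase 2: backfillfull lands between the larger of the two data sets and
# their sum, so the new data fits, but a reservation that simply added old
# and new data together would still (incorrectly) report toofull.
upper = min(used_kb + primary_used_kb, total_kb * 0.9) / total_kb
lower = max(used_kb, primary_used_kb) / total_kb
delta = upper - lower
nearfull = lower + delta * 0.1
backfillfull = lower + delta * 0.2
print('phase 2: nearfull=%.3f backfillfull=%.3f' % (nearfull, backfillfull))
assert lower < backfillfull < upper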