From b0e09c2215248e8d7f6de1c04045b29fcff7a946 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Fri, 19 Feb 2021 19:09:48 +0100 Subject: [PATCH] qa: Add bluestore resharing test Signed-off-by: Adam Kupczyk (cherry picked from commit a84820b7432926617d710cf05f0e93d0e7151b49) --- .../thrashers/default.yaml | 2 + .../thrashers/default.yaml | 2 + .../thrashers/default.yaml | 2 + qa/suites/rados/thrash/thrashers/default.yaml | 2 + qa/tasks/ceph_manager.py | 130 ++++++++++++++++++ 5 files changed, 138 insertions(+) diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml index c36175c50370..d2c7b85b50ec 100644 --- a/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml +++ b/qa/suites/rados/thrash-erasure-code-big/thrashers/default.yaml @@ -17,3 +17,5 @@ tasks: chance_pgnum_shrink: 1 chance_pgpnum_fix: 1 min_in: 8 + chance_bluestore_reshard: 1 + bluestore_new_sharding: random diff --git a/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml b/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml index 3dab3f9c5690..a869369fadd6 100644 --- a/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml +++ b/qa/suites/rados/thrash-erasure-code-shec/thrashers/default.yaml @@ -17,3 +17,5 @@ tasks: chance_pgnum_shrink: 1 chance_pgpnum_fix: 1 min_in: 8 + chance_bluestore_reshard: 1 + bluestore_new_sharding: random diff --git a/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml b/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml index e66dab90de60..3728bd8e7186 100644 --- a/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml +++ b/qa/suites/rados/thrash-erasure-code/thrashers/default.yaml @@ -16,3 +16,5 @@ tasks: chance_pgnum_shrink: 1 chance_pgpnum_fix: 1 min_in: 4 + chance_bluestore_reshard: 1 + bluestore_new_sharding: random diff --git a/qa/suites/rados/thrash/thrashers/default.yaml b/qa/suites/rados/thrash/thrashers/default.yaml index 05e0f8e76d11..5a300a9ff701 100644 --- a/qa/suites/rados/thrash/thrashers/default.yaml +++ b/qa/suites/rados/thrash/thrashers/default.yaml @@ -24,3 +24,5 @@ tasks: chance_pgnum_grow: 1 chance_pgnum_shrink: 1 chance_pgpnum_fix: 1 + chance_bluestore_reshard: 1 + bluestore_new_sharding: random diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index b84d564debd5..e65ee3a21d2e 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -239,6 +239,22 @@ class OSDThrasher(Thrasher): stdout=StringIO(), stderr=StringIO()) + def run_ceph_bluestore_tool(self, remote, osd, cmd): + if self.ceph_manager.cephadm: + return shell( + self.ceph_manager.ctx, self.ceph_manager.cluster, remote, + args=['ceph-bluestore-tool', '--err-to-stderr'] + cmd, + name=osd, + wait=True, check_status=False, + stdout=StringIO(), + stderr=StringIO()) + else: + return remote.run( + args=['sudo', 'ceph-bluestore-tool', '--err-to-stderr'] + cmd, + wait=True, check_status=False, + stdout=StringIO(), + stderr=StringIO()) + def kill_osd(self, osd=None, mark_down=False, mark_out=False): """ :param osd: Osd to be killed. @@ -953,6 +969,113 @@ class OSDThrasher(Thrasher): self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'none'], check_status=True, timeout=30, stdout=DEVNULL) + + def generate_random_sharding(self): + prefixes = [ + 'm','O','P','L' + ] + new_sharding = '' + for prefix in prefixes: + choose = random.choice([False, True]) + if not choose: + continue + if new_sharding != '': + new_sharding = new_sharding + ' ' + columns = random.randint(1, 5) + do_hash = random.choice([False, True]) + if do_hash: + low_hash = random.choice([0, 5, 8]) + do_high_hash = random.choice([False, True]) + if do_high_hash: + high_hash = random.choice([8, 16, 30]) + low_hash + new_sharding = new_sharding + prefix + '(' + str(columns) + ',' + str(low_hash) + '-' + str(high_hash) + ')' + else: + new_sharding = new_sharding + prefix + '(' + str(columns) + ',' + str(low_hash) + '-)' + else: + if columns == 1: + new_sharding = new_sharding + prefix + else: + new_sharding = new_sharding + prefix + '(' + str(columns) + ')' + return new_sharding + + def test_bluestore_reshard_action(self): + """ + Test if resharding of bluestore works properly. + If bluestore is not used, or bluestore is in version that + does not support sharding, skip. + """ + + osd = random.choice(self.dead_osds) + remote = self.ceph_manager.find_remote('osd', osd) + FSPATH = self.ceph_manager.get_filepath() + + prefix = [ + '--no-mon-config', + '--log-file=/var/log/ceph/bluestore_tool.$pid.log', + '--log-level=10', + '--path', FSPATH.format(id=osd) + ] + + # sanity check if bluestore-tool accessible + self.log('checking if target objectstore is bluestore on osd.%s' % osd) + cmd = prefix + [ + 'show-label' + ] + proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd) + if proc.exitstatus != 0: + raise Exception("ceph-bluestore-tool access failed.") + + # check if sharding is possible + self.log('checking if target bluestore supports sharding on osd.%s' % osd) + cmd = prefix + [ + 'show-sharding' + ] + proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd) + if proc.exitstatus != 0: + self.log("Unable to test resharding, " + "ceph-bluestore-tool does not support it.") + return + + # now go for reshard to something else + self.log('applying new sharding to bluestore on osd.%s' % osd) + new_sharding = self.config.get('bluestore_new_sharding','random') + + if new_sharding == 'random': + self.log('generate random sharding') + new_sharding = self.generate_random_sharding() + + self.log("applying new sharding: " + new_sharding) + cmd = prefix + [ + '--sharding', new_sharding, + 'reshard' + ] + proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd) + if proc.exitstatus != 0: + raise Exception("ceph-bluestore-tool resharding failed.") + + # now do fsck to + self.log('running fsck to verify new sharding on osd.%s' % osd) + cmd = prefix + [ + 'fsck' + ] + proc = self.run_ceph_bluestore_tool(remote, 'osd.%s' % osd, cmd) + if proc.exitstatus != 0: + raise Exception("ceph-bluestore-tool fsck failed.") + self.log('resharding successfully completed') + + def test_bluestore_reshard(self): + """ + 1) kills an osd + 2) reshards bluestore on killed osd + 3) revives the osd + """ + self.log('test_bluestore_reshard started') + self.kill_osd(mark_down=True, mark_out=True) + self.test_bluestore_reshard_action() + self.revive_osd() + self.log('test_bluestore_reshard completed') + + def test_map_discontinuity(self): """ 1) Allows the osds to recover @@ -1054,6 +1177,13 @@ class OSDThrasher(Thrasher): self.config.get('chance_inject_pause_long', 0),)]: actions.append(scenario) + # only consider resharding if objectstore is bluestore + cluster_name = self.ceph_manager.cluster + cluster = self.ceph_manager.ctx.ceph[cluster_name] + if cluster.conf.get('osd', {}).get('osd objectstore', 'bluestore') == 'bluestore': + actions.append((self.test_bluestore_reshard, + self.config.get('chance_bluestore_reshard', 0),)) + total = sum([y for (x, y) in actions]) val = random.uniform(0, total) for (action, prob) in actions: -- 2.47.3