From b0134cc7a8707afdc94452aeaaa5eed834f965f2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Piotr=20Da=C5=82ek?= Date: Wed, 28 Jun 2017 15:47:03 +0200 Subject: [PATCH] qa: add force/cancel recovery/backfill to QA testing MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This randomly issues pg force-recovery/force-backfill and pg cancel-force-recovery/cancel-force-backfill during QA testing. Disabled for upgrades from hammer, jewel and kraken. Signed-off-by: Piotr Dałek --- .../jewel-x-singleton/3-thrash/default.yaml | 1 + .../3-thrash/default.yaml | 1 + .../stress-split/3-thrash/default.yaml | 1 + .../3-thrash/default.yaml | 1 + .../stress-split/3-thrash/default.yaml | 1 + qa/tasks/ceph_manager.py | 70 +++++++++++++++++++ 6 files changed, 75 insertions(+) diff --git a/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml b/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml index a83c77dab191d..da4c7ef94d08a 100644 --- a/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml +++ b/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml @@ -20,4 +20,5 @@ split_tasks: chance_thrash_cluster_full: 0 chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff --git a/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml b/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml index d3259109ffbee..b037bca5728c7 100644 --- a/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml +++ b/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml @@ -20,4 +20,5 @@ stress-tasks: chance_thrash_cluster_full: 0 chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff --git a/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml 
b/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml index 73449101449b0..d5d28fbd1e4e8 100644 --- a/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml +++ b/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml @@ -20,4 +20,5 @@ stress-tasks: chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 disable_objectstore_tool_tests: true + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff --git a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml index d3259109ffbee..b037bca5728c7 100644 --- a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml +++ b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml @@ -20,4 +20,5 @@ stress-tasks: chance_thrash_cluster_full: 0 chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff --git a/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml b/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml index 73449101449b0..d5d28fbd1e4e8 100644 --- a/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml +++ b/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml @@ -20,4 +20,5 @@ stress-tasks: chance_thrash_pg_upmap: 0 chance_thrash_pg_upmap_items: 0 disable_objectstore_tool_tests: true + chance_force_recovery: 0 - print: "**** done thrashosds 3-thrash" diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index d541aeacccc25..b9fcc1ea2bbb5 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -126,6 +126,7 @@ class Thrasher: self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0) self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0) self.random_eio = self.config.get('random_eio') + self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3) 
num_osds = self.in_osds + self.out_osds self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds @@ -603,6 +604,39 @@ class Thrasher: except CommandFailedError: self.log('Failed to rm-pg-upmap-items, ignoring') + def force_recovery(self): + """ + Force recovery or backfill on some of the PGs + """ + backfill = random.random() >= 0.5 + j = self.ceph_manager.get_pgids_to_force(backfill) + if j: + if backfill: + self.ceph_manager.raw_cluster_cmd('pg', 'force-backfill', *j) + else: + self.ceph_manager.raw_cluster_cmd('pg', 'force-recovery', *j) + + def cancel_force_recovery(self): + """ + Cancel forced recovery or backfill on some of the PGs + """ + backfill = random.random() >= 0.5 + j = self.ceph_manager.get_pgids_to_cancel_force(backfill) + if j: + if backfill: + self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-backfill', *j) + else: + self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-recovery', *j) + + def force_cancel_recovery(self): + """ + Force or cancel forcing recovery + """ + if random.random() >= 0.4: + self.force_recovery() + else: + self.cancel_force_recovery() + def all_up(self): """ Make sure all osds are up and not out. 
@@ -841,6 +875,8 @@ class Thrasher: actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,)) if self.chance_thrash_pg_upmap_items > 0: actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,)) + if self.chance_force_recovery > 0: + actions.append((self.force_cancel_recovery, self.chance_force_recovery)) for key in ['heartbeat_inject_failure', 'filestore_inject_stall']: for scenario in [ @@ -1786,6 +1822,40 @@ class CephManager: j = json.loads('\n'.join(out.split('\n')[1:])) return j['pg_stats'] + def get_pgids_to_force(self, backfill): + """ + Return the randomized list of PGs that can have their recovery/backfill forced + """ + j = self.get_pg_stats(); + pgids = [] + if backfill: + wanted = ['degraded', 'backfilling', 'backfill_wait'] + else: + wanted = ['recovering', 'degraded', 'recovery_wait'] + for pg in j: + status = pg['state'].split('+') + for t in wanted: + if random.random() > 0.5 and not ('forced_backfill' in status or 'forced_recovery' in status) and t in status: + pgids.append(pg['pgid']) + break + return pgids + + def get_pgids_to_cancel_force(self, backfill): + """ + Return the randomized list of PGs whose recovery/backfill priority is forced + """ + j = self.get_pg_stats(); + pgids = [] + if backfill: + wanted = 'forced_backfill' + else: + wanted = 'forced_recovery' + for pg in j: + status = pg['state'].split('+') + if wanted in status and random.random() > 0.5: + pgids.append(pg['pgid']) + return pgids + def compile_pg_status(self): """ Return a histogram of pg state values -- 2.39.5