git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: add force/cancel recovery/backfill to QA testing
author: Piotr Dałek <piotr.dalek@corp.ovh.com>
Wed, 28 Jun 2017 13:47:03 +0000 (15:47 +0200)
committer: Piotr Dałek <piotr.dalek@corp.ovh.com>
Thu, 20 Jul 2017 07:35:55 +0000 (09:35 +0200)
This randomly issues pg force-recovery/force-backfill and
pg cancel-force-recovery/cancel-force-backfill during QA
testing. Disabled for upgrades from hammer, jewel and kraken.

Signed-off-by: Piotr Dałek <piotr.dalek@corp.ovh.com>
qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml
qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml
qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml
qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml
qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
qa/tasks/ceph_manager.py

index a83c77dab191dbab9d73ef7e7756b710d860221d..da4c7ef94d08ae4ccf0294fbeeedc932cc954772 100644 (file)
@@ -20,4 +20,5 @@ split_tasks:
         chance_thrash_cluster_full: 0
         chance_thrash_pg_upmap: 0
         chance_thrash_pg_upmap_items: 0
+        chance_force_recovery: 0
     - print: "**** done thrashosds 3-thrash"
index d3259109ffbee9c951662c6a8a205668cd2322d3..b037bca5728c70329284aeea3bdab4bb0e8c9f7a 100644 (file)
@@ -20,4 +20,5 @@ stress-tasks:
     chance_thrash_cluster_full: 0
     chance_thrash_pg_upmap: 0
     chance_thrash_pg_upmap_items: 0
+    chance_force_recovery: 0
 - print: "**** done thrashosds 3-thrash"
index 73449101449b086758be5b091bd4543696689315..d5d28fbd1e4e8a19e7a8e4d56ecc06c16ca7039a 100644 (file)
@@ -20,4 +20,5 @@ stress-tasks:
     chance_thrash_pg_upmap: 0
     chance_thrash_pg_upmap_items: 0
     disable_objectstore_tool_tests: true
+    chance_force_recovery: 0
 - print: "**** done thrashosds 3-thrash"
index d3259109ffbee9c951662c6a8a205668cd2322d3..b037bca5728c70329284aeea3bdab4bb0e8c9f7a 100644 (file)
@@ -20,4 +20,5 @@ stress-tasks:
     chance_thrash_cluster_full: 0
     chance_thrash_pg_upmap: 0
     chance_thrash_pg_upmap_items: 0
+    chance_force_recovery: 0
 - print: "**** done thrashosds 3-thrash"
index 73449101449b086758be5b091bd4543696689315..d5d28fbd1e4e8a19e7a8e4d56ecc06c16ca7039a 100644 (file)
@@ -20,4 +20,5 @@ stress-tasks:
     chance_thrash_pg_upmap: 0
     chance_thrash_pg_upmap_items: 0
     disable_objectstore_tool_tests: true
+    chance_force_recovery: 0
 - print: "**** done thrashosds 3-thrash"
index d541aeacccc25f3fcfa1b9c064d6a0863fd442a2..b9fcc1ea2bbb501996fe0f0e6de7a4fe89d8423d 100644 (file)
@@ -126,6 +126,7 @@ class Thrasher:
         self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0)
         self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
         self.random_eio = self.config.get('random_eio')
+        self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
 
         num_osds = self.in_osds + self.out_osds
         self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
@@ -603,6 +604,39 @@ class Thrasher:
         except CommandFailedError:
             self.log('Failed to rm-pg-upmap-items, ignoring')
 
+    def force_recovery(self):
+        """
+        Force backfill or recovery (picked at random) on some eligible PGs
+        """
+        backfill = random.random() >= 0.5  # even odds: backfill vs. recovery
+        j = self.ceph_manager.get_pgids_to_force(backfill)
+        if j:  # issue the command only when at least one PG was picked
+            if backfill:
+                self.ceph_manager.raw_cluster_cmd('pg', 'force-backfill', *j)
+            else:
+                self.ceph_manager.raw_cluster_cmd('pg', 'force-recovery', *j)
+
+    def cancel_force_recovery(self):
+        """
+        Cancel forced backfill or recovery (picked at random) on some PGs
+        """
+        backfill = random.random() >= 0.5  # even odds: backfill vs. recovery
+        j = self.ceph_manager.get_pgids_to_cancel_force(backfill)
+        if j:  # issue the command only when at least one PG was picked
+            if backfill:
+                self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-backfill', *j)
+            else:
+                self.ceph_manager.raw_cluster_cmd('pg', 'cancel-force-recovery', *j)
+
+    def force_cancel_recovery(self):
+        """
+        Force (about 60% of calls) or cancel forcing (about 40%) recovery/backfill
+        """
+        if random.random() >= 0.4:
+           self.force_recovery()
+        else:
+           self.cancel_force_recovery()
+
     def all_up(self):
         """
         Make sure all osds are up and not out.
@@ -841,6 +875,8 @@ class Thrasher:
             actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,))
         if self.chance_thrash_pg_upmap_items > 0:
             actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
+        if self.chance_force_recovery > 0:
+            actions.append((self.force_cancel_recovery, self.chance_force_recovery))
 
         for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
             for scenario in [
@@ -1786,6 +1822,40 @@ class CephManager:
         j = json.loads('\n'.join(out.split('\n')[1:]))
         return j['pg_stats']
 
+    def get_pgids_to_force(self, backfill):
+        """
+        Return random, not-yet-forced PG ids eligible for forced backfill (or recovery)
+        """
+        j = self.get_pg_stats();
+        pgids = []
+        if backfill:
+            wanted = ['degraded', 'backfilling', 'backfill_wait']
+        else:
+            wanted = ['recovering', 'degraded', 'recovery_wait']
+        for pg in j:
+            status = pg['state'].split('+')  # state is a '+'-joined list of flags
+            for t in wanted:
+                if random.random() > 0.5 and not ('forced_backfill' in status or 'forced_recovery' in status) and t in status:
+                    pgids.append(pg['pgid'])
+                    break  # list each PG at most once
+        return pgids
+
+    def get_pgids_to_cancel_force(self, backfill):
+       """
+       Return a random subset of PG ids whose backfill (or recovery) is currently forced
+       """
+       j = self.get_pg_stats();
+       pgids = []
+       if backfill:
+           wanted = 'forced_backfill'
+       else:
+           wanted = 'forced_recovery'
+       for pg in j:
+           status = pg['state'].split('+')  # state is a '+'-joined list of flags
+           if wanted in status and random.random() > 0.5:  # keep roughly half of the matches
+               pgids.append(pg['pgid'])
+       return pgids
+
     def compile_pg_status(self):
         """
         Return a histogram of pg state values