qa/erasure-code: Teach the OSDThrasher to enable allow_ec_optimizations on pools

author Connor Fawcett <connorfa@uk.ibm.com>

Fri, 10 Oct 2025 11:28:31 +0000 (12:28 +0100)

committer Connor Fawcett <connorfa@uk.ibm.com>

Mon, 13 Oct 2025 11:11:57 +0000 (12:11 +0100)
author Connor Fawcett <connorfa@uk.ibm.com>
Fri, 10 Oct 2025 11:28:31 +0000 (12:28 +0100)
committer Connor Fawcett <connorfa@uk.ibm.com>
Mon, 13 Oct 2025 11:11:57 +0000 (12:11 +0100)
diff --git a/qa/suites/rados/thrash-erasure-code-big/ec_optimizations/ec_optimizations_off_then_on.yaml b/qa/suites/rados/thrash-erasure-code-big/ec_optimizations/ec_optimizations_off_then_on.yaml

new file mode 100644 (file)

index 0000000..c61dd03
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-big/ec_optimizations/ec_optimizations_off_then_on.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: '*'
+        osd_pool_default_flag_ec_optimizations: false
+  thrashosds:
+    ec_optimizations_off_then_on: true
diff --git a/qa/suites/rados/thrash-erasure-code-crush-4-nodes/ec_optimizations/ec_optimizations_off_then_on.yaml b/qa/suites/rados/thrash-erasure-code-crush-4-nodes/ec_optimizations/ec_optimizations_off_then_on.yaml

new file mode 100644 (file)

index 0000000..c61dd03
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-crush-4-nodes/ec_optimizations/ec_optimizations_off_then_on.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: '*'
+        osd_pool_default_flag_ec_optimizations: false
+  thrashosds:
+    ec_optimizations_off_then_on: true
diff --git a/qa/suites/rados/thrash-erasure-code-isa/ec_optimizations/ec_optimizations_off_then_on.yaml b/qa/suites/rados/thrash-erasure-code-isa/ec_optimizations/ec_optimizations_off_then_on.yaml

new file mode 100644 (file)

index 0000000..c61dd03
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-isa/ec_optimizations/ec_optimizations_off_then_on.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: '*'
+        osd_pool_default_flag_ec_optimizations: false
+  thrashosds:
+    ec_optimizations_off_then_on: true
diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/ec_optimizations/ec_optimizations_off_then_on.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/ec_optimizations/ec_optimizations_off_then_on.yaml

new file mode 100644 (file)

index 0000000..c61dd03
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-overwrites/ec_optimizations/ec_optimizations_off_then_on.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: '*'
+        osd_pool_default_flag_ec_optimizations: false
+  thrashosds:
+    ec_optimizations_off_then_on: true
diff --git a/qa/suites/rados/thrash-erasure-code-shec/ec_optimizations/ec_optimizations_off_then_on.yaml b/qa/suites/rados/thrash-erasure-code-shec/ec_optimizations/ec_optimizations_off_then_on.yaml

new file mode 100644 (file)

index 0000000..c61dd03
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code-shec/ec_optimizations/ec_optimizations_off_then_on.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: '*'
+        osd_pool_default_flag_ec_optimizations: false
+  thrashosds:
+    ec_optimizations_off_then_on: true
diff --git a/qa/suites/rados/thrash-erasure-code/ec_optimizations/ec_optimizations_off_then_on.yaml b/qa/suites/rados/thrash-erasure-code/ec_optimizations/ec_optimizations_off_then_on.yaml

new file mode 100644 (file)

index 0000000..c61dd03
--- /dev/null
+++ b/qa/suites/rados/thrash-erasure-code/ec_optimizations/ec_optimizations_off_then_on.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    conf:
+      global:
+        enable experimental unrecoverable data corrupting features: '*'
+        osd_pool_default_flag_ec_optimizations: false
+  thrashosds:
+    ec_optimizations_off_then_on: true
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py

index 0f7e92c5c2fb98af3c1dacbc43656e7fe53545e1..51671b084bafcb18d600376fafdb971c59c3e09a 100644 (file)
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -238,6 +238,7 @@ class OSDThrasher(Thrasher):
          self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
          self.chance_reset_purged_snaps_last = self.config.get('chance_reset_purged_snaps_last', 0.3)
          self.chance_trim_stale_osdmaps = self.config.get('chance_trim_stale_osdmaps', 0.3)
+        self.ec_opts_off_then_on = self.config.get('ec_optimizations_off_then_on', False)
  
          num_osds = self.in_osds + self.out_osds
          self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds)
@@ -283,6 +284,12 @@ class OSDThrasher(Thrasher):
              self.dump_ops_thread = gevent.spawn(self.do_dump_ops)
          if self.noscrub_toggle_delay:
              self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle)
+        if self.ec_opts_off_then_on:
+            # need delay to let some objects be written before enabling opts
+            self.log("ec_opts_off_then_on detected by thrasher")
+            delay = random.uniform(300, 900)
+            self.turn_on_opts_thread = gevent.spawn_later(delay,
+                                                          self.do_enable_ec_opts)
  
      def log(self, msg, *args, **kwargs):
          self.logger.info(msg, *args, **kwargs)
@@ -893,6 +900,9 @@ class OSDThrasher(Thrasher):
          if self.noscrub_toggle_delay:
              self.log("joining the do_noscrub_toggle greenlet")
              self.noscrub_toggle_thread.join()
+        if self.ec_opts_off_then_on:
+            self.log("joining the do_enable_ec_opts greenlet")
+            self.turn_on_opts_thread.join()
  
      def stop_and_join(self):
          """
@@ -1459,6 +1469,21 @@ class OSDThrasher(Thrasher):
          self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
          self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
  
+    @log_exc
+    def do_enable_ec_opts(self):
+        """Enable allow_ec_optimizations on each pool that reports it as 'false'."""
+        for pool in self.ceph_manager.pools:
+            # Pool helpers are reached through ceph_manager, like the other cluster calls here.
+            # NOTE(review): assumes get_pool_property returns the value as a str and that
+            # a failure on a non-EC pool is acceptable to propagate via log_exc - confirm.
+            opts_enabled = self.ceph_manager.get_pool_property(pool, 'allow_ec_optimizations')
+            # Compare by value; 'is' checks identity and is unreliable for strings.
+            if opts_enabled == 'false':
+                self.ceph_manager.set_pool_property(pool, 'allow_ec_optimizations', 'true')
+                self.log('Enabled ec optimizations on pool %s' % pool)
+            else:
+                self.log('Unable to enable ec optimizations on pool %s, ignoring' % pool)
+
      @log_exc
      def _do_thrash(self):
          """
author	Connor Fawcett <connorfa@uk.ibm.com>
	Fri, 10 Oct 2025 11:28:31 +0000 (12:28 +0100)
committer	Connor Fawcett <connorfa@uk.ibm.com>
	Mon, 13 Oct 2025 11:11:57 +0000 (12:11 +0100)
qa/suites/rados/thrash-erasure-code-big/ec_optimizations/ec_optimizations_off_then_on.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rados/thrash-erasure-code-crush-4-nodes/ec_optimizations/ec_optimizations_off_then_on.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rados/thrash-erasure-code-isa/ec_optimizations/ec_optimizations_off_then_on.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rados/thrash-erasure-code-overwrites/ec_optimizations/ec_optimizations_off_then_on.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rados/thrash-erasure-code-shec/ec_optimizations/ec_optimizations_off_then_on.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rados/thrash-erasure-code/ec_optimizations/ec_optimizations_off_then_on.yaml	[new file with mode: 0644]	patch \| blob
qa/tasks/ceph_manager.py		patch \| blob \| history