]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
pybind/mgr/progress: disable pg recovery event by default 45043/head
author Kamoltat <ksirivad@redhat.com>
Fri, 14 Jan 2022 02:44:16 +0000 (02:44 +0000)
committer Kamoltat <ksirivad@redhat.com>
Tue, 15 Feb 2022 15:31:18 +0000 (15:31 +0000)
The progress module disabled the pg recovery event by default
since the event is expensive and has interrupted other services
when there are OSDs being marked in/out from the cluster.

To turn the event on manually:

ceph config set mgr mgr/progress/allow_pg_recovery_event true

Updated qa/tasks/mgr/test_progress.py to enable
the pg recovery event when testing the progress module.

Signed-off-by: Kamoltat <ksirivad@redhat.com>
(cherry picked from commit f06da20dff141dc239900f944001d55fb8296014)

PendingReleaseNotes
doc/mgr/progress.rst
qa/tasks/mgr/test_progress.py
src/pybind/mgr/progress/module.py

index a43aa59f87ccbd4ec9f2e80e33feb0d38bc5a7eb..36f335d8450a236aa95e349aec30240e2065c664 100644 (file)
 
   https://docs.ceph.com/en/latest/mgr/telemetry/
 
+* MGR: The progress module disables the pg recovery event by default
+  since the event is expensive and has interrupted other services when
+  there are OSDs being marked in/out from the cluster. However,
+  the user may still enable this event anytime. For more details, see:
+
+  https://docs.ceph.com/en/latest/mgr/progress/
+
 >=16.0.0
 --------
 * mgr/nfs: ``nfs`` module is moved out of volumes plugin. Prior using the
index 19c1e11f94ac171b57b3f6af807ff5716f34a0ae..77a8a408a9aadf36f72bd858a543800d8234909d 100644 (file)
@@ -45,3 +45,14 @@ Clear all ongoing and completed events:
 .. prompt:: bash #
 
   ceph progress clear
+
+PG Recovery Event
+-----------------
+
+An event for each PG affected by a recovery event can be shown in
+``ceph progress``. This is completely optional, and disabled by default
+due to CPU overhead. To enable it:
+
+.. prompt:: bash #
+
+  ceph config set mgr mgr/progress/allow_pg_recovery_event true
index 082653f625294c5620dcd27b421d4ffaddd51745..a80600c6a803587ef4cc758ec7108b0dd8f768f8 100644 (file)
@@ -280,6 +280,11 @@ class TestProgress(MgrTestCase):
                 self.mgr_cluster.mon_manager.raw_cluster_cmd(
                     'osd', 'in', str(osd['osd']))
 
+        # Unset allow_pg_recovery_event in case it's set to true
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mgr',
+            'mgr/progress/allow_pg_recovery_event', 'false')
+
         super(TestProgress, self).tearDown()
 
     def test_osd_healthy_recovery(self):
@@ -288,6 +293,10 @@ class TestProgress(MgrTestCase):
         placement, and we wait for the PG to get healthy in its new
         locations.
         """
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mgr',
+            'mgr/progress/allow_pg_recovery_event', 'true')
+
         ev = self._simulate_failure()
 
         # Wait for progress event to ultimately reach completion
@@ -301,6 +310,10 @@ class TestProgress(MgrTestCase):
         progress event to be correctly marked complete once there
         is no more data to move.
         """
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mgr',
+            'mgr/progress/allow_pg_recovery_event', 'true')
+
         ev = self._simulate_failure()
 
         self.mgr_cluster.mon_manager.remove_pool(self.POOL)
@@ -317,6 +330,10 @@ class TestProgress(MgrTestCase):
         It should create another event for when osd is marked in
         and cancel the one that is still ongoing.
         """
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mgr',
+            'mgr/progress/allow_pg_recovery_event', 'true')
+
         ev1 = self._simulate_failure()
 
         ev2 = self._simulate_back_in([0], ev1)
@@ -336,6 +353,9 @@ class TestProgress(MgrTestCase):
         coming in from other module, however, once it is turned
         back, on creating an event should be working as it is.
         """
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            'config', 'set', 'mgr',
+            'mgr/progress/allow_pg_recovery_event', 'true')
 
         pool_size = 3
         self._setup_pool(size=pool_size)
@@ -378,3 +398,26 @@ class TestProgress(MgrTestCase):
                              check_fn=lambda: self._is_inprogress_or_complete(ev1['id']),
                              timeout=self.RECOVERY_PERIOD)
         self.assertTrue(self._is_quiet())
+
+    def test_default_progress_test(self):
+        """
+        progress module disabled the event of pg recovery event
+        by default, we test this to see if this holds true
+        """
+        pool_size = 3
+        self._setup_pool(size=pool_size)
+        self._write_some_data(self.WRITE_PERIOD)        
+
+        with self.recovery_backfill_disabled():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'out', '0')
+
+        time.sleep(self.EVENT_CREATION_PERIOD/2)
+
+        with self.recovery_backfill_disabled():
+            self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                    'osd', 'in', '0')
+
+        time.sleep(self.EVENT_CREATION_PERIOD/2)
+
+        self.assertEqual(self._osd_in_out_events_count(), 0)
index 422aba962a1c0da4c63748f081dbe8e01b03d8e6..7c98200faa6a7f1fc1f1d3936303ab1c5356ed98 100644 (file)
@@ -450,6 +450,13 @@ class Module(MgrModule):
             'enabled',
             default=True,
             type='bool',
+        ),
+        Option(
+            'allow_pg_recovery_event',
+            default=False,
+            type='bool',
+            desc='allow the module to show pg recovery progress',
+            runtime=True
         )
     ]
 
@@ -476,6 +483,7 @@ class Module(MgrModule):
             self.max_completed_events = 0
             self.sleep_interval = 0
             self.enabled = True
+            self.allow_pg_recovery_event = False
 
     def config_notify(self):
         for opt in self.MODULE_OPTIONS:
@@ -718,7 +726,8 @@ class Module(MgrModule):
                 self._dirty = False
 
             if self.enabled:
-                self._process_osdmap()
+                if self.allow_pg_recovery_event:
+                    self._process_osdmap()
                 self._process_pg_summary()
 
             self._shutdown.wait(timeout=self.sleep_interval)