From 69a8d257394f26be901c06c847e2e816fe9183ff Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Fri, 14 Jan 2022 02:44:16 +0000 Subject: [PATCH] pybind/mgr/progress: disable pg recovery event by default The progress module disabled the pg recovery event by default since the event is expensive and has interrupted other serviceis when there is OSDs being marked in/out from the the cluster. To turn the event on manually: ceph config set mgr mgr/progress/allow_pg_recovery_event true Updated qa/tasks/mgr/test_progress.py to enable the pg recovery event when testing the progress module. Signed-off-by: Kamoltat (cherry picked from commit f06da20dff141dc239900f944001d55fb8296014) --- PendingReleaseNotes | 7 +++++ doc/mgr/progress.rst | 11 ++++++++ qa/tasks/mgr/test_progress.py | 43 +++++++++++++++++++++++++++++++ src/pybind/mgr/progress/module.py | 11 +++++++- 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index a43aa59f87cc..36f335d8450a 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -126,6 +126,13 @@ https://docs.ceph.com/en/latest/mgr/telemetry/ +* MGR: The progress module disables the pg recovery event by default + since the event is expensive and has interrupted other service when + there are OSDs being marked in/out from the the cluster. However, + the user may still enable this event anytime. For more details, see: + + https://docs.ceph.com/en/latest/mgr/progress/ + >=16.0.0 -------- * mgr/nfs: ``nfs`` module is moved out of volumes plugin. Prior using the diff --git a/doc/mgr/progress.rst b/doc/mgr/progress.rst index 19c1e11f94ac..77a8a408a9aa 100644 --- a/doc/mgr/progress.rst +++ b/doc/mgr/progress.rst @@ -45,3 +45,14 @@ Clear all ongoing and completed events: .. prompt:: bash # ceph progress clear + +PG Recovery Event +----------------- + +An event for each PG affected by recovery event can be shown in +`ceph progress` This is completely optional, and disabled by default +due to CPU overheard: + +.. prompt:: bash # + + ceph config set mgr mgr/progress/allow_pg_recovery_event true diff --git a/qa/tasks/mgr/test_progress.py b/qa/tasks/mgr/test_progress.py index 082653f62529..a80600c6a803 100644 --- a/qa/tasks/mgr/test_progress.py +++ b/qa/tasks/mgr/test_progress.py @@ -280,6 +280,11 @@ class TestProgress(MgrTestCase): self.mgr_cluster.mon_manager.raw_cluster_cmd( 'osd', 'in', str(osd['osd'])) + # Unset allow_pg_recovery_event in case it's set to true + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'config', 'set', 'mgr', + 'mgr/progress/allow_pg_recovery_event', 'false') + super(TestProgress, self).tearDown() def test_osd_healthy_recovery(self): @@ -288,6 +293,10 @@ class TestProgress(MgrTestCase): placement, and we wait for the PG to get healthy in its new locations. """ + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'config', 'set', 'mgr', + 'mgr/progress/allow_pg_recovery_event', 'true') + ev = self._simulate_failure() # Wait for progress event to ultimately reach completion @@ -301,6 +310,10 @@ class TestProgress(MgrTestCase): progress event to be correctly marked complete once there is no more data to move. """ + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'config', 'set', 'mgr', + 'mgr/progress/allow_pg_recovery_event', 'true') + ev = self._simulate_failure() self.mgr_cluster.mon_manager.remove_pool(self.POOL) @@ -317,6 +330,10 @@ class TestProgress(MgrTestCase): It should create another event for when osd is marked in and cancel the one that is still ongoing. """ + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'config', 'set', 'mgr', + 'mgr/progress/allow_pg_recovery_event', 'true') + ev1 = self._simulate_failure() ev2 = self._simulate_back_in([0], ev1) @@ -336,6 +353,9 @@ class TestProgress(MgrTestCase): coming in from other module, however, once it is turned back, on creating an event should be working as it is. """ + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'config', 'set', 'mgr', + 'mgr/progress/allow_pg_recovery_event', 'true') pool_size = 3 self._setup_pool(size=pool_size) @@ -378,3 +398,26 @@ class TestProgress(MgrTestCase): check_fn=lambda: self._is_inprogress_or_complete(ev1['id']), timeout=self.RECOVERY_PERIOD) self.assertTrue(self._is_quiet()) + + def test_default_progress_test(self): + """ + progress module disabled the event of pg recovery event + by default, we test this to see if this holds true + """ + pool_size = 3 + self._setup_pool(size=pool_size) + self._write_some_data(self.WRITE_PERIOD) + + with self.recovery_backfill_disabled(): + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'out', '0') + + time.sleep(self.EVENT_CREATION_PERIOD/2) + + with self.recovery_backfill_disabled(): + self.mgr_cluster.mon_manager.raw_cluster_cmd( + 'osd', 'in', '0') + + time.sleep(self.EVENT_CREATION_PERIOD/2) + + self.assertEqual(self._osd_in_out_events_count(), 0) diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index 422aba962a1c..7c98200faa6a 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -450,6 +450,13 @@ class Module(MgrModule): 'enabled', default=True, type='bool', + ), + Option( + 'allow_pg_recovery_event', + default=False, + type='bool', + desc='allow the module to show pg recovery progress', + runtime=True ) ] @@ -476,6 +483,7 @@ class Module(MgrModule): self.max_completed_events = 0 self.sleep_interval = 0 self.enabled = True + self.allow_pg_recovery_event = False def config_notify(self): for opt in self.MODULE_OPTIONS: @@ -718,7 +726,8 @@ class Module(MgrModule): self._dirty = False if self.enabled: - self._process_osdmap() + if self.allow_pg_recovery_event: + self._process_osdmap() self._process_pg_summary() self._shutdown.wait(timeout=self.sleep_interval) -- 2.47.3