From ec7934bb42fa17754249147474c18d7dfdf23f5b Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 7 Nov 2023 15:49:57 -0500 Subject: [PATCH] mgr/cephadm: fix reweighting of OSD when OSD removal is stopped Previously, when you ran "ceph orch osd rm stop " cephadm would pass in a new OSD object to the removal queue that would not have any of the fields set previously for the OSD. This was mostly fine when removing it from the queue as those fields were no longer needed, but an exception was the initial weight, which you need if you want to set the weight back when you stop removal. This patch changes it so it will now remove the actual OSD object the removal queue stores so that we will get to use the previously set original weight. It also changes when we grab the original weight to make it happen earlier and adds it to the to_json so it survives any potential mgr failovers. Fixes: https://tracker.ceph.com/issues/63481 Signed-off-by: Adam King (cherry picked from commit 99fc4a8d406291b65a53f157442bc54bc67e8b0d) --- src/pybind/mgr/cephadm/module.py | 3 +- src/pybind/mgr/cephadm/services/osd.py | 13 ++++++++- src/pybind/mgr/cephadm/tests/test_cephadm.py | 30 ++++++++++++++------ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 7c495357a1f48..ce7e0faffbad4 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -3171,8 +3171,7 @@ Then run the following: """ for osd_id in osd_ids: try: - self.to_remove_osds.rm(OSD(osd_id=int(osd_id), - remove_util=self.to_remove_osds.rm_util)) + self.to_remove_osds.rm_by_osd_id(int(osd_id)) except (NotFoundError, KeyError, ValueError): return f'Unable to find OSD in the queue: {osd_id}' diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index d777d7fc6fe43..9adab337c5d54 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -663,6 +663,7 @@ class OSD: return None self.started = True self.stopped = False + self.original_weight = self.rm_util.get_weight(self) def start_draining(self) -> bool: if self.stopped: @@ -671,7 +672,6 @@ class OSD: if self.replace: self.rm_util.set_osd_flag([self], 'out') else: - self.original_weight = self.rm_util.get_weight(self) self.rm_util.reweight_osd(self, 0.0) self.drain_started_at = datetime.utcnow() self.draining = True @@ -760,6 +760,7 @@ class OSD: out['force'] = self.force out['zap'] = self.zap out['hostname'] = self.hostname # type: ignore + out['original_weight'] = self.original_weight for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']: if getattr(self, k): @@ -952,6 +953,16 @@ class OSDRemovalQueue(object): self.osds.add(osd) osd.start() + def rm_by_osd_id(self, osd_id: int) -> None: + osd: Optional["OSD"] = None + for o in self.osds: + if o.osd_id == osd_id: + osd = o + if not osd: + logger.debug(f"Could not find osd with id {osd_id} in queue.") + raise KeyError(f'No osd with id {osd_id} in removal queue') + self.rm(osd) + def rm(self, osd: "OSD") -> None: if not osd.exists: raise NotFoundError() diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 8e09eb9400116..61f99775d719f 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1090,7 +1090,11 @@ class TestCephadm(object): )) @mock.patch("cephadm.services.osd.OSD.exists", True) @mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0) - def test_remove_osds(self, cephadm_module): + @mock.patch("cephadm.services.osd.RemoveUtil.get_weight") + @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd") + def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module): + osd_initial_weight = 2.1 + _get_weight.return_value = osd_initial_weight with with_host(cephadm_module, 'test'): CephadmServe(cephadm_module)._refresh_host_daemons('test') c = cephadm_module.list_daemons() @@ -1100,13 +1104,23 @@ class TestCephadm(object): out = wait(cephadm_module, c) assert out == ["Removed osd.0 from host 'test'"] - cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0, - replace=False, - force=False, - hostname='test', - process_started_at=datetime_now(), - remove_util=cephadm_module.to_remove_osds.rm_util - )) + osd_0 = OSD(osd_id=0, + replace=False, + force=False, + hostname='test', + process_started_at=datetime_now(), + remove_util=cephadm_module.to_remove_osds.rm_util + ) + + cephadm_module.to_remove_osds.enqueue(osd_0) + _get_weight.assert_called() + + # test that OSD is properly reweighted on removal + cephadm_module.stop_remove_osds([0]) + _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight) + + # add OSD back to queue and test normal removal queue processing + cephadm_module.to_remove_osds.enqueue(osd_0) cephadm_module.to_remove_osds.process_removal_queue() assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module) -- 2.39.5