From 99fc4a8d406291b65a53f157442bc54bc67e8b0d Mon Sep 17 00:00:00 2001 From: Adam King Date: Tue, 7 Nov 2023 15:49:57 -0500 Subject: [PATCH] mgr/cephadm: fix reweighting of OSD when OSD removal is stopped Previously, when you ran "ceph orch osd rm stop " cephadm would pass in a new OSD object to the removal queue that would not have any of the fields set previously for the OSD. This was mostly fine when removing it from the queue as those fields were no longer needed, but an exception was the initial weight, which you need if you want to set the weight back when you stop removal. This patch changes it so it will now remove the actual OSD object the removal queue stores so that we will get to use the previously set original weight. It also changes when we grab the original weight to make it happen earlier and adds it to the to_json so it survives any potential mgr failovers. Fixes: https://tracker.ceph.com/issues/63481 Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 3 +- src/pybind/mgr/cephadm/services/osd.py | 13 ++++++++- src/pybind/mgr/cephadm/tests/test_cephadm.py | 30 ++++++++++++++------ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index a83256d0bb7d2..05cd8d6b78557 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -3379,8 +3379,7 @@ Then run the following: """ for osd_id in osd_ids: try: - self.to_remove_osds.rm(OSD(osd_id=int(osd_id), - remove_util=self.to_remove_osds.rm_util)) + self.to_remove_osds.rm_by_osd_id(int(osd_id)) except (NotFoundError, KeyError, ValueError): return f'Unable to find OSD in the queue: {osd_id}' diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index bfecc57230abc..3616d42deee4e 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -664,6 +664,7 @@ class OSD: return None self.started = True self.stopped = False + self.original_weight = self.rm_util.get_weight(self) def start_draining(self) -> bool: if self.stopped: @@ -672,7 +673,6 @@ class OSD: if self.replace: self.rm_util.set_osd_flag([self], 'out') else: - self.original_weight = self.rm_util.get_weight(self) self.rm_util.reweight_osd(self, 0.0) self.drain_started_at = datetime.utcnow() self.draining = True @@ -761,6 +761,7 @@ class OSD: out['force'] = self.force out['zap'] = self.zap out['hostname'] = self.hostname # type: ignore + out['original_weight'] = self.original_weight for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']: if getattr(self, k): @@ -953,6 +954,16 @@ class OSDRemovalQueue(object): self.osds.add(osd) osd.start() + def rm_by_osd_id(self, osd_id: int) -> None: + osd: Optional["OSD"] = None + for o in self.osds: + if o.osd_id == osd_id: + osd = o + if not osd: + logger.debug(f"Could not find osd with id {osd_id} in queue.") + raise KeyError(f'No osd with id {osd_id} in removal queue') + self.rm(osd) + def rm(self, osd: "OSD") -> None: if not osd.exists: raise NotFoundError() diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index b6aef018cf09b..ebdf7eea5aa0c 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1287,7 +1287,11 @@ class TestCephadm(object): )) @mock.patch("cephadm.services.osd.OSD.exists", True) @mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0) - def test_remove_osds(self, cephadm_module): + @mock.patch("cephadm.services.osd.RemoveUtil.get_weight") + @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd") + def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module): + osd_initial_weight = 2.1 + _get_weight.return_value = osd_initial_weight with with_host(cephadm_module, 'test'): CephadmServe(cephadm_module)._refresh_host_daemons('test') c = cephadm_module.list_daemons() @@ -1297,13 +1301,23 @@ class TestCephadm(object): out = wait(cephadm_module, c) assert out == ["Removed osd.0 from host 'test'"] - cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0, - replace=False, - force=False, - hostname='test', - process_started_at=datetime_now(), - remove_util=cephadm_module.to_remove_osds.rm_util - )) + osd_0 = OSD(osd_id=0, + replace=False, + force=False, + hostname='test', + process_started_at=datetime_now(), + remove_util=cephadm_module.to_remove_osds.rm_util + ) + + cephadm_module.to_remove_osds.enqueue(osd_0) + _get_weight.assert_called() + + # test that OSD is properly reweighted on removal + cephadm_module.stop_remove_osds([0]) + _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight) + + # add OSD back to queue and test normal removal queue processing + cephadm_module.to_remove_osds.enqueue(osd_0) cephadm_module.to_remove_osds.process_removal_queue() assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module) -- 2.39.5