]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: fix reweighting of OSD when OSD removal is stopped 54401/head
authorAdam King <adking@redhat.com>
Tue, 7 Nov 2023 20:49:57 +0000 (15:49 -0500)
committerAdam King <adking@redhat.com>
Tue, 7 Nov 2023 20:58:35 +0000 (15:58 -0500)
Previously, when you ran "ceph orch osd rm stop <osd-id>"
cephadm would pass in a new OSD object to the removal
queue that would not have any of the fields set previously
for the OSD. This was mostly fine when removing it from
the queue as those fields were no longer needed, but an
exception was the initial weight, which you need if
you want to set the weight back when you stop removal.

This patch changes it so it will now remove the actual
OSD object the removal queue stores so that we will
get to use the previously set original weight. It also
changes when we grab the original weight to make it
happen earlier and adds it to the to_json so it survives
any potential mgr failovers.

Fixes: https://tracker.ceph.com/issues/63481
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/test_cephadm.py

index a83256d0bb7d23fc4788cf671bc3c0ad3f24ea12..05cd8d6b78557aec186482ec5a8a953042c70ef8 100644 (file)
@@ -3379,8 +3379,7 @@ Then run the following:
         """
         for osd_id in osd_ids:
             try:
-                self.to_remove_osds.rm(OSD(osd_id=int(osd_id),
-                                           remove_util=self.to_remove_osds.rm_util))
+                self.to_remove_osds.rm_by_osd_id(int(osd_id))
             except (NotFoundError, KeyError, ValueError):
                 return f'Unable to find OSD in the queue: {osd_id}'
 
index bfecc57230abc61fe312762470caa46a87734f77..3616d42deee4e9fdea9638a835ad688fed90e5fa 100644 (file)
@@ -664,6 +664,7 @@ class OSD:
             return None
         self.started = True
         self.stopped = False
+        self.original_weight = self.rm_util.get_weight(self)
 
     def start_draining(self) -> bool:
         if self.stopped:
@@ -672,7 +673,6 @@ class OSD:
         if self.replace:
             self.rm_util.set_osd_flag([self], 'out')
         else:
-            self.original_weight = self.rm_util.get_weight(self)
             self.rm_util.reweight_osd(self, 0.0)
         self.drain_started_at = datetime.utcnow()
         self.draining = True
@@ -761,6 +761,7 @@ class OSD:
         out['force'] = self.force
         out['zap'] = self.zap
         out['hostname'] = self.hostname  # type: ignore
+        out['original_weight'] = self.original_weight
 
         for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']:
             if getattr(self, k):
@@ -953,6 +954,16 @@ class OSDRemovalQueue(object):
             self.osds.add(osd)
         osd.start()
 
+    def rm_by_osd_id(self, osd_id: int) -> None:
+        osd: Optional["OSD"] = None
+        for o in self.osds:
+            if o.osd_id == osd_id:
+                osd = o
+        if not osd:
+            logger.debug(f"Could not find osd with id {osd_id} in queue.")
+            raise KeyError(f'No osd with id {osd_id} in removal queue')
+        self.rm(osd)
+
     def rm(self, osd: "OSD") -> None:
         if not osd.exists:
             raise NotFoundError()
index b6aef018cf09b807099b64494bf21d937214f35b..ebdf7eea5aa0cba70bd6139161086cf950cc50c3 100644 (file)
@@ -1287,7 +1287,11 @@ class TestCephadm(object):
     ))
     @mock.patch("cephadm.services.osd.OSD.exists", True)
     @mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0)
-    def test_remove_osds(self, cephadm_module):
+    @mock.patch("cephadm.services.osd.RemoveUtil.get_weight")
+    @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd")
+    def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module):
+        osd_initial_weight = 2.1
+        _get_weight.return_value = osd_initial_weight
         with with_host(cephadm_module, 'test'):
             CephadmServe(cephadm_module)._refresh_host_daemons('test')
             c = cephadm_module.list_daemons()
@@ -1297,13 +1301,23 @@ class TestCephadm(object):
             out = wait(cephadm_module, c)
             assert out == ["Removed osd.0 from host 'test'"]
 
-            cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0,
-                                                      replace=False,
-                                                      force=False,
-                                                      hostname='test',
-                                                      process_started_at=datetime_now(),
-                                                      remove_util=cephadm_module.to_remove_osds.rm_util
-                                                      ))
+            osd_0 = OSD(osd_id=0,
+                        replace=False,
+                        force=False,
+                        hostname='test',
+                        process_started_at=datetime_now(),
+                        remove_util=cephadm_module.to_remove_osds.rm_util
+                        )
+
+            cephadm_module.to_remove_osds.enqueue(osd_0)
+            _get_weight.assert_called()
+
+            # test that OSD is properly reweighted on removal
+            cephadm_module.stop_remove_osds([0])
+            _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight)
+
+            # add OSD back to queue and test normal removal queue processing
+            cephadm_module.to_remove_osds.enqueue(osd_0)
             cephadm_module.to_remove_osds.process_removal_queue()
             assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module)