]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: fix reweighting of OSD when OSD removal is stopped
authorAdam King <adking@redhat.com>
Tue, 7 Nov 2023 20:49:57 +0000 (15:49 -0500)
committerAdam King <adking@redhat.com>
Sun, 10 Mar 2024 20:08:06 +0000 (16:08 -0400)
Previously, when you ran "ceph orch osd rm stop <osd-id>"
cephadm would pass in a new OSD object to the removal
queue that would not have any of the fields set previously
for the OSD. This was mostly fine when removing it from
the queue as those fields were no longer needed, but an
exception was the initial weight, which you need if
you want to set the weight back when you stop removal.

This patch changes it so it will now remove the actual
OSD object the removal queue stores so that we will
get to use the previously set original weight. It also
changes when we grab the original weight to make it
happen earlier and adds it to the to_json so it survives
any potential mgr failovers.

Fixes: https://tracker.ceph.com/issues/63481
Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 99fc4a8d406291b65a53f157442bc54bc67e8b0d)

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/test_cephadm.py

index b59cf6687f9f46df331ebdb051deef25c7b459d6..f9cfacecdabebcaa216d40158d8bc10484f4c853 100644 (file)
@@ -3456,8 +3456,7 @@ Then run the following:
         """
         for osd_id in osd_ids:
             try:
-                self.to_remove_osds.rm(OSD(osd_id=int(osd_id),
-                                           remove_util=self.to_remove_osds.rm_util))
+                self.to_remove_osds.rm_by_osd_id(int(osd_id))
             except (NotFoundError, KeyError, ValueError):
                 return f'Unable to find OSD in the queue: {osd_id}'
 
index bfecc57230abc61fe312762470caa46a87734f77..3616d42deee4e9fdea9638a835ad688fed90e5fa 100644 (file)
@@ -664,6 +664,7 @@ class OSD:
             return None
         self.started = True
         self.stopped = False
+        self.original_weight = self.rm_util.get_weight(self)
 
     def start_draining(self) -> bool:
         if self.stopped:
@@ -672,7 +673,6 @@ class OSD:
         if self.replace:
             self.rm_util.set_osd_flag([self], 'out')
         else:
-            self.original_weight = self.rm_util.get_weight(self)
             self.rm_util.reweight_osd(self, 0.0)
         self.drain_started_at = datetime.utcnow()
         self.draining = True
@@ -761,6 +761,7 @@ class OSD:
         out['force'] = self.force
         out['zap'] = self.zap
         out['hostname'] = self.hostname  # type: ignore
+        out['original_weight'] = self.original_weight
 
         for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']:
             if getattr(self, k):
@@ -953,6 +954,16 @@ class OSDRemovalQueue(object):
             self.osds.add(osd)
         osd.start()
 
+    def rm_by_osd_id(self, osd_id: int) -> None:
+        osd: Optional["OSD"] = None
+        for o in self.osds:
+            if o.osd_id == osd_id:
+                osd = o
+        if not osd:
+            logger.debug(f"Could not find osd with id {osd_id} in queue.")
+            raise KeyError(f'No osd with id {osd_id} in removal queue')
+        self.rm(osd)
+
     def rm(self, osd: "OSD") -> None:
         if not osd.exists:
             raise NotFoundError()
index 24fcb0280949ddcf7bdb7f5839027db5fe851518..23ca87c67df8f362a8bca9830ad815d052d95381 100644 (file)
@@ -1251,7 +1251,11 @@ class TestCephadm(object):
     ))
     @mock.patch("cephadm.services.osd.OSD.exists", True)
     @mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0)
-    def test_remove_osds(self, cephadm_module):
+    @mock.patch("cephadm.services.osd.RemoveUtil.get_weight")
+    @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd")
+    def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module):
+        osd_initial_weight = 2.1
+        _get_weight.return_value = osd_initial_weight
         with with_host(cephadm_module, 'test'):
             CephadmServe(cephadm_module)._refresh_host_daemons('test')
             c = cephadm_module.list_daemons()
@@ -1261,13 +1265,23 @@ class TestCephadm(object):
             out = wait(cephadm_module, c)
             assert out == ["Removed osd.0 from host 'test'"]
 
-            cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0,
-                                                      replace=False,
-                                                      force=False,
-                                                      hostname='test',
-                                                      process_started_at=datetime_now(),
-                                                      remove_util=cephadm_module.to_remove_osds.rm_util
-                                                      ))
+            osd_0 = OSD(osd_id=0,
+                        replace=False,
+                        force=False,
+                        hostname='test',
+                        process_started_at=datetime_now(),
+                        remove_util=cephadm_module.to_remove_osds.rm_util
+                        )
+
+            cephadm_module.to_remove_osds.enqueue(osd_0)
+            _get_weight.assert_called()
+
+            # test that OSD is properly reweighted on removal
+            cephadm_module.stop_remove_osds([0])
+            _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight)
+
+            # add OSD back to queue and test normal removal queue processing
+            cephadm_module.to_remove_osds.enqueue(osd_0)
             cephadm_module.to_remove_osds.process_removal_queue()
             assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module)