]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: fix reweighting of OSD when OSD removal is stopped 56083/head
authorAdam King <adking@redhat.com>
Tue, 7 Nov 2023 20:49:57 +0000 (15:49 -0500)
committerAdam King <adking@redhat.com>
Sun, 10 Mar 2024 19:07:13 +0000 (15:07 -0400)
Previously, when you ran "ceph orch osd rm stop <osd-id>"
cephadm would pass in a new OSD object to the removal
queue that would not have any of the fields set previously
for the OSD. This was mostly fine when removing it from
the queue as those fields were no longer needed, but an
exception was the initial weight, which you need if
you want to set the weight back when you stop removal.

This patch changes it so it will now remove the actual
OSD object the removal queue stores so that we will
get to use the previously set original weight. It also
changes when we grab the original weight to make it
happen earlier and adds it to the to_json so it survives
any potential mgr failovers.

Fixes: https://tracker.ceph.com/issues/63481
Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 99fc4a8d406291b65a53f157442bc54bc67e8b0d)

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/test_cephadm.py

index 7c495357a1f4891f22d7aa69562d0d3b59c9b6fc..ce7e0faffbad4ecf8bdf3e327ca303dc325e725a 100644 (file)
@@ -3171,8 +3171,7 @@ Then run the following:
         """
         for osd_id in osd_ids:
             try:
-                self.to_remove_osds.rm(OSD(osd_id=int(osd_id),
-                                           remove_util=self.to_remove_osds.rm_util))
+                self.to_remove_osds.rm_by_osd_id(int(osd_id))
             except (NotFoundError, KeyError, ValueError):
                 return f'Unable to find OSD in the queue: {osd_id}'
 
index d777d7fc6fe43c4381571da439ec6ae01176f88e..9adab337c5d54e05841864896c750b3a3da45c74 100644 (file)
@@ -663,6 +663,7 @@ class OSD:
             return None
         self.started = True
         self.stopped = False
+        self.original_weight = self.rm_util.get_weight(self)
 
     def start_draining(self) -> bool:
         if self.stopped:
@@ -671,7 +672,6 @@ class OSD:
         if self.replace:
             self.rm_util.set_osd_flag([self], 'out')
         else:
-            self.original_weight = self.rm_util.get_weight(self)
             self.rm_util.reweight_osd(self, 0.0)
         self.drain_started_at = datetime.utcnow()
         self.draining = True
@@ -760,6 +760,7 @@ class OSD:
         out['force'] = self.force
         out['zap'] = self.zap
         out['hostname'] = self.hostname  # type: ignore
+        out['original_weight'] = self.original_weight
 
         for k in ['drain_started_at', 'drain_stopped_at', 'drain_done_at', 'process_started_at']:
             if getattr(self, k):
@@ -952,6 +953,16 @@ class OSDRemovalQueue(object):
             self.osds.add(osd)
         osd.start()
 
+    def rm_by_osd_id(self, osd_id: int) -> None:
+        osd: Optional["OSD"] = None
+        for o in self.osds:
+            if o.osd_id == osd_id:
+                osd = o
+        if not osd:
+            logger.debug(f"Could not find osd with id {osd_id} in queue.")
+            raise KeyError(f'No osd with id {osd_id} in removal queue')
+        self.rm(osd)
+
     def rm(self, osd: "OSD") -> None:
         if not osd.exists:
             raise NotFoundError()
index 8e09eb9400116679093bbc5b7eb10c05ec756587..61f99775d719f9819683a11863dd434c91318ce9 100644 (file)
@@ -1090,7 +1090,11 @@ class TestCephadm(object):
     ))
     @mock.patch("cephadm.services.osd.OSD.exists", True)
     @mock.patch("cephadm.services.osd.RemoveUtil.get_pg_count", lambda _, __: 0)
-    def test_remove_osds(self, cephadm_module):
+    @mock.patch("cephadm.services.osd.RemoveUtil.get_weight")
+    @mock.patch("cephadm.services.osd.RemoveUtil.reweight_osd")
+    def test_remove_osds(self, _reweight_osd, _get_weight, cephadm_module):
+        osd_initial_weight = 2.1
+        _get_weight.return_value = osd_initial_weight
         with with_host(cephadm_module, 'test'):
             CephadmServe(cephadm_module)._refresh_host_daemons('test')
             c = cephadm_module.list_daemons()
@@ -1100,13 +1104,23 @@ class TestCephadm(object):
             out = wait(cephadm_module, c)
             assert out == ["Removed osd.0 from host 'test'"]
 
-            cephadm_module.to_remove_osds.enqueue(OSD(osd_id=0,
-                                                      replace=False,
-                                                      force=False,
-                                                      hostname='test',
-                                                      process_started_at=datetime_now(),
-                                                      remove_util=cephadm_module.to_remove_osds.rm_util
-                                                      ))
+            osd_0 = OSD(osd_id=0,
+                        replace=False,
+                        force=False,
+                        hostname='test',
+                        process_started_at=datetime_now(),
+                        remove_util=cephadm_module.to_remove_osds.rm_util
+                        )
+
+            cephadm_module.to_remove_osds.enqueue(osd_0)
+            _get_weight.assert_called()
+
+            # test that OSD is properly reweighted on removal
+            cephadm_module.stop_remove_osds([0])
+            _reweight_osd.assert_called_with(mock.ANY, osd_initial_weight)
+
+            # add OSD back to queue and test normal removal queue processing
+            cephadm_module.to_remove_osds.enqueue(osd_0)
             cephadm_module.to_remove_osds.process_removal_queue()
             assert cephadm_module.to_remove_osds == OSDRemovalQueue(cephadm_module)