Failure without fix looks like:
2023-12-21T16:05:55.737+0000 7fbe585b0700 0 [devicehealth DEBUG root] loading object ABC_DEADB33F_FA
2023-12-21T16:05:55.737+0000 7fbe585b0700 -1 log_channel(cluster) log [ERR] : Unhandled exception from module 'devicehealth' while running on mgr.x: [errno 2] RADOS object not found (Failed to operate read op for oid ABC_DEADB33F_FA)
2023-12-21T16:05:55.737+0000 7fbe585b0700 -1 devicehealth.serve:
2023-12-21T16:05:55.737+0000 7fbe585b0700 -1 Traceback (most recent call last):
File "/home/pdonnell/ceph/src/pybind/mgr/devicehealth/module.py", line 394, in serve
self._do_serve()
File "/home/pdonnell/ceph/src/pybind/mgr/mgr_module.py", line 524, in check
return func(self, *args, **kwargs)
File "/home/pdonnell/ceph/src/pybind/mgr/devicehealth/module.py", line 354, in _do_serve
finished_loading_legacy = self.check_legacy_pool()
File "/home/pdonnell/ceph/src/pybind/mgr/devicehealth/module.py", line 326, in check_legacy_pool
if self._load_legacy_object(ioctx, obj.key):
File "/home/pdonnell/ceph/src/pybind/mgr/devicehealth/module.py", line 300, in _load_legacy_object
ioctx.operate_read_op(op, oid)
File "rados.pyx", line 3723, in rados.Ioctx.operate_read_op
rados.ObjectNotFound: [errno 2] RADOS object not found (Failed to operate read op for oid ABC_DEADB33F_FA)
Credit to Greg Farnum for postulating the cause.
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit aa30adbaa7616c8e24b3ccaad9dbcda7c0c663fc)
--- /dev/null
+from io import StringIO
+import logging
+
+from .mgr_test_case import MgrTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestDeviceHealth(MgrTestCase):
+ """
+ Tests for the mgr 'devicehealth' module.
+
+ Each test manipulates objects in the '.mgr' pool and then checks the
+ cluster log for unhandled devicehealth exceptions.
+ """
+
+ # Number of mgr daemons this test case asks for.
+ MGRS_REQUIRED = 1
+
+ def setUp(self):
+ # Run the base-class setup first, then the mgr setup helper.
+ super(TestDeviceHealth, self).setUp()
+ self.setup_mgrs()
+
+ def tearDown(self):
+ # Mark the mgr down, allow pool deletion on the mons, remove the
+ # '.mgr' pool, and finally clear the mgr 'down' flag again.
+ # Presumably this discards the object/snapshot state the test left
+ # in '.mgr' so later tests start from a fresh pool -- TODO confirm.
+ # NOTE(review): the parent tearDown() is never called here; confirm
+ # that this is intentional.
+ self.mgr_cluster.mon_manager.raw_cluster_cmd('mgr', 'set', 'down', 'true')
+ self.mgr_cluster.mon_manager.raw_cluster_cmd('config', 'set', 'mon', 'mon_allow_pool_delete', 'true')
+ self.mgr_cluster.mon_manager.raw_cluster_cmd('osd', 'pool', 'rm', '.mgr', '.mgr', '--yes-i-really-really-mean-it-not-faking')
+ self.mgr_cluster.mon_manager.raw_cluster_cmd('mgr', 'set', 'down', 'false')
+
+ def test_legacy_upgrade_snap(self):
+ """
+ Check that devicehealth does not hit an unhandled exception when the
+ '.mgr' pool contains an object that was removed after a pool snapshot
+ was taken.
+
+ The test writes an object into '.mgr', creates the pool snapshot
+ 'foo', removes the object and calls mgr_fail(). While waiting for an
+ active mgr (timeout=60), the cluster log must not contain the
+ devicehealth "Unhandled exception" message.
+ """
+
+ # Name of the throwaway object written into the '.mgr' pool.
+ o = "ABC_DEADB33F_FA"
+ # "-" presumably tells the rados tool to read the payload from stdin;
+ # the payload itself ("junk") looks arbitrary -- TODO confirm.
+ self.mon_manager.do_rados(["put", o, "-"], pool=".mgr", stdin=StringIO("junk"))
+ # Snapshot the pool while the object exists, then delete the object.
+ self.mon_manager.do_rados(["mksnap", "foo"], pool=".mgr")
+ self.mon_manager.do_rados(["rm", o], pool=".mgr")
+ self.mgr_cluster.mgr_fail()
+
+ # present=False: the message must NOT show up in the cluster log while
+ # the body below waits for get_active_id() to return a non-None id.
+ with self.assert_cluster_log("Unhandled exception from module 'devicehealth' while running", present=False):
+ self.wait_until_true(lambda: self.mgr_cluster.get_active_id() is not None, timeout=60)