]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
cephadm: wait for latest osd map after ceph-volume before OSD deploy 68367/head
authorGuillaume Abrioux <gabrioux@ibm.com>
Fri, 10 Apr 2026 14:37:58 +0000 (16:37 +0200)
committerGuillaume Abrioux <gabrioux@ibm.com>
Tue, 14 Apr 2026 07:53:04 +0000 (09:53 +0200)
After ceph-volume creates an OSD, the mgr's cached osd map can
lag behind the monitors, so get_osd_uuid_map() misses the new osd
id and deploy_osd_daemons_for_existing_osds() skips deploying the
cephadm daemon, reporting a misleading "Created no osd(s)" even
though the osd exists.

This behavior is often seen with raw devices (lvm list returns more quickly).

This also fixes get_osd_uuid_map(only_up=True): the previous code
never populated the map when 'only_up' was true.
Now only OSDs with 'up == 1' are included, so a newly created (but still
down) OSD is not treated as already present.

Fixes: https://tracker.ceph.com/issues/75965
Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/fixtures.py
src/pybind/mgr/cephadm/tests/test_cephadm.py

index 347305c649954ccdf4ac903df1b6f46a9a6104f0..d5d7817b3579f3da5fd5499ad283debe493af280 100644 (file)
@@ -2988,13 +2988,16 @@ Then run the following:
         osd_map = self.get('osd_map')
         r = {}
         for o in osd_map['osds']:
-            # only include OSDs that have ever started in this map.  this way
-            # an interrupted osd create can be repeated and succeed the second
-            # time around.
+            # when only_up, only include OSDs that are currently up, so we do not
+            # treat a "just created" (still down) osd as "already present" for
+            # deploy_osd_daemons_for_existing_osds().
             osd_id = o.get('osd')
             if osd_id is None:
                 raise OrchestratorError("Could not retrieve osd_id from osd_map")
-            if not only_up:
+            if only_up:
+                if o.get('up') == 1:
+                    r[str(osd_id)] = o.get('uuid', '')
+            else:
                 r[str(osd_id)] = o.get('uuid', '')
         return r
 
index ea78b5c38147bafc940cb581ffb14e576b277452..1c8bc5c6818cab8e6850cafa50a796414cc7ed15 100644 (file)
@@ -1,6 +1,6 @@
 import json
 import logging
-from asyncio import gather
+from asyncio import gather, to_thread
 from threading import Lock
 from typing import List, Dict, Any, Set, Tuple, cast, Optional, TYPE_CHECKING
 
@@ -116,6 +116,17 @@ class OSDService(CephService):
             replace_osd_ids = OsdIdClaims(self.mgr).filtered_by_host(host)
             assert replace_osd_ids is not None
 
+        # ceph-volume registers new OSDs with the monitor before returning.
+        # the mgr's view of the osd map can briefly lag, so get_osd_uuid_map()
+        # would miss the new id and we would skip deploying the cephadm
+        # daemon (misleading "Created no osd(s)" while the osd exists but is still down).
+        # wait_for_latest_osdmap() is synchronous:
+        # We need to run it in a thread pool so we do not block the cephadm asyncio event loop.
+        ret = await to_thread(self.mgr.rados.wait_for_latest_osdmap)
+        if ret < 0:
+            raise OrchestratorError(
+                'wait_for_latest_osdmap failed with %d' % ret)
+
         # check result: lvm
         osds_elems: dict = await CephadmServe(self.mgr)._run_cephadm_json(
             host, 'osd', 'ceph-volume',
index dda0c6720ac6cbda35b9dad29fc04d78d1a60f9e..92fa87ac82906d90c507e1f12e8a74d00342a850 100644 (file)
@@ -133,6 +133,11 @@ def with_cephadm_module(module_options=None, store=None):
         m.event_loop = MockEventLoopThread()
         m.tkey = NamedTemporaryFile(prefix='test-cephadm-identity-')
 
+        # librados wait_for_latest_osdmap() returns int (0 = success).
+        # Tests use a MagicMock for rados unless configured.
+        # OSD deploy compares ret < 0.
+        m.rados.wait_for_latest_osdmap.return_value = 0
+
         yield m
 
 
index 34b3a4e6083678d8bb78f3293ed977540b5349f0..264919cd0f95909c2ad220d6c6eb5257b354912a 100644 (file)
@@ -87,12 +87,15 @@ def with_daemon(cephadm_module: CephadmOrchestrator, spec: ServiceSpec, host: st
 
 @contextmanager
 def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str, osd_id: int, ceph_volume_lvm_list=None):
+    # OSD is in the cluster map but still down (no cephadm daemon yet). If up==1,
+    # get_osd_uuid_map(only_up=True) would list it in before_osd_uuid_map and
+    # deploy_osd_daemons_for_existing_osds would skip deploying the daemon.
     cephadm_module.mock_store_set('_ceph_get', 'osd_map', {
         'osds': [
             {
                 'osd': 1,
                 'up_from': 0,
-                'up': True,
+                'up': 0,
                 'uuid': 'uuid'
             }
         ]
@@ -147,6 +150,39 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str
 
 class TestCephadm(object):
 
+    def test_get_osd_uuid_map_includes_down_osds_when_only_up_false(self, cephadm_module):
+        cephadm_module.mock_store_set('_ceph_get', 'osd_map', {
+            'osds': [
+                {'osd': 0, 'up': 1, 'uuid': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'},
+                {'osd': 1, 'up': 0, 'uuid': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'},
+            ]
+        })
+        m = cephadm_module.get_osd_uuid_map(only_up=False)
+        assert m == {
+            '0': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa',
+            '1': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb',
+        }
+
+    def test_get_osd_uuid_map_only_up_includes_only_up_osds(self, cephadm_module):
+        cephadm_module.mock_store_set('_ceph_get', 'osd_map', {
+            'osds': [
+                {'osd': 0, 'up': 1, 'uuid': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'},
+                {'osd': 1, 'up': 0, 'uuid': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'},
+            ]
+        })
+        m = cephadm_module.get_osd_uuid_map(only_up=True)
+        assert m == {'0': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'}
+
+    @mock.patch('cephadm.services.osd.to_thread', new_callable=mock.AsyncMock)
+    def test_deploy_osd_daemons_wait_for_latest_osdmap_failure(self, mock_to_thread, cephadm_module):
+        mock_to_thread.return_value = -5
+        with pytest.raises(OrchestratorError, match='wait_for_latest_osdmap failed'):
+            cephadm_module.wait_async(
+                cephadm_module.osd_service.deploy_osd_daemons_for_existing_osds(
+                    'test',
+                    DriveGroupSpec(service_type='osd', service_id=''),
+                    replace_osd_ids=[]))
+
     def test_get_unique_name(self, cephadm_module):
         # type: (CephadmOrchestrator) -> None
         existing = [
@@ -1576,6 +1612,13 @@ class TestCephadm(object):
                         dd2.status = DaemonDescriptionStatus.error
                         cephadm_module.cache.update_host_daemons(dd1.hostname, {dd1.name(): dd1})
                         cephadm_module.cache.update_host_daemons(dd2.hostname, {dd2.name(): dd2})
+                        # _check_for_moved_osds removes the stray duplicate only if
+                        # get_osd_by_id reports the osd as up
+                        cephadm_module.mock_store_set('_ceph_get', 'osd_map', {
+                            'osds': [
+                                {'osd': 1, 'up_from': 0, 'up': 1, 'uuid': 'uuid'}
+                            ]
+                        })
                         CephadmServe(cephadm_module)._check_for_moved_osds()
                         assert len(cephadm_module.cache.get_daemons()) == 1