From: Guillaume Abrioux Date: Fri, 10 Apr 2026 14:37:58 +0000 (+0200) Subject: cephadm: wait for latest osd map after ceph-volume before OSD deploy X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=56123af6477a93c62df74e23b6cf3b4fdf6b19e9;p=ceph.git cephadm: wait for latest osd map after ceph-volume before OSD deploy after ceph-volume creates an OSD, the cached osd map of the mgr can lag behind the monitors, then get_osd_uuid_map() misses the new osd id and deploy_osd_daemons_for_existing_osds() skips deploying the cephadm daemon, which reports a misleading "Created no osd(s)" while the osd exists. This behavior is often seen with raw devices. (lvm list returns quicker). This also fixes get_osd_uuid_map(only_up=True) as the previous branch never populated the map when 'only_up' was true. Now we only include osds with 'up==1' so a new OSD created (but still down) is not treated as already present. Fixes: https://tracker.ceph.com/issues/75965 Signed-off-by: Guillaume Abrioux --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 347305c64995..d5d7817b3579 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2988,13 +2988,16 @@ Then run the following: osd_map = self.get('osd_map') r = {} for o in osd_map['osds']: - # only include OSDs that have ever started in this map. this way - # an interrupted osd create can be repeated and succeed the second - # time around. + # when only_up, only include OSDs that are currently up, so we do not + # treat a "just created" (still down) osd as "already present" for + # deploy_osd_daemons_for_existing_osds(). osd_id = o.get('osd') if osd_id is None: raise OrchestratorError("Could not retrieve osd_id from osd_map") - if not only_up: + if only_up: + if o.get('up') == 1: + r[str(osd_id)] = o.get('uuid', '') + else: r[str(osd_id)] = o.get('uuid', '') return r diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index ea78b5c38147..1c8bc5c6818c 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -1,6 +1,6 @@ import json import logging -from asyncio import gather +from asyncio import gather, to_thread from threading import Lock from typing import List, Dict, Any, Set, Tuple, cast, Optional, TYPE_CHECKING @@ -116,6 +116,17 @@ class OSDService(CephService): replace_osd_ids = OsdIdClaims(self.mgr).filtered_by_host(host) assert replace_osd_ids is not None + # ceph-volume registers new OSDs with the monitor before returning. + # the mgr's view of the osd map can briefly lag, so get_osd_uuid_map() + # would miss the new id and we would skip deploying the cephadm + # daemon (misleading "Created no osd(s)" while the osd exists but is still down). + # wait_for_latest_osdmap() is synchronous: + # We need to run it in a thread pool so we do not block the cephadm asyncio event loop. + ret = await to_thread(self.mgr.rados.wait_for_latest_osdmap) + if ret < 0: + raise OrchestratorError( + 'wait_for_latest_osdmap failed with %d' % ret) + # check result: lvm osds_elems: dict = await CephadmServe(self.mgr)._run_cephadm_json( host, 'osd', 'ceph-volume', diff --git a/src/pybind/mgr/cephadm/tests/fixtures.py b/src/pybind/mgr/cephadm/tests/fixtures.py index dda0c6720ac6..92fa87ac8290 100644 --- a/src/pybind/mgr/cephadm/tests/fixtures.py +++ b/src/pybind/mgr/cephadm/tests/fixtures.py @@ -133,6 +133,11 @@ def with_cephadm_module(module_options=None, store=None): m.event_loop = MockEventLoopThread() m.tkey = NamedTemporaryFile(prefix='test-cephadm-identity-') + # librados wait_for_latest_osdmap() returns int (0 = success). + # Tests use a MagicMock for rados unless configured. + # OSD deploy compares ret < 0. + m.rados.wait_for_latest_osdmap.return_value = 0 + yield m diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 34b3a4e60836..264919cd0f95 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -87,12 +87,15 @@ def with_daemon(cephadm_module: CephadmOrchestrator, spec: ServiceSpec, host: st @contextmanager def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str, osd_id: int, ceph_volume_lvm_list=None): + # OSD is in the cluster map but still down (no cephadm daemon yet). If up==1, + # get_osd_uuid_map(only_up=True) would list it in before_osd_uuid_map and + # deploy_osd_daemons_for_existing_osds would skip deploying the daemon. cephadm_module.mock_store_set('_ceph_get', 'osd_map', { 'osds': [ { 'osd': 1, 'up_from': 0, - 'up': True, + 'up': 0, 'uuid': 'uuid' } ] @@ -147,6 +150,39 @@ def with_osd_daemon(cephadm_module: CephadmOrchestrator, _run_cephadm, host: str class TestCephadm(object): + def test_get_osd_uuid_map_includes_down_osds_when_only_up_false(self, cephadm_module): + cephadm_module.mock_store_set('_ceph_get', 'osd_map', { + 'osds': [ + {'osd': 0, 'up': 1, 'uuid': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'}, + {'osd': 1, 'up': 0, 'uuid': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'}, + ] + }) + m = cephadm_module.get_osd_uuid_map(only_up=False) + assert m == { + '0': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', + '1': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', + } + + def test_get_osd_uuid_map_only_up_includes_only_up_osds(self, cephadm_module): + cephadm_module.mock_store_set('_ceph_get', 'osd_map', { + 'osds': [ + {'osd': 0, 'up': 1, 'uuid': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'}, + {'osd': 1, 'up': 0, 'uuid': 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb'}, + ] + }) + m = cephadm_module.get_osd_uuid_map(only_up=True) + assert m == {'0': 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'} + + @mock.patch('cephadm.services.osd.to_thread', new_callable=mock.AsyncMock) + def test_deploy_osd_daemons_wait_for_latest_osdmap_failure(self, mock_to_thread, cephadm_module): + mock_to_thread.return_value = -5 + with pytest.raises(OrchestratorError, match='wait_for_latest_osdmap failed'): + cephadm_module.wait_async( + cephadm_module.osd_service.deploy_osd_daemons_for_existing_osds( + 'test', + DriveGroupSpec(service_type='osd', service_id=''), + replace_osd_ids=[])) + def test_get_unique_name(self, cephadm_module): # type: (CephadmOrchestrator) -> None existing = [ @@ -1576,6 +1612,13 @@ class TestCephadm(object): dd2.status = DaemonDescriptionStatus.error cephadm_module.cache.update_host_daemons(dd1.hostname, {dd1.name(): dd1}) cephadm_module.cache.update_host_daemons(dd2.hostname, {dd2.name(): dd2}) + # _check_for_moved_osds removes the stray duplicate only if + # get_osd_by_id reports the osd as up + cephadm_module.mock_store_set('_ceph_get', 'osd_map', { + 'osds': [ + {'osd': 1, 'up_from': 0, 'up': 1, 'uuid': 'uuid'} + ] + }) CephadmServe(cephadm_module)._check_for_moved_osds() assert len(cephadm_module.cache.get_daemons()) == 1