def ignore_possible_stray(
    self, service_type: str, daemon_id: str, name: str
) -> bool:
    """Return True if a suspected stray is most likely a known nvmeof daemon.

    Cephadm names nvmeof daemons with the pool and group of their service,
    i.e. "nvmeof.<pool>.<group>.<host>.<6-random-chars>". Some newer versions
    of nvmeof register with the cluster under a name that omits the pool and
    group, i.e. "nvmeof.<host>.<6-random-chars>". Without this check such a
    daemon is reported in a CEPHADM_STRAY_DAEMON warning like
    "stray daemon nvmeof.vm-01.hwwhfc on host vm-01 not managed by cephadm"
    even though cephadm knows it as "nvmeof.foo.group1.vm-01.hwwhfc".

    This isn't a perfect solution: we assume that if the random chars at
    the end of the reported daemon id match the final dot-separated
    component of an nvmeof daemon we already know, it is not a stray.

    :param service_type: type the possible stray registered with
    :param daemon_id: id the possible stray registered with (the part of
        its name after the service type)
    :param name: name of the possible stray; only used for logging
    :return: True if the daemon should not be reported as stray
    """
    # Only nvmeof registrations use the shortened naming scheme; every
    # other service type must go through the regular stray handling.
    if service_type != 'nvmeof':
        return False

    # rpartition yields an empty separator when there is no dot at all,
    # which (unlike split()[-1]) lets us actually detect that case.
    _, dot, random_chars = daemon_id.rpartition('.')
    if not dot:
        logger.debug('got nvmeof daemon id: "%s" with no dots', daemon_id)
        return False
    if not random_chars:
        # A trailing dot gives an empty suffix, and str.endswith('') is
        # True for every string, so it would hide all nvmeof strays.
        return False

    # Anchor on the separator so only a whole trailing component matches.
    suffix = f'.{random_chars}'
    known_daemons = self.mgr.cache.get_daemons_by_type('nvmeof')
    if any(dd.name().endswith(suffix) for dd in known_daemons):
        logger.debug('ignoring possibly stray nvmeof daemon: %s', name)
        return True
    return False