]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: don't mark nvmeof daemons without pool and group in name as stray
authorAdam King <adking@redhat.com>
Wed, 7 May 2025 20:02:56 +0000 (16:02 -0400)
committerAdam King <adking@redhat.com>
Thu, 18 Sep 2025 23:44:57 +0000 (19:44 -0400)
Cephadm's naming of these daemons always includes the pool and
group name associated with the nvmeof service. Nvmeof has recently
started to register with the cluster using names that
don't include them, resulting in warnings like

```
[WRN] CEPHADM_STRAY_DAEMON: 1 stray daemon(s) not managed by cephadm
    stray daemon nvmeof.vm-01.hwwhfc on host vm-01 not managed by cephadm
```

where cephadm knew that nvmeof daemon as

```
[ceph: root@vm-00 /]# ceph orch ps --daemon-type nvmeof
NAME                            HOST   PORTS                   STATUS   REFRESHED  AGE  MEM USE  MEM LIM  VERSION    IMAGE ID
nvmeof.foo.group1.vm-01.hwwhfc  vm-01  *:5500,4420,8009,10008  stopped     5m ago  25m        -        -  <unknown>  <unknown>
```

Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit 695680876eb8af0891e3776888b6361dc8728c86)

src/pybind/mgr/cephadm/services/nvmeof.py

index f34f107adf37255f1adf86e84662655e0d386956..fe30c63b4e10b6f1b0a4b8019a1eb7247e17aaa9 100644 (file)
@@ -216,6 +216,30 @@ class NvmeofService(CephService):
         warn_message = f'It is presumed safe to stop {names}'
         return HandleCommandResult(0, warn_message, '')
 
+    def ignore_possible_stray(
+        self, service_type: str, daemon_id: str, name: str
+    ) -> bool:
+        if service_type == 'nvmeof':
+            return False
+        # Some newer versions of nvmeof will register with the cluster
+        # with a name that does not include the pool or group name
+        # getting us from "nvmeof.<pool>.<group>.<hostname>.<6-random-chars>"
+        # to "nvmeof.<hostname>.<6-random-chars>"
+        #
+        # This isn't a perfect solution, but we're assuming here if the
+        # random chars at the end of the daemon name match a daemon
+        # we know, it's likely not a stray
+        try:
+            random_chars = daemon_id.split('.')[-1]
+        except ValueError:
+            logger.debug('got nvmeof daemon id: "%s" with no dots', daemon_id)
+            return False
+        for nvmeof_daemon in self.mgr.cache.get_daemons_by_type('nvmeof'):
+            if nvmeof_daemon.name().endswith(random_chars):
+                logger.debug('ignoring possibly stray nvmeof daemon: %s', name)
+                return True
+        return False
+
     def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
         """
         Called after the daemon is removed.