]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: don't mark nvmeof daemons without pool and group in name as stray 63170/head
authorAdam King <adking@redhat.com>
Wed, 7 May 2025 20:02:56 +0000 (16:02 -0400)
committerAdam King <adking@redhat.com>
Sun, 17 Aug 2025 16:58:40 +0000 (12:58 -0400)
Cephadm's naming of these daemons always includes the pool and
group name associated with the nvmeof service. Nvmeof recently
has started to register with the cluster using names that
don't include that, resulting in warnings like

```
[WRN] CEPHADM_STRAY_DAEMON: 1 stray daemon(s) not managed by cephadm
    stray daemon nvmeof.vm-01.hwwhfc on host vm-01 not managed by cephadm
```

where cephadm knew that nvmeof daemon as

```
[ceph: root@vm-00 /]# ceph orch ps --daemon-type nvmeof
NAME                            HOST   PORTS                   STATUS   REFRESHED  AGE  MEM USE  MEM LIM  VERSION    IMAGE ID
nvmeof.foo.group1.vm-01.hwwhfc  vm-01  *:5500,4420,8009,10008  stopped     5m ago  25m        -        -  <unknown>  <unknown>
```

Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/services/nvmeof.py

index f08ba9462a0704650e06c25c4fa68b49f53f69bd..6d997bb9848c59701058789e1999fac2fb07438f 100644 (file)
@@ -220,6 +220,30 @@ class NvmeofService(CephService):
         warn_message = f'It is presumed safe to stop {names}'
         return HandleCommandResult(0, warn_message, '')
 
+    def ignore_possible_stray(
+        self, service_type: str, daemon_id: str, name: str
+    ) -> bool:
+        if service_type == 'nvmeof':
+            return False
+        # Some newer versions of nvmeof will register with the cluster
+        # with a name that does not include the pool or group name
+        # getting us from "nvmeof.<pool>.<group>.<hostname>.<6-random-chars>"
+        # to "nvmeof.<hostname>.<6-random-chars>"
+        #
+        # This isn't a perfect solution, but we're assuming here if the
+        # random chars at the end of the daemon name match a daemon
+        # we know, it's likely not a stray
+        try:
+            random_chars = daemon_id.split('.')[-1]
+        except ValueError:
+            logger.debug('got nvmeof daemon id: "%s" with no dots', daemon_id)
+            return False
+        for nvmeof_daemon in self.mgr.cache.get_daemons_by_type('nvmeof'):
+            if nvmeof_daemon.name().endswith(random_chars):
+                logger.debug('ignoring possibly stray nvmeof daemon: %s', name)
+                return True
+        return False
+
     def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
         """
         Called after the daemon is removed.