]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: add interval control for stray daemon checks
authorAdam King <adking@redhat.com>
Wed, 30 Jul 2025 19:49:20 +0000 (15:49 -0400)
committerAdam King <adking@redhat.com>
Wed, 30 Jul 2025 19:49:20 +0000 (15:49 -0400)
Primarily to avoid running list_servers too often (which we
more or less need to do for stray daemon checks, since the whole
point is to check against a source that isn't cephadm). It was
found that on larger clusters, calling into list_servers
often can cause issues with the core ceph mgr.

Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py

index fe36b9467474436c2356190164211d537db810a6..0e53c2e9f0ac56499181b5a3f025ee9961f4ca72 100644 (file)
@@ -204,6 +204,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             default=10 * 60,
             desc='how frequently to perform a host check',
         ),
+        Option(
+            'stray_daemon_check_interval',
+            type='secs',
+            default=30 * 60,
+            desc='how frequently cephadm should check for the presence of stray daemons',
+        ),
         Option(
             'mode',
             type='str',
@@ -519,6 +525,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.daemon_cache_timeout = 0
             self.facts_cache_timeout = 0
             self.host_check_interval = 0
+            self.stray_daemon_check_interval = 0
             self.max_count_per_host = 0
             self.mode = ''
             self.container_image_base = ''
@@ -693,6 +700,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
 
         self.ceph_volume: CephVolume = CephVolume(self)
 
+        self.last_stray_daemon_check: Optional[datetime.datetime] = None
+
     def shutdown(self) -> None:
         self.log.debug('shutdown')
         self._worker_pool.close()
index 88e4982367d3179df611e49d0f1c52b5d3f99040..c6196e22f0866eeb59d0b426f4cd8c68411a8811 100644 (file)
@@ -1,4 +1,4 @@
-from datetime import datetime
+import datetime
 import ipaddress
 import hashlib
 import json
@@ -64,7 +64,7 @@ class CephadmServe:
     def __init__(self, mgr: "CephadmOrchestrator"):
         self.mgr: "CephadmOrchestrator" = mgr
         self.log = logger
-        self.last_certificates_check: Optional[datetime] = None
+        self.last_certificates_check: Optional[datetime.datetime] = None
 
     def serve(self) -> None:
         """
@@ -173,6 +173,7 @@ class CephadmServe:
                 self.mgr.facts_cache_timeout,
                 self.mgr.daemon_cache_timeout,
                 self.mgr.device_cache_timeout,
+                self.mgr.stray_daemon_check_interval,
             )
         )
         self.log.debug('Sleeping for %d seconds', sleep_interval)
@@ -466,6 +467,9 @@ class CephadmServe:
             (self.mgr.scheduled_async_actions.pop(0))()
 
     def _check_for_strays(self) -> None:
+        cutoff = datetime_now() - datetime.timedelta(seconds=self.mgr.stray_daemon_check_interval)
+        if self.mgr.last_stray_daemon_check is not None and self.mgr.last_stray_daemon_check >= cutoff:
+            return
         self.log.debug('_check_for_strays')
         for k in ['CEPHADM_STRAY_HOST',
                   'CEPHADM_STRAY_DAEMON']:
@@ -516,6 +520,7 @@ class CephadmServe:
             if self.mgr.warn_on_stray_daemons and daemon_detail:
                 self.mgr.set_health_warning(
                     'CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by cephadm', len(daemon_detail), daemon_detail)
+            self.mgr.last_stray_daemon_check = datetime_now()
 
     def _service_reference_name(self, service_type: str, daemon_id: str) -> str:
         if service_type not in ['rbd-mirror', 'cephfs-mirror', 'rgw', 'rgw-nfs']: