From 61c2089acc2dfb5f6c676895596758471b303df9 Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 30 Jul 2025 15:49:20 -0400 Subject: [PATCH] mgr/cephadm: add interval control for stray daemon checks Primarily to limit how often we run list_servers (which we need in order to do stray daemon checks, since the whole point is to check against a source that isn't cephadm). It was found that, on larger clusters, frequently calling into list_servers can cause issues with the core ceph mgr. Signed-off-by: Adam King (cherry picked from commit ee0364761e1ee29e6ad527dddd0eafc01c1f1aaa) --- src/pybind/mgr/cephadm/module.py | 9 +++++++++ src/pybind/mgr/cephadm/serve.py | 9 +++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 5e1167161b013..1c88ad735ed56 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -203,6 +203,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, default=10 * 60, desc='how frequently to perform a host check', ), + Option( + 'stray_daemon_check_interval', + type='secs', + default=30 * 60, + desc='how frequently cephadm should check for the presence of stray daemons', + ), Option( 'mode', type='str', @@ -514,6 +520,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.daemon_cache_timeout = 0 self.facts_cache_timeout = 0 self.host_check_interval = 0 + self.stray_daemon_check_interval = 0 self.max_count_per_host = 0 self.mode = '' self.container_image_base = '' @@ -688,6 +695,8 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, self.ceph_volume: CephVolume = CephVolume(self) + self.last_stray_daemon_check: Optional[datetime.datetime] = None + def shutdown(self) -> None: self.log.debug('shutdown') self._worker_pool.close() diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 88e4982367d31..c6196e22f0866 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ 
b/src/pybind/mgr/cephadm/serve.py @@ -1,4 +1,4 @@ -from datetime import datetime +import datetime import ipaddress import hashlib import json @@ -64,7 +64,7 @@ class CephadmServe: def __init__(self, mgr: "CephadmOrchestrator"): self.mgr: "CephadmOrchestrator" = mgr self.log = logger - self.last_certificates_check: Optional[datetime] = None + self.last_certificates_check: Optional[datetime.datetime] = None def serve(self) -> None: """ @@ -173,6 +173,7 @@ class CephadmServe: self.mgr.facts_cache_timeout, self.mgr.daemon_cache_timeout, self.mgr.device_cache_timeout, + self.mgr.stray_daemon_check_interval, ) ) self.log.debug('Sleeping for %d seconds', sleep_interval) @@ -466,6 +467,9 @@ class CephadmServe: (self.mgr.scheduled_async_actions.pop(0))() def _check_for_strays(self) -> None: + cutoff = datetime_now() - datetime.timedelta(seconds=self.mgr.stray_daemon_check_interval) + if self.mgr.last_stray_daemon_check is not None and self.mgr.last_stray_daemon_check >= cutoff: + return self.log.debug('_check_for_strays') for k in ['CEPHADM_STRAY_HOST', 'CEPHADM_STRAY_DAEMON']: @@ -516,6 +520,7 @@ class CephadmServe: if self.mgr.warn_on_stray_daemons and daemon_detail: self.mgr.set_health_warning( 'CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by cephadm', len(daemon_detail), daemon_detail) + self.mgr.last_stray_daemon_check = datetime_now() def _service_reference_name(self, service_type: str, daemon_id: str) -> str: if service_type not in ['rbd-mirror', 'cephfs-mirror', 'rgw', 'rgw-nfs']: -- 2.39.5