From 33e1b119569249030b6b1a78b0a1d5b9bbc0e3a3 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Mon, 12 Aug 2024 10:56:36 -0400 Subject: [PATCH] mgr/cephadm: extend stray service detection with a general ignore hook Extend the system's current stray service detection with a new method on the service classes so that new classes can hook into stray service detection in the case that ceph services and cephadm services have differing names or use subsystems that call into ceph with different names (my use case). Signed-off-by: John Mulligan --- src/pybind/mgr/cephadm/serve.py | 35 +++++++++++++++---- .../mgr/cephadm/services/cephadmservice.py | 10 ++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index d95f44dc818..822194e117c 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -482,7 +482,9 @@ class CephadmServe: if self.mgr.warn_on_stray_hosts or self.mgr.warn_on_stray_daemons: ls = self.mgr.list_servers() self.log.debug(ls) - managed = self.mgr.cache.get_daemon_names() + managed_daemons = self.mgr.cache.get_daemons() + stray_filter = self._build_stray_filter(managed_daemons) + managed = [d.name() for d in managed_daemons] host_detail = [] # type: List[str] host_num_daemons = 0 daemon_detail = [] # type: List[str] @@ -496,11 +498,7 @@ class CephadmServe: daemon_id = s.get('id') assert daemon_id name = self._service_reference_name(s.get('type'), daemon_id) - if s.get('type') == 'tcmu-runner': - # because we don't track tcmu-runner daemons in the host cache - # and don't have a way to check if the daemon is part of iscsi service - # we assume that all tcmu-runner daemons are managed by cephadm - managed.append(name) + managed.extend(stray_filter(s.get('type'), daemon_id, name)) # Don't mark daemons we just created/removed in the last minute as stray. # It may take some time for the mgr to become aware the daemon # had been created/removed. 
@@ -544,6 +542,31 @@ class CephadmServe: ) return name + def _build_stray_filter( + self, managed: List[orchestrator.DaemonDescription] + ) -> Callable[[str, str, str], List[str]]: + svcs = { + daemon_type_to_service(cast(str, dd.daemon_type)) + for dd in managed + } + _services = [self.mgr.cephadm_services[dt] for dt in svcs] + + def _filter( + service_type: str, daemon_id: str, name: str + ) -> List[str]: + if service_type == 'tcmu-runner': + # because we don't track tcmu-runner daemons in the host cache + # and don't have a way to check if the daemon is part of iscsi service + # we assume that all tcmu-runner daemons are managed by cephadm + return [name] + out = [] + for svc in _services: + if svc.ignore_possible_stray(service_type, daemon_id, name): + out.append(name) + return out + + return _filter + def _check_for_moved_osds(self) -> None: self.log.debug('_check_for_moved_osds') all_osds: DefaultDict[int, List[orchestrator.DaemonDescription]] = defaultdict(list) diff --git a/src/pybind/mgr/cephadm/services/cephadmservice.py b/src/pybind/mgr/cephadm/services/cephadmservice.py index 8a41d3a54c2..f7a360b7243 100644 --- a/src/pybind/mgr/cephadm/services/cephadmservice.py +++ b/src/pybind/mgr/cephadm/services/cephadmservice.py @@ -564,6 +564,16 @@ class CephadmService(metaclass=ABCMeta): """Called to carry out any purge tasks following service removal""" logger.debug(f'Purge called for {self.TYPE} - no action taken') + def ignore_possible_stray( + self, service_type: str, daemon_id: str, name: str + ) -> bool: + """Called to decide if a possible stray service should be ignored + because it "virtually" belongs to a service. + This is mainly needed when properly managed services spawn layered ceph + services with different names (for example). + """ + return False + class CephService(CephadmService): def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]: -- 2.39.5