From: Sage Weil Date: Tue, 21 Jan 2020 16:40:07 +0000 (-0600) Subject: mgr/cephadm: health alert for stray hosts and/or services X-Git-Tag: v15.1.0~73^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2ff06d75f8ea5b4377748783fb4207deec7c53bc;p=ceph.git mgr/cephadm: health alert for stray hosts and/or services Fixes: https://tracker.ceph.com/issues/43714 Signed-off-by: Sage Weil --- diff --git a/doc/mgr/cephadm.rst b/doc/mgr/cephadm.rst index 1d5266b9d0b4..a6d4c88366a9 100644 --- a/doc/mgr/cephadm.rst +++ b/doc/mgr/cephadm.rst @@ -37,3 +37,44 @@ To clear this value use the command: :: # ceph cephadm clear-ssh-config + +Health checks +------------- + +CEPHADM_STRAY_HOST +^^^^^^^^^^^^^^^^^^ + +One or more hosts have running Ceph daemons but are not registered as +hosts managed by *cephadm*. This means that those services cannot +currently be managed by cephadm (e.g., restarted, upgraded, included +in `ceph orchestrator service ls`). + +You can manage the host(s) with:: + + ceph orchestrator host add *<hostname>* + +Note that you may need to configure SSH access to the remote host +before this will work. + +Alternatively, you can manually connect to the host and ensure that +services on that host are removed and/or migrated to a host that is +managed by *cephadm*. + +You can also disable this warning entirely with:: + + ceph config set mgr mgr/cephadm/warn_on_stray_hosts false + +CEPHADM_STRAY_SERVICE +^^^^^^^^^^^^^^^^^^^^^ + +One or more Ceph daemons are running but are not managed by +*cephadm*, perhaps because they were deployed using a different tool, or +were started manually. This means that those services cannot +currently be managed by cephadm (e.g., restarted, upgraded, included +in `ceph orchestrator service ls`). + +**FIXME:** We need to implement and document an adopt procedure here. 
+ +You can also disable this warning entirely with:: + + ceph config set mgr mgr/cephadm/warn_on_stray_services false diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 31ba5b321b3b..6f46d1d877c2 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -303,12 +303,30 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): 'desc': 'Container image name, without the tag', 'runtime': True, }, + { + 'name': 'warn_on_stray_hosts', + 'type': 'bool', + 'default': True, + 'desc': 'raise a health warning if services are detected on a host ' + 'that is not managed by cephadm', + }, + { + 'name': 'warn_on_stray_services', + 'type': 'bool', + 'default': True, + 'desc': 'raise a health warning if services are detected ' + 'that are not managed by cephadm', + }, ] def __init__(self, *args, **kwargs): super(CephadmOrchestrator, self).__init__(*args, **kwargs) self._cluster_fsid = self.get('mon_map')['fsid'] + # for serve() + self.run = True + self.event = Event() + self.config_notify() path = self.get_ceph_option('cephadm_path') @@ -368,10 +386,6 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): if h not in self.inventory: del self.service_cache[h] - # for serve() - self.run = True - self.event = Event() - def shutdown(self): self.log.info('shutdown') self._worker_pool.close() @@ -405,8 +419,10 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): (s.service_type, s.service_instance)) return True - def _clear_health_checks(self): - self.health_checks = {} + def _clear_upgrade_health_checks(self): + for k in ['UPGRADE_NO_STANDBY_MGR']: + if k in self.health_checks: + del self.health_checks[k] self.set_health_checks(self.health_checks) def _do_upgrade(self, daemons): @@ -557,6 +573,61 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self._save_upgrade_state() return None + def _check_for_strays(self): + 
self.log.debug('_check_for_strays') + for k in ['CEPHADM_STRAY_HOST', + 'CEPHADM_STRAY_SERVICE']: + if k in self.health_checks: + del self.health_checks[k] + if self.warn_on_stray_hosts or self.warn_on_stray_services: + ls = self.list_servers() + managed = [] + if self.warn_on_stray_services: + completion = self._get_services() + self._orchestrator_wait([completion]) + orchestrator.raise_if_exception(completion) + self.log.debug('services %s' % completion.result) + for s in completion.result: + managed.append(s.name()) + self.log.debug('cephadm daemons %s' % managed) + host_detail = [] # type: List[str] + host_num_services = 0 + service_detail = [] # type: List[str] + for item in ls: + host = item.get('hostname') + services = item.get('services') + missing_names = [] + for s in services: + name = '%s.%s' % (s.get('type'), s.get('id')) + if host not in self.inventory: + missing_names.append(name) + host_num_services += 1 + if name not in managed: + service_detail.append( + 'stray service %s on host %s not managed by cephadm' % (name, host)) + if missing_names: + host_detail.append( + 'stray host %s has %d stray daemons: %s' % ( + host, len(missing_names), missing_names)) + if host_detail: + self.health_checks['CEPHADM_STRAY_HOST'] = { + 'severity': 'warning', + 'summary': '%d stray host(s) with %s service(s) ' + 'not managed by cephadm' % ( + len(host_detail), host_num_services), + 'count': len(host_detail), + 'detail': host_detail, + } + if service_detail: + self.health_checks['CEPHADM_STRAY_SERVICE'] = { + 'severity': 'warning', + 'summary': '%d stray service(s) not managed by cephadm' % ( + len(service_detail)), + 'count': len(service_detail), + 'detail': service_detail, + } + self.set_health_checks(self.health_checks) + def serve(self): # type: () -> None self.log.info("serve starting") @@ -578,7 +649,9 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): orchestrator.raise_if_exception(completion) self.log.debug('did _do_upgrade') - 
sleep_interval = 60*60 # this really doesn't matter + self._check_for_strays() + + sleep_interval = 600 self.log.debug('Sleeping for %d seconds', sleep_interval) ret = self.event.wait(sleep_interval) self.event.clear() @@ -599,6 +672,10 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): opt, # type: ignore self.get_ceph_option(opt)) self.log.debug(' native option %s = %s', opt, getattr(self, opt)) # type: ignore + self.event.set() + + def notify(self, notify_type, notify_id): + self.event.set() def get_unique_name(self, existing, prefix=None, forcename=None): """ @@ -929,6 +1006,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self._save_inventory() self.inventory_cache[host] = orchestrator.OutdatableData() self.service_cache[host] = orchestrator.OutdatableData() + self.event.set() # refresh stray health check return "Added host '{}'".format(host) @async_completion @@ -942,6 +1020,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self._save_inventory() del self.inventory_cache[host] del self.service_cache[host] + self.event.set() # refresh stray health check return "Removed host '{}'".format(host) @trivial_completion @@ -1807,7 +1886,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): 'target_version': target_version, } self._save_upgrade_state() - self._clear_health_checks() + self._clear_upgrade_health_checks() self.event.set() return trivial_result('Initiating upgrade to %s %s' % (image, target_id)) @@ -1840,7 +1919,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): target_name = self.upgrade_state.get('target_name') self.upgrade_state = None self._save_upgrade_state() - self._clear_health_checks() + self._clear_upgrade_health_checks() self.event.set() return trivial_result('Stopped upgrade to %s' % target_name)